xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/tavor/tavor_misc.c (revision 9e39c5ba00a55fa05777cc94b148296af305e135)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * tavor_misc.c
29  *    Tavor Miscellaneous routines - Address Handle, Multicast, Protection
30  *    Domain, and port-related operations
31  *
32  *    Implements all the routines necessary for allocating, freeing, querying
33  *    and modifying Address Handles and Protection Domains.  Also implements
34  *    all the routines necessary for adding and removing Queue Pairs to/from
35  *    Multicast Groups.  Lastly, it implements the routines necessary for
36  *    port-related query and modify operations.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/modctl.h>
44 #include <sys/bitmap.h>
45 #include <sys/sysmacros.h>
46 
47 #include <sys/ib/adapters/tavor/tavor.h>
48 
49 /* Counter used to help give each FMR pool taskq a unique name */
50 static uint_t tavor_debug_fmrpool_cnt = 0x00000000;
51 
52 static void tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav,
53     uint_t flag);
54 static int tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg,
55     tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp, uint_t *qp_found);
56 static int tavor_mcg_qplist_remove(tavor_mcghdl_t mcg,
57     tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp);
58 static void tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp);
59 static void tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp);
60 static uint_t tavor_mcg_walk_mgid_hash(tavor_state_t *state,
61     uint64_t start_indx, ib_gid_t mgid, uint_t *prev_indx);
62 static void tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg,
63     tavor_hw_mcg_t *mcg_hdr, ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc);
64 static int tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx,
65     uint_t prev_indx, tavor_hw_mcg_t *mcg_entry);
66 static int tavor_mcg_entry_invalidate(tavor_state_t *state,
67     tavor_hw_mcg_t *mcg_entry, uint_t indx);
68 static int tavor_mgid_is_valid(ib_gid_t gid);
69 static int tavor_mlid_is_valid(ib_lid_t lid);
70 static void tavor_fmr_processing(void *fmr_args);
71 static int tavor_fmr_cleanup(tavor_state_t *state, tavor_fmrhdl_t pool);
72 static void tavor_fmr_cache_init(tavor_fmrhdl_t fmr);
73 static void tavor_fmr_cache_fini(tavor_fmrhdl_t fmr);
74 static int tavor_fmr_avl_compare(const void *q, const void *e);
75 
76 
77 /*
78  * tavor_ah_alloc()
79  *    Context: Can be called only from user or kernel context.
80  */
81 int
82 tavor_ah_alloc(tavor_state_t *state, tavor_pdhdl_t pd,
83     ibt_adds_vect_t *attr_p, tavor_ahhdl_t *ahhdl, uint_t sleepflag)
84 {
85 	tavor_rsrc_t		*udav, *rsrc;
86 	tavor_hw_udav_t		udav_entry;
87 	tavor_ahhdl_t		ah;
88 	ibt_mr_attr_t		mr_attr;
89 	tavor_mr_options_t	op;
90 	tavor_mrhdl_t		mr;
91 	uint64_t		data;
92 	uint32_t		size;
93 	int			status, i, flag;
94 	char			*errormsg;
95 
96 	TAVOR_TNF_ENTER(tavor_ah_alloc);
97 
98 	/*
99 	 * Someday maybe the "ibt_adds_vect_t *attr_p" will be NULL to
100 	 * indicate that we wish to allocate an "invalid" (i.e. empty)
101 	 * address handle XXX
102 	 */
103 
104 	/* Validate that specified port number is legal */
105 	if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) {
106 		/* Set "status" and "errormsg" and goto failure */
107 		TAVOR_TNF_FAIL(IBT_HCA_PORT_INVALID, "invalid port num");
108 		goto ahalloc_fail;
109 	}
110 
111 	/*
112 	 * Allocate a UDAV entry.  This will be filled in with all the
113 	 * necessary parameters to define the Address Handle.  Unlike the
114 	 * other hardware resources no ownership transfer takes place as
115 	 * these UDAV entries are always owned by hardware.
116 	 */
117 	status = tavor_rsrc_alloc(state, TAVOR_UDAV, 1, sleepflag, &udav);
118 	if (status != DDI_SUCCESS) {
119 		/* Set "status" and "errormsg" and goto failure */
120 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed UDAV");
121 		goto ahalloc_fail;
122 	}
123 
124 	/*
125 	 * Allocate the software structure for tracking the address handle
126 	 * (i.e. the Tavor Address Handle struct).  If we fail here, we must
127 	 * undo the previous resource allocation.
128 	 */
129 	status = tavor_rsrc_alloc(state, TAVOR_AHHDL, 1, sleepflag, &rsrc);
130 	if (status != DDI_SUCCESS) {
131 		/* Set "status" and "errormsg" and goto failure */
132 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed AH handler");
133 		goto ahalloc_fail1;
134 	}
135 	ah = (tavor_ahhdl_t)rsrc->tr_addr;
136 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
137 
138 	/* Increment the reference count on the protection domain (PD) */
139 	tavor_pd_refcnt_inc(pd);
140 
141 	/*
142 	 * Fill in the UDAV entry.  Note: We are only filling in a temporary
143 	 * copy here, which we will later copy into the actual entry in
144 	 * Tavor DDR memory.  This starts be zeroing out the temporary copy
145 	 * and then calling tavor_set_addr_path() to fill in the common
146 	 * portions that can be pulled from the "ibt_adds_vect_t" passed in
147 	 */
148 	bzero(&udav_entry, sizeof (tavor_hw_udav_t));
149 	status = tavor_set_addr_path(state, attr_p,
150 	    (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL);
151 	if (status != DDI_SUCCESS) {
152 		tavor_pd_refcnt_dec(pd);
153 		tavor_rsrc_free(state, &rsrc);
154 		tavor_rsrc_free(state, &udav);
155 		/* Set "status" and "errormsg" and goto failure */
156 		TAVOR_TNF_FAIL(status, "failed in tavor_set_addr_path");
157 		goto ahalloc_fail;
158 	}
159 	udav_entry.pd	  = pd->pd_pdnum;
160 	udav_entry.msg_sz = state->ts_cfg_profile->cp_max_mtu - 1;
161 
162 	/*
163 	 * Register the memory for the UDAV.  The memory for the UDAV must
164 	 * be registered in the Tavor TPT tables.  This gives us the LKey
165 	 * that we will need when we later post a UD work request that
166 	 * uses this address handle.
167 	 * We might be able to pre-register all the memory for the UDAV XXX
168 	 */
169 	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
170 	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)udav->tr_addr;
171 	mr_attr.mr_len	 = udav->tr_len;
172 	mr_attr.mr_as	 = NULL;
173 	mr_attr.mr_flags = flag;
174 	op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
175 	op.mro_bind_dmahdl = NULL;
176 	op.mro_bind_override_addr = 0;
177 	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
178 	if (status != DDI_SUCCESS) {
179 		/* Set "status" and "errormsg" and goto failure */
180 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
181 		goto ahalloc_fail2;
182 	}
183 
184 	/*
185 	 * Fill in the UDAV entry.  Here we copy all the information from
186 	 * the temporary UDAV into the DDR memory for the real UDAV entry.
187 	 * Note that we copy everything but the first 64-bit word.  This
188 	 * is where the PD number for the address handle resides.
189 	 * By filling everything except the PD and then writing the PD in
190 	 * a separate step below, we can ensure that the UDAV is not
191 	 * accessed while there are partially written values in it (something
192 	 * which really should not happen anyway).  This is guaranteed
193 	 * because we take measures to ensure that the PD number is zero for
194 	 * all unused UDAV (and because PD#0 is reserved for Tavor).
195 	 */
196 	size = sizeof (tavor_hw_udav_t) >> 3;
197 	for (i = 1; i < size; i++) {
198 		data = ((uint64_t *)&udav_entry)[i];
199 		ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i),
200 		    data);
201 	}
202 	data = ((uint64_t *)&udav_entry)[0];
203 	ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, data);
204 
205 	/*
206 	 * Fill in the rest of the Tavor Address Handle struct.  Having
207 	 * successfully copied the UDAV into the hardware, we update the
208 	 * following fields for use in further operations on the AH.
209 	 *
210 	 * NOTE: We are saving away a copy of the "av_dgid.gid_guid" field
211 	 * here because we may need to return it later to the IBTF (as a
212 	 * result of a subsequent query operation).  Unlike the other UDAV
213 	 * parameters, the value of "av_dgid.gid_guid" is not always preserved
214 	 * by being written to hardware.  The reason for this is described in
215 	 * tavor_set_addr_path().
216 	 */
217 	ah->ah_udavrsrcp = udav;
218 	ah->ah_rsrcp	 = rsrc;
219 	ah->ah_pdhdl	 = pd;
220 	ah->ah_mrhdl	 = mr;
221 	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
222 	ah->ah_save_srate = attr_p->av_srate;
223 	*ahhdl = ah;
224 
225 	/* Determine if later ddi_dma_sync will be necessary */
226 	ah->ah_sync = TAVOR_UDAV_IS_SYNC_REQ(state);
227 
228 	/* Sync the UDAV for use by the hardware */
229 	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
230 
231 	TAVOR_TNF_EXIT(tavor_ah_alloc);
232 	return (DDI_SUCCESS);
233 
234 ahalloc_fail2:
235 	tavor_pd_refcnt_dec(pd);
236 	tavor_rsrc_free(state, &rsrc);
237 ahalloc_fail1:
238 	tavor_rsrc_free(state, &udav);
239 ahalloc_fail:
240 	TNF_PROBE_1(tavor_ah_alloc_fail, TAVOR_TNF_ERROR, "",
241 	    tnf_string, msg, errormsg);
242 	TAVOR_TNF_EXIT(tavor_ah_alloc);
243 	return (status);
244 }
245 
246 
247 /*
248  * tavor_ah_free()
249  *    Context: Can be called only from user or kernel context.
250  */
251 /* ARGSUSED */
252 int
253 tavor_ah_free(tavor_state_t *state, tavor_ahhdl_t *ahhdl, uint_t sleepflag)
254 {
255 	tavor_rsrc_t		*udav, *rsrc;
256 	tavor_pdhdl_t		pd;
257 	tavor_mrhdl_t		mr;
258 	tavor_ahhdl_t		ah;
259 	int			status;
260 
261 	TAVOR_TNF_ENTER(tavor_ah_free);
262 
263 	/*
264 	 * Pull all the necessary information from the Tavor Address Handle
265 	 * struct.  This is necessary here because the resource for the
266 	 * AH is going to be freed up as part of this operation.
267 	 */
268 	ah    = *ahhdl;
269 	mutex_enter(&ah->ah_lock);
270 	udav  = ah->ah_udavrsrcp;
271 	rsrc  = ah->ah_rsrcp;
272 	pd    = ah->ah_pdhdl;
273 	mr    = ah->ah_mrhdl;
274 	mutex_exit(&ah->ah_lock);
275 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
276 
277 	/*
278 	 * Deregister the memory for the UDAV.  If this fails for any reason,
279 	 * then it is an indication that something (either in HW or SW) has
280 	 * gone seriously wrong.  So we print a warning message and return
281 	 * failure.
282 	 */
283 	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
284 	    sleepflag);
285 	if (status != DDI_SUCCESS) {
286 		TNF_PROBE_0(tavor_ah_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
287 		TAVOR_TNF_EXIT(tavor_ah_free);
288 		return (ibc_get_ci_failure(0));
289 	}
290 
291 	/*
292 	 * Write zero to the first 64-bit word in the UDAV entry.  As
293 	 * described above (in tavor_ah_alloc), the PD number is stored in
294 	 * the first 64-bits of each UDAV and setting this to zero is
295 	 * guaranteed to invalidate the entry.
296 	 */
297 	ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, 0);
298 
299 	/* Sync the UDAV for use by the hardware */
300 	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
301 
302 	/* Decrement the reference count on the protection domain (PD) */
303 	tavor_pd_refcnt_dec(pd);
304 
305 	/* Free the Tavor Address Handle structure */
306 	tavor_rsrc_free(state, &rsrc);
307 
308 	/* Free up the UDAV entry resource */
309 	tavor_rsrc_free(state, &udav);
310 
311 	/* Set the ahhdl pointer to NULL and return success */
312 	*ahhdl = NULL;
313 
314 	TAVOR_TNF_EXIT(tavor_ah_free);
315 	return (DDI_SUCCESS);
316 }
317 
318 
319 /*
320  * tavor_ah_query()
321  *    Context: Can be called from interrupt or base context.
322  */
323 /* ARGSUSED */
324 int
325 tavor_ah_query(tavor_state_t *state, tavor_ahhdl_t ah, tavor_pdhdl_t *pd,
326     ibt_adds_vect_t *attr_p)
327 {
328 	tavor_hw_udav_t		udav_entry;
329 	tavor_rsrc_t		*udav;
330 	uint64_t		data;
331 	uint32_t		size;
332 	int			i;
333 
334 	TAVOR_TNF_ENTER(tavor_ah_query);
335 
336 	mutex_enter(&ah->ah_lock);
337 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p))
338 
339 	/*
340 	 * Pull all the necessary information from the Tavor Address Handle
341 	 * structure
342 	 */
343 	udav	= ah->ah_udavrsrcp;
344 	*pd	= ah->ah_pdhdl;
345 
346 	/*
347 	 * Copy the UDAV entry into the temporary copy.  Here we copy all
348 	 * the information from the UDAV entry in DDR memory into the
349 	 * temporary UDAV.  Note:  We don't need to sync the UDAV for
350 	 * reading by software because Tavor HW never modifies the entry.
351 	 */
352 	size = sizeof (tavor_hw_udav_t) >> 3;
353 	for (i = 0; i < size; i++) {
354 		data = ddi_get64(udav->tr_acchdl,
355 		    ((uint64_t *)udav->tr_addr + i));
356 		((uint64_t *)&udav_entry)[i] = data;
357 	}
358 
359 	/*
360 	 * Fill in "ibt_adds_vect_t".  We call tavor_get_addr_path() to fill
361 	 * the common portions that can be pulled from the UDAV we pass in.
362 	 *
363 	 * NOTE: We will also fill the "av_dgid.gid_guid" field from the
364 	 * "ah_save_guid" field we have previously saved away.  The reason
365 	 * for this is described in tavor_ah_alloc() and tavor_ah_modify().
366 	 */
367 	tavor_get_addr_path(state, (tavor_hw_addr_path_t *)&udav_entry,
368 	    attr_p, TAVOR_ADDRPATH_UDAV, NULL);
369 
370 	attr_p->av_dgid.gid_guid = ah->ah_save_guid;
371 	attr_p->av_srate = ah->ah_save_srate;
372 
373 	mutex_exit(&ah->ah_lock);
374 	TAVOR_TNF_EXIT(tavor_ah_query);
375 	return (DDI_SUCCESS);
376 }
377 
378 
379 /*
380  * tavor_ah_modify()
381  *    Context: Can be called from interrupt or base context.
382  */
383 /* ARGSUSED */
384 int
385 tavor_ah_modify(tavor_state_t *state, tavor_ahhdl_t ah,
386     ibt_adds_vect_t *attr_p)
387 {
388 	tavor_hw_udav_t		udav_entry;
389 	tavor_rsrc_t		*udav;
390 	uint64_t		data_new, data_old;
391 	uint32_t		udav_pd, size, portnum_new;
392 	int			i, status;
393 
394 	TAVOR_TNF_ENTER(tavor_ah_modify);
395 
396 	/* Validate that specified port number is legal */
397 	if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) {
398 		TNF_PROBE_1(tavor_ah_modify_inv_portnum,
399 		    TAVOR_TNF_ERROR, "", tnf_uint, port, attr_p->av_port_num);
400 		TAVOR_TNF_EXIT(tavor_ah_modify);
401 		return (IBT_HCA_PORT_INVALID);
402 	}
403 
404 	mutex_enter(&ah->ah_lock);
405 
406 	/*
407 	 * Pull all the necessary information from the Tavor Address Handle
408 	 * structure
409 	 */
410 	udav = ah->ah_udavrsrcp;
411 
412 	/*
413 	 * Fill in the UDAV entry.  Note: we are only filling in a temporary
414 	 * copy here, which we will later copy into the actual entry in
415 	 * Tavor DDR memory.  This starts be zeroing out the temporary copy
416 	 * and then calling tavor_set_addr_path() to fill in the common
417 	 * portions that can be pulled from the "ibt_adds_vect_t" passed in
418 	 *
419 	 * NOTE: We also need to save away a copy of the "av_dgid.gid_guid"
420 	 * field here (just as we did during tavor_ah_alloc()) because we
421 	 * may need to return it later to the IBTF (as a result of a
422 	 * subsequent query operation).  As explained in tavor_ah_alloc(),
423 	 * unlike the other UDAV parameters, the value of "av_dgid.gid_guid"
424 	 * is not always preserved by being written to hardware.  The reason
425 	 * for this is described in tavor_set_addr_path().
426 	 */
427 	bzero(&udav_entry, sizeof (tavor_hw_udav_t));
428 	status = tavor_set_addr_path(state, attr_p,
429 	    (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL);
430 	if (status != DDI_SUCCESS) {
431 		mutex_exit(&ah->ah_lock);
432 		TNF_PROBE_0(tavor_ah_modify_setaddrpath_fail,
433 		    TAVOR_TNF_ERROR, "");
434 		TAVOR_TNF_EXIT(tavor_ah_modify);
435 		return (status);
436 	}
437 	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
438 	ah->ah_save_srate = attr_p->av_srate;
439 
440 	/*
441 	 * Save away the current PD number for this UDAV.  Then temporarily
442 	 * invalidate the entry (by setting the PD to zero).  Note:  Since
443 	 * the first 32 bits of the UDAV actually contain the current port
444 	 * number _and_ current PD number, we need to mask off some bits.
445 	 */
446 	udav_pd = ddi_get32(udav->tr_acchdl, (uint32_t *)udav->tr_addr);
447 	udav_pd = udav_pd & 0xFFFFFF;
448 	ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, 0);
449 
450 	/* Sync the UDAV for use by the hardware */
451 	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
452 
453 	/*
454 	 * Copy UDAV structure to the entry
455 	 *    Note:  We copy in 64-bit chunks.  For the first two of these
456 	 *    chunks it is necessary to read the current contents of the
457 	 *    UDAV, mask off the modifiable portions (maintaining any
458 	 *    of the "reserved" portions), and then mask on the new data.
459 	 */
460 	size = sizeof (tavor_hw_udav_t) >> 3;
461 	for (i = 0; i < size; i++) {
462 		data_new = ((uint64_t *)&udav_entry)[i];
463 		data_old = ddi_get64(udav->tr_acchdl,
464 		    ((uint64_t *)udav->tr_addr + i));
465 
466 		/*
467 		 * Apply mask to change only the relevant values.  Note: We
468 		 * extract the new portnum from the address handle here
469 		 * because the "PD" and "portnum" fields are in the same
470 		 * 32-bit word in the UDAV.  We will use the (new) port
471 		 * number extracted here when we write the valid PD number
472 		 * in the last step below.
473 		 */
474 		if (i == 0) {
475 			data_old = data_old & TAVOR_UDAV_MODIFY_MASK0;
476 			portnum_new = data_new >> 56;
477 		} else if (i == 1) {
478 			data_old = data_old & TAVOR_UDAV_MODIFY_MASK1;
479 		} else {
480 			data_old = 0;
481 		}
482 
483 		/* Write the updated values to the UDAV (in DDR) */
484 		data_new = data_old | data_new;
485 		ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i),
486 		    data_new);
487 	}
488 
489 	/*
490 	 * Sync the body of the UDAV for use by the hardware.  After we
491 	 * have updated the PD number (to make the UDAV valid), we sync
492 	 * again to push the entire entry out for hardware access.
493 	 */
494 	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
495 
496 	/*
497 	 * Put the valid PD number back into UDAV entry.  Note: Because port
498 	 * number and PD number are in the same word, we must mask the
499 	 * new port number with the old PD number before writing it back
500 	 * to the UDAV entry
501 	 */
502 	udav_pd = ((portnum_new << 24) | udav_pd);
503 	ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, udav_pd);
504 
505 	/* Sync the rest of the UDAV for use by the hardware */
506 	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
507 
508 	mutex_exit(&ah->ah_lock);
509 	TAVOR_TNF_EXIT(tavor_ah_modify);
510 	return (DDI_SUCCESS);
511 }
512 
513 
514 /*
515  * tavor_udav_sync()
516  *    Context: Can be called from interrupt or base context.
517  */
518 /* ARGSUSED */
519 static void
520 tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav, uint_t flag)
521 {
522 	ddi_dma_handle_t	dmahdl;
523 	off_t			offset;
524 	int			status;
525 
526 	TAVOR_TNF_ENTER(tavor_udav_sync);
527 
528 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
529 
530 	/* Determine if AH needs to be synced or not */
531 	if (ah->ah_sync == 0) {
532 		TAVOR_TNF_EXIT(tavor_udav_sync);
533 		return;
534 	}
535 
536 	/* Get the DMA handle from AH handle */
537 	dmahdl = ah->ah_mrhdl->mr_bindinfo.bi_dmahdl;
538 
539 	/* Calculate offset into address handle */
540 	offset = (off_t)0;
541 	status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_udav_t), flag);
542 	if (status != DDI_SUCCESS) {
543 		TNF_PROBE_0(tavor_udav_sync_getnextentry_fail,
544 		    TAVOR_TNF_ERROR, "");
545 		TAVOR_TNF_EXIT(tavor_udav_sync);
546 		return;
547 	}
548 
549 	TAVOR_TNF_EXIT(tavor_udav_sync);
550 }
551 
552 
553 /*
554  * tavor_mcg_attach()
555  *    Context: Can be called only from user or kernel context.
556  */
557 int
558 tavor_mcg_attach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid,
559     ib_lid_t lid)
560 {
561 	tavor_rsrc_t		*rsrc;
562 	tavor_hw_mcg_t		*mcg_entry;
563 	tavor_hw_mcg_qp_list_t	*mcg_entry_qplist;
564 	tavor_mcghdl_t		mcg, newmcg;
565 	uint64_t		mgid_hash;
566 	uint32_t		end_indx;
567 	int			status;
568 	uint_t			qp_found;
569 	char			*errormsg;
570 
571 	TAVOR_TNF_ENTER(tavor_mcg_attach);
572 
573 	/*
574 	 * It is only allowed to attach MCG to UD queue pairs.  Verify
575 	 * that the intended QP is of the appropriate transport type
576 	 */
577 	if (qp->qp_serv_type != TAVOR_QP_UD) {
578 		/* Set "status" and "errormsg" and goto failure */
579 		TAVOR_TNF_FAIL(IBT_QP_SRV_TYPE_INVALID, "invalid service type");
580 		goto mcgattach_fail;
581 	}
582 
583 	/*
584 	 * Check for invalid Multicast DLID.  Specifically, all Multicast
585 	 * LIDs should be within a well defined range.  If the specified LID
586 	 * is outside of that range, then return an error.
587 	 */
588 	if (tavor_mlid_is_valid(lid) == 0) {
589 		/* Set "status" and "errormsg" and goto failure */
590 		TAVOR_TNF_FAIL(IBT_MC_MLID_INVALID, "invalid MLID");
591 		goto mcgattach_fail;
592 	}
593 	/*
594 	 * Check for invalid Multicast GID.  All Multicast GIDs should have
595 	 * a well-defined pattern of bits and flags that are allowable.  If
596 	 * the specified GID does not meet the criteria, then return an error.
597 	 */
598 	if (tavor_mgid_is_valid(gid) == 0) {
599 		/* Set "status" and "errormsg" and goto failure */
600 		TAVOR_TNF_FAIL(IBT_MC_MGID_INVALID, "invalid MGID");
601 		goto mcgattach_fail;
602 	}
603 
604 	/*
605 	 * Compute the MGID hash value.  Since the MCG table is arranged as
606 	 * a number of separate hash chains, this operation converts the
607 	 * specified MGID into the starting index of an entry in the hash
608 	 * table (i.e. the index for the start of the appropriate hash chain).
609 	 * Subsequent operations below will walk the chain searching for the
610 	 * right place to add this new QP.
611 	 */
612 	status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
613 	    &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT());
614 	if (status != TAVOR_CMD_SUCCESS) {
615 		cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n",
616 		    status);
617 		TNF_PROBE_1(tavor_mcg_attach_mgid_hash_cmd_fail,
618 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
619 		TAVOR_TNF_EXIT(tavor_mcg_attach);
620 		return (ibc_get_ci_failure(0));
621 	}
622 
623 	/*
624 	 * Grab the multicast group mutex.  Then grab the pre-allocated
625 	 * temporary buffer used for holding and/or modifying MCG entries.
626 	 * Zero out the temporary MCG entry before we begin.
627 	 */
628 	mutex_enter(&state->ts_mcglock);
629 	mcg_entry = state->ts_mcgtmp;
630 	mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry);
631 	bzero(mcg_entry, TAVOR_MCGMEM_SZ(state));
632 
633 	/*
634 	 * Walk through the array of MCG entries starting at "mgid_hash".
635 	 * Try to find the appropriate place for this new QP to be added.
636 	 * This could happen when the first entry of the chain has MGID == 0
637 	 * (which means that the hash chain is empty), or because we find
638 	 * an entry with the same MGID (in which case we'll add the QP to
639 	 * that MCG), or because we come to the end of the chain (in which
640 	 * case this is the first QP being added to the multicast group that
641 	 * corresponds to the MGID.  The tavor_mcg_walk_mgid_hash() routine
642 	 * walks the list and returns an index into the MCG table.  The entry
643 	 * at this index is then checked to determine which case we have
644 	 * fallen into (see below).  Note:  We are using the "shadow" MCG
645 	 * list (of tavor_mcg_t structs) for this lookup because the real
646 	 * MCG entries are in hardware (and the lookup process would be much
647 	 * more time consuming).
648 	 */
649 	end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, NULL);
650 	mcg	 = &state->ts_mcghdl[end_indx];
651 
652 	/*
653 	 * If MGID == 0, then the hash chain is empty.  Just fill in the
654 	 * current entry.  Note:  No need to allocate an MCG table entry
655 	 * as all the hash chain "heads" are already preallocated.
656 	 */
657 	if ((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) {
658 
659 		/* Fill in the current entry in the "shadow" MCG list */
660 		tavor_mcg_setup_new_hdr(mcg, mcg_entry, gid, NULL);
661 
662 		/*
663 		 * Try to add the new QP number to the list.  This (and the
664 		 * above) routine fills in a temporary MCG.  The "mcg_entry"
665 		 * and "mcg_entry_qplist" pointers simply point to different
666 		 * offsets within the same temporary copy of the MCG (for
667 		 * convenience).  Note:  If this fails, we need to invalidate
668 		 * the entries we've already put into the "shadow" list entry
669 		 * above.
670 		 */
671 		status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
672 		    &qp_found);
673 		if (status != DDI_SUCCESS) {
674 			bzero(mcg, sizeof (struct tavor_sw_mcg_list_s));
675 			mutex_exit(&state->ts_mcglock);
676 			/* Set "status" and "errormsg" and goto failure */
677 			TAVOR_TNF_FAIL(status, "failed qplist add");
678 			goto mcgattach_fail;
679 		}
680 
681 		/*
682 		 * Once the temporary MCG has been filled in, write the entry
683 		 * into the appropriate location in the Tavor MCG entry table.
684 		 * If it's successful, then drop the lock and return success.
685 		 * Note: In general, this operation shouldn't fail.  If it
686 		 * does, then it is an indication that something (probably in
687 		 * HW, but maybe in SW) has gone seriously wrong.  We still
688 		 * want to zero out the entries that we've filled in above
689 		 * (in the tavor_mcg_setup_new_hdr() routine).
690 		 */
691 		status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
692 		    TAVOR_CMD_NOSLEEP_SPIN);
693 		if (status != TAVOR_CMD_SUCCESS) {
694 			bzero(mcg, sizeof (struct tavor_sw_mcg_list_s));
695 			mutex_exit(&state->ts_mcglock);
696 			TAVOR_WARNING(state, "failed to write MCG entry");
697 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
698 			    "%08x\n", status);
699 			TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
700 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
701 			    tnf_uint, indx, end_indx);
702 			TAVOR_TNF_EXIT(tavor_mcg_attach);
703 			return (ibc_get_ci_failure(0));
704 		}
705 
706 		/*
707 		 * Now that we know all the Tavor firmware accesses have been
708 		 * successful, we update the "shadow" MCG entry by incrementing
709 		 * the "number of attached QPs" count.
710 		 *
711 		 * We increment only if the QP is not already part of the
712 		 * MCG by checking the 'qp_found' flag returned from the
713 		 * qplist_add above.
714 		 */
715 		if (!qp_found) {
716 			mcg->mcg_num_qps++;
717 
718 			/*
719 			 * Increment the refcnt for this QP.  Because the QP
720 			 * was added to this MCG, the refcnt must be
721 			 * incremented.
722 			 */
723 			tavor_qp_mcg_refcnt_inc(qp);
724 		}
725 
726 		/*
727 		 * We drop the lock and return success.
728 		 */
729 		mutex_exit(&state->ts_mcglock);
730 		TAVOR_TNF_EXIT(tavor_mcg_attach);
731 		return (DDI_SUCCESS);
732 	}
733 
734 	/*
735 	 * If the specified MGID matches the MGID in the current entry, then
736 	 * we need to try to add the QP to the current MCG entry.  In this
737 	 * case, it means that we need to read the existing MCG entry (into
738 	 * the temporary MCG), add the new QP number to the temporary entry
739 	 * (using the same method we used above), and write the entry back
740 	 * to the hardware (same as above).
741 	 */
742 	if ((mcg->mcg_mgid_h == gid.gid_prefix) &&
743 	    (mcg->mcg_mgid_l == gid.gid_guid)) {
744 
745 		/*
746 		 * Read the current MCG entry into the temporary MCG.  Note:
747 		 * In general, this operation shouldn't fail.  If it does,
748 		 * then it is an indication that something (probably in HW,
749 		 * but maybe in SW) has gone seriously wrong.
750 		 */
751 		status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
752 		    TAVOR_CMD_NOSLEEP_SPIN);
753 		if (status != TAVOR_CMD_SUCCESS) {
754 			mutex_exit(&state->ts_mcglock);
755 			TAVOR_WARNING(state, "failed to read MCG entry");
756 			cmn_err(CE_CONT, "Tavor: READ_MGM command failed: "
757 			    "%08x\n", status);
758 			TNF_PROBE_2(tavor_mcg_attach_read_mgm_cmd_fail,
759 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
760 			    tnf_uint, indx, end_indx);
761 			TAVOR_TNF_EXIT(tavor_mcg_attach);
762 			return (ibc_get_ci_failure(0));
763 		}
764 
765 		/*
766 		 * Try to add the new QP number to the list.  This routine
767 		 * fills in the necessary pieces of the temporary MCG.  The
768 		 * "mcg_entry_qplist" pointer is used to point to the portion
769 		 * of the temporary MCG that holds the QP numbers.
770 		 *
771 		 * Note: tavor_mcg_qplist_add() returns SUCCESS if it
772 		 * already found the QP in the list.  In this case, the QP is
773 		 * not added on to the list again.  Check the flag 'qp_found'
774 		 * if this value is needed to be known.
775 		 *
776 		 */
777 		status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
778 		    &qp_found);
779 		if (status != DDI_SUCCESS) {
780 			mutex_exit(&state->ts_mcglock);
781 			/* Set "status" and "errormsg" and goto failure */
782 			TAVOR_TNF_FAIL(status, "failed qplist add");
783 			goto mcgattach_fail;
784 		}
785 
786 		/*
787 		 * Once the temporary MCG has been updated, write the entry
788 		 * into the appropriate location in the Tavor MCG entry table.
789 		 * If it's successful, then drop the lock and return success.
790 		 * Note: In general, this operation shouldn't fail.  If it
791 		 * does, then it is an indication that something (probably in
792 		 * HW, but maybe in SW) has gone seriously wrong.
793 		 */
794 		status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
795 		    TAVOR_CMD_NOSLEEP_SPIN);
796 		if (status != TAVOR_CMD_SUCCESS) {
797 			mutex_exit(&state->ts_mcglock);
798 			TAVOR_WARNING(state, "failed to write MCG entry");
799 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
800 			    "%08x\n", status);
801 			TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
802 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
803 			    tnf_uint, indx, end_indx);
804 			TAVOR_TNF_EXIT(tavor_mcg_attach);
805 			return (ibc_get_ci_failure(0));
806 		}
807 
808 		/*
809 		 * Now that we know all the Tavor firmware accesses have been
810 		 * successful, we update the current "shadow" MCG entry by
811 		 * incrementing the "number of attached QPs" count.
812 		 *
813 		 * We increment only if the QP is not already part of the
814 		 * MCG by checking the 'qp_found' flag returned from the
815 		 * qplist_add above.
816 		 */
817 		if (!qp_found) {
818 			mcg->mcg_num_qps++;
819 
820 			/*
821 			 * Increment the refcnt for this QP.  Because the QP
822 			 * was added to this MCG, the refcnt must be
823 			 * incremented.
824 			 */
825 			tavor_qp_mcg_refcnt_inc(qp);
826 		}
827 
828 		/*
829 		 * We drop the lock and return success.
830 		 */
831 		mutex_exit(&state->ts_mcglock);
832 		TAVOR_TNF_EXIT(tavor_mcg_attach);
833 		return (DDI_SUCCESS);
834 	}
835 
836 	/*
837 	 * If we've reached here, then we're at the end of the hash chain.
838 	 * We need to allocate a new MCG entry, fill it in, write it to Tavor,
839 	 * and update the previous entry to link the new one to the end of the
840 	 * chain.
841 	 */
842 
843 	/*
844 	 * Allocate an MCG table entry.  This will be filled in with all
845 	 * the necessary parameters to define the multicast group.  Then it
846 	 * will be written to the hardware in the next-to-last step below.
847 	 */
848 	status = tavor_rsrc_alloc(state, TAVOR_MCG, 1, TAVOR_NOSLEEP, &rsrc);
849 	if (status != DDI_SUCCESS) {
850 		mutex_exit(&state->ts_mcglock);
851 		/* Set "status" and "errormsg" and goto failure */
852 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MCG");
853 		goto mcgattach_fail;
854 	}
855 
856 	/*
857 	 * Fill in the new entry in the "shadow" MCG list.  Note:  Just as
858 	 * it does above, tavor_mcg_setup_new_hdr() also fills in a portion
859 	 * of the temporary MCG entry (the rest of which will be filled in by
860 	 * tavor_mcg_qplist_add() below)
861 	 */
862 	newmcg = &state->ts_mcghdl[rsrc->tr_indx];
863 	tavor_mcg_setup_new_hdr(newmcg, mcg_entry, gid, rsrc);
864 
865 	/*
866 	 * Try to add the new QP number to the list.  This routine fills in
867 	 * the final necessary pieces of the temporary MCG.  The
868 	 * "mcg_entry_qplist" pointer is used to point to the portion of the
869 	 * temporary MCG that holds the QP numbers.  If we fail here, we
870 	 * must undo the previous resource allocation.
871 	 *
872 	 * Note: tavor_mcg_qplist_add() can we return SUCCESS if it already
873 	 * found the QP in the list.  In this case, the QP is not added on to
874 	 * the list again.  Check the flag 'qp_found' if this value is needed
875 	 * to be known.
876 	 */
877 	status = tavor_mcg_qplist_add(state, newmcg, mcg_entry_qplist, qp,
878 	    &qp_found);
879 	if (status != DDI_SUCCESS) {
880 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
881 		tavor_rsrc_free(state, &rsrc);
882 		mutex_exit(&state->ts_mcglock);
883 		/* Set "status" and "errormsg" and goto failure */
884 		TAVOR_TNF_FAIL(status, "failed qplist add");
885 		goto mcgattach_fail;
886 	}
887 
888 	/*
889 	 * Once the temporary MCG has been updated, write the entry into the
890 	 * appropriate location in the Tavor MCG entry table.  If this is
891 	 * successful, then we need to chain the previous entry to this one.
892 	 * Note: In general, this operation shouldn't fail.  If it does, then
893 	 * it is an indication that something (probably in HW, but maybe in
894 	 * SW) has gone seriously wrong.
895 	 */
896 	status = tavor_write_mgm_cmd_post(state, mcg_entry, rsrc->tr_indx,
897 	    TAVOR_CMD_NOSLEEP_SPIN);
898 	if (status != TAVOR_CMD_SUCCESS) {
899 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
900 		tavor_rsrc_free(state, &rsrc);
901 		mutex_exit(&state->ts_mcglock);
902 		TAVOR_WARNING(state, "failed to write MCG entry");
903 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
904 		    status);
905 		TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
906 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
907 		    tnf_uint, indx, rsrc->tr_indx);
908 		TAVOR_TNF_EXIT(tavor_mcg_attach);
909 		return (ibc_get_ci_failure(0));
910 	}
911 
912 	/*
913 	 * Now read the current MCG entry (the one previously at the end of
914 	 * hash chain) into the temporary MCG.  We are going to update its
915 	 * "next_gid_indx" now and write the entry back to the MCG table.
916 	 * Note:  In general, this operation shouldn't fail.  If it does, then
917 	 * it is an indication that something (probably in HW, but maybe in SW)
918 	 * has gone seriously wrong.  We will free up the MCG entry resource,
919 	 * but we will not undo the previously written MCG entry in the HW.
920 	 * This is OK, though, because the MCG entry is not currently attached
921 	 * to any hash chain.
922 	 */
923 	status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
924 	    TAVOR_CMD_NOSLEEP_SPIN);
925 	if (status != TAVOR_CMD_SUCCESS) {
926 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
927 		tavor_rsrc_free(state, &rsrc);
928 		mutex_exit(&state->ts_mcglock);
929 		TAVOR_WARNING(state, "failed to read MCG entry");
930 		cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
931 		    status);
932 		TNF_PROBE_2(tavor_mcg_attach_read_mgm_cmd_fail,
933 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
934 		    tnf_uint, indx, end_indx);
935 		TAVOR_TNF_EXIT(tavor_mcg_attach);
936 		return (ibc_get_ci_failure(0));
937 	}
938 
939 	/*
940 	 * Finally, we update the "next_gid_indx" field in the temporary MCG
941 	 * and attempt to write the entry back into the Tavor MCG table.  If
942 	 * this succeeds, then we update the "shadow" list to reflect the
943 	 * change, drop the lock, and return success.  Note:  In general, this
944 	 * operation shouldn't fail.  If it does, then it is an indication
945 	 * that something (probably in HW, but maybe in SW) has gone seriously
946 	 * wrong.  Just as we do above, we will free up the MCG entry resource,
947 	 * but we will not try to undo the previously written MCG entry.  This
948 	 * is OK, though, because (since we failed here to update the end of
949 	 * the chain) that other entry is not currently attached to any chain.
950 	 */
951 	mcg_entry->next_gid_indx = rsrc->tr_indx;
952 	status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
953 	    TAVOR_CMD_NOSLEEP_SPIN);
954 	if (status != TAVOR_CMD_SUCCESS) {
955 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
956 		tavor_rsrc_free(state, &rsrc);
957 		mutex_exit(&state->ts_mcglock);
958 		TAVOR_WARNING(state, "failed to write MCG entry");
959 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
960 		    status);
961 		TNF_PROBE_2(tavor_mcg_attach_write_mgm_cmd_fail,
962 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
963 		    tnf_uint, indx, end_indx);
964 		TAVOR_TNF_EXIT(tavor_mcg_attach);
965 		return (ibc_get_ci_failure(0));
966 	}
967 	mcg = &state->ts_mcghdl[end_indx];
968 	mcg->mcg_next_indx = rsrc->tr_indx;
969 
970 	/*
971 	 * Now that we know all the Tavor firmware accesses have been
972 	 * successful, we update the new "shadow" MCG entry by incrementing
973 	 * the "number of attached QPs" count.  Then we drop the lock and
974 	 * return success.
975 	 */
976 	newmcg->mcg_num_qps++;
977 
978 	/*
979 	 * Increment the refcnt for this QP.  Because the QP
980 	 * was added to this MCG, the refcnt must be
981 	 * incremented.
982 	 */
983 	tavor_qp_mcg_refcnt_inc(qp);
984 
985 	mutex_exit(&state->ts_mcglock);
986 	TAVOR_TNF_EXIT(tavor_mcg_attach);
987 	return (DDI_SUCCESS);
988 
989 mcgattach_fail:
990 	TNF_PROBE_1(tavor_mcg_attach_fail, TAVOR_TNF_ERROR, "", tnf_string,
991 	    msg, errormsg);
992 	TAVOR_TNF_EXIT(tavor_mcg_attach);
993 	return (status);
994 }
995 
996 
997 /*
998  * tavor_mcg_detach()
999  *    Context: Can be called only from user or kernel context.
1000  */
1001 int
1002 tavor_mcg_detach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid,
1003     ib_lid_t lid)
1004 {
1005 	tavor_hw_mcg_t		*mcg_entry;
1006 	tavor_hw_mcg_qp_list_t	*mcg_entry_qplist;
1007 	tavor_mcghdl_t		mcg;
1008 	uint64_t		mgid_hash;
1009 	uint32_t		end_indx, prev_indx;
1010 	int			status;
1011 
1012 	TAVOR_TNF_ENTER(tavor_mcg_detach);
1013 
1014 	/*
1015 	 * Check for invalid Multicast DLID.  Specifically, all Multicast
1016 	 * LIDs should be within a well defined range.  If the specified LID
1017 	 * is outside of that range, then return an error.
1018 	 */
1019 	if (tavor_mlid_is_valid(lid) == 0) {
1020 		TNF_PROBE_0(tavor_mcg_detach_invmlid_fail, TAVOR_TNF_ERROR, "");
1021 		TAVOR_TNF_EXIT(tavor_mcg_detach);
1022 		return (IBT_MC_MLID_INVALID);
1023 	}
1024 
1025 	/*
1026 	 * Compute the MGID hash value.  As described above, the MCG table is
1027 	 * arranged as a number of separate hash chains.  This operation
1028 	 * converts the specified MGID into the starting index of an entry in
1029 	 * the hash table (i.e. the index for the start of the appropriate
1030 	 * hash chain).  Subsequent operations below will walk the chain
1031 	 * searching for a matching entry from which to attempt to remove
1032 	 * the specified QP.
1033 	 */
1034 	status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
1035 	    &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT());
1036 	if (status != TAVOR_CMD_SUCCESS) {
1037 		cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n",
1038 		    status);
1039 		TNF_PROBE_1(tavor_mcg_detach_mgid_hash_cmd_fail,
1040 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1041 		TAVOR_TNF_EXIT(tavor_mcg_attach);
1042 		return (ibc_get_ci_failure(0));
1043 	}
1044 
1045 	/*
1046 	 * Grab the multicast group mutex.  Then grab the pre-allocated
1047 	 * temporary buffer used for holding and/or modifying MCG entries.
1048 	 */
1049 	mutex_enter(&state->ts_mcglock);
1050 	mcg_entry = state->ts_mcgtmp;
1051 	mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry);
1052 
1053 	/*
1054 	 * Walk through the array of MCG entries starting at "mgid_hash".
1055 	 * Try to find an MCG entry with a matching MGID.  The
1056 	 * tavor_mcg_walk_mgid_hash() routine walks the list and returns an
1057 	 * index into the MCG table.  The entry at this index is checked to
1058 	 * determine whether it is a match or not.  If it is a match, then
1059 	 * we continue on to attempt to remove the QP from the MCG.  If it
1060 	 * is not a match (or not a valid MCG entry), then we return an error.
1061 	 */
1062 	end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, &prev_indx);
1063 	mcg	 = &state->ts_mcghdl[end_indx];
1064 
1065 	/*
1066 	 * If MGID == 0 (the hash chain is empty) or if the specified MGID
1067 	 * does not match the MGID in the current entry, then return
1068 	 * IBT_MC_MGID_INVALID (to indicate that the specified MGID is not
1069 	 * valid).
1070 	 */
1071 	if (((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) ||
1072 	    ((mcg->mcg_mgid_h != gid.gid_prefix) ||
1073 	    (mcg->mcg_mgid_l != gid.gid_guid))) {
1074 		mutex_exit(&state->ts_mcglock);
1075 		TNF_PROBE_0(tavor_mcg_detach_invmgid_fail, TAVOR_TNF_ERROR, "");
1076 		TAVOR_TNF_EXIT(tavor_mcg_detach);
1077 		return (IBT_MC_MGID_INVALID);
1078 	}
1079 
1080 	/*
1081 	 * Read the current MCG entry into the temporary MCG.  Note: In
1082 	 * general, this operation shouldn't fail.  If it does, then it is
1083 	 * an indication that something (probably in HW, but maybe in SW)
1084 	 * has gone seriously wrong.
1085 	 */
1086 	status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
1087 	    TAVOR_CMD_NOSLEEP_SPIN);
1088 	if (status != TAVOR_CMD_SUCCESS) {
1089 		mutex_exit(&state->ts_mcglock);
1090 		TAVOR_WARNING(state, "failed to read MCG entry");
1091 		cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
1092 		    status);
1093 		TNF_PROBE_2(tavor_mcg_detach_read_mgm_cmd_fail,
1094 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1095 		    tnf_uint, indx, end_indx);
1096 		TAVOR_TNF_EXIT(tavor_mcg_attach);
1097 		return (ibc_get_ci_failure(0));
1098 	}
1099 
1100 	/*
1101 	 * Search the QP number list for a match.  If a match is found, then
1102 	 * remove the entry from the QP list.  Otherwise, if no match is found,
1103 	 * return an error.
1104 	 */
1105 	status = tavor_mcg_qplist_remove(mcg, mcg_entry_qplist, qp);
1106 	if (status != DDI_SUCCESS) {
1107 		mutex_exit(&state->ts_mcglock);
1108 		TAVOR_TNF_EXIT(tavor_mcg_detach);
1109 		return (status);
1110 	}
1111 
1112 	/*
1113 	 * Decrement the MCG count for this QP.  When the 'qp_mcg'
1114 	 * field becomes 0, then this QP is no longer a member of any
1115 	 * MCG.
1116 	 */
1117 	tavor_qp_mcg_refcnt_dec(qp);
1118 
1119 	/*
1120 	 * If the current MCG's QP number list is about to be made empty
1121 	 * ("mcg_num_qps" == 1), then remove the entry itself from the hash
1122 	 * chain.  Otherwise, just write the updated MCG entry back to the
1123 	 * hardware.  In either case, once we successfully update the hardware
1124 	 * chain, then we decrement the "shadow" list entry's "mcg_num_qps"
1125 	 * count (or zero out the entire "shadow" list entry) before returning
1126 	 * success.  Note:  Zeroing out the "shadow" list entry is done
1127 	 * inside of tavor_mcg_hash_list_remove().
1128 	 */
1129 	if (mcg->mcg_num_qps == 1) {
1130 
1131 		/* Remove an MCG entry from the hash chain */
1132 		status = tavor_mcg_hash_list_remove(state, end_indx, prev_indx,
1133 		    mcg_entry);
1134 		if (status != DDI_SUCCESS) {
1135 			mutex_exit(&state->ts_mcglock);
1136 			TAVOR_TNF_EXIT(tavor_mcg_detach);
1137 			return (status);
1138 		}
1139 
1140 	} else {
1141 		/*
1142 		 * Write the updated MCG entry back to the Tavor MCG table.
1143 		 * If this succeeds, then we update the "shadow" list to
1144 		 * reflect the change (i.e. decrement the "mcg_num_qps"),
1145 		 * drop the lock, and return success.  Note:  In general,
1146 		 * this operation shouldn't fail.  If it does, then it is an
1147 		 * indication that something (probably in HW, but maybe in SW)
1148 		 * has gone seriously wrong.
1149 		 */
1150 		status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
1151 		    TAVOR_CMD_NOSLEEP_SPIN);
1152 		if (status != TAVOR_CMD_SUCCESS) {
1153 			mutex_exit(&state->ts_mcglock);
1154 			TAVOR_WARNING(state, "failed to write MCG entry");
1155 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
1156 			    "%08x\n", status);
1157 			TNF_PROBE_2(tavor_mcg_detach_write_mgm_cmd_fail,
1158 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1159 			    tnf_uint, indx, end_indx);
1160 			TAVOR_TNF_EXIT(tavor_mcg_detach);
1161 			return (ibc_get_ci_failure(0));
1162 		}
1163 		mcg->mcg_num_qps--;
1164 	}
1165 
1166 	mutex_exit(&state->ts_mcglock);
1167 	TAVOR_TNF_EXIT(tavor_mcg_detach);
1168 	return (DDI_SUCCESS);
1169 }
1170 
1171 /*
1172  * tavor_qp_mcg_refcnt_inc()
1173  *    Context: Can be called from interrupt or base context.
1174  */
1175 static void
1176 tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp)
1177 {
1178 	/* Increment the QP's MCG reference count */
1179 	mutex_enter(&qp->qp_lock);
1180 	qp->qp_mcg_refcnt++;
1181 	TNF_PROBE_1_DEBUG(tavor_qp_mcg_refcnt_inc, TAVOR_TNF_TRACE, "",
1182 	    tnf_uint, refcnt, qp->qp_mcg_refcnt);
1183 	mutex_exit(&qp->qp_lock);
1184 }
1185 
1186 
1187 /*
1188  * tavor_qp_mcg_refcnt_dec()
1189  *    Context: Can be called from interrupt or base context.
1190  */
1191 static void
1192 tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp)
1193 {
1194 	/* Decrement the QP's MCG reference count */
1195 	mutex_enter(&qp->qp_lock);
1196 	qp->qp_mcg_refcnt--;
1197 	TNF_PROBE_1_DEBUG(tavor_qp_mcg_refcnt_dec, TAVOR_TNF_TRACE, "",
1198 	    tnf_uint, refcnt, qp->qp_mcg_refcnt);
1199 	mutex_exit(&qp->qp_lock);
1200 }
1201 
1202 
1203 /*
1204  * tavor_mcg_qplist_add()
1205  *    Context: Can be called from interrupt or base context.
1206  */
1207 static int
1208 tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg,
1209     tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp,
1210     uint_t *qp_found)
1211 {
1212 	uint_t		qplist_indx;
1213 
1214 	TAVOR_TNF_ENTER(tavor_mcg_qplist_add);
1215 
1216 	ASSERT(MUTEX_HELD(&state->ts_mcglock));
1217 
1218 	qplist_indx = mcg->mcg_num_qps;
1219 
1220 	/*
1221 	 * Determine if we have exceeded the maximum number of QP per
1222 	 * multicast group.  If we have, then return an error
1223 	 */
1224 	if (qplist_indx >= state->ts_cfg_profile->cp_num_qp_per_mcg) {
1225 		TNF_PROBE_0(tavor_mcg_qplist_add_too_many_qps,
1226 		    TAVOR_TNF_ERROR, "");
1227 		TAVOR_TNF_EXIT(tavor_mcg_qplist_add);
1228 		return (IBT_HCA_MCG_QP_EXCEEDED);
1229 	}
1230 
1231 	/*
1232 	 * Determine if the QP is already attached to this MCG table.  If it
1233 	 * is, then we break out and treat this operation as a NO-OP
1234 	 */
1235 	for (qplist_indx = 0; qplist_indx < mcg->mcg_num_qps;
1236 	    qplist_indx++) {
1237 		if (mcg_qplist[qplist_indx].qpn == qp->qp_qpnum) {
1238 			break;
1239 		}
1240 	}
1241 
1242 	/*
1243 	 * If the QP was already on the list, set 'qp_found' to TRUE.  We still
1244 	 * return SUCCESS in this case, but the qplist will not have been
1245 	 * updated because the QP was already on the list.
1246 	 */
1247 	if (qplist_indx < mcg->mcg_num_qps) {
1248 		*qp_found = 1;
1249 	} else {
1250 		/*
1251 		 * Otherwise, append the new QP number to the end of the
1252 		 * current QP list.  Note: We will increment the "mcg_num_qps"
1253 		 * field on the "shadow" MCG list entry later (after we know
1254 		 * that all necessary Tavor firmware accesses have been
1255 		 * successful).
1256 		 *
1257 		 * Set 'qp_found' to 0 so we know the QP was added on to the
1258 		 * list for sure.
1259 		 */
1260 		mcg_qplist[qplist_indx].q   = TAVOR_MCG_QPN_VALID;
1261 		mcg_qplist[qplist_indx].qpn = qp->qp_qpnum;
1262 		*qp_found = 0;
1263 	}
1264 
1265 	TAVOR_TNF_EXIT(tavor_mcg_qplist_add);
1266 	return (DDI_SUCCESS);
1267 }
1268 
1269 
1270 
1271 /*
1272  * tavor_mcg_qplist_remove()
1273  *    Context: Can be called from interrupt or base context.
1274  */
1275 static int
1276 tavor_mcg_qplist_remove(tavor_mcghdl_t mcg, tavor_hw_mcg_qp_list_t *mcg_qplist,
1277     tavor_qphdl_t qp)
1278 {
1279 	uint_t		i, qplist_indx;
1280 
1281 	TAVOR_TNF_ENTER(tavor_mcg_qplist_remove);
1282 
1283 	/*
1284 	 * Search the MCG QP list for a matching QPN.  When
1285 	 * it's found, we swap the last entry with the current
1286 	 * one, set the last entry to zero, decrement the last
1287 	 * entry, and return.  If it's not found, then it's
1288 	 * and error.
1289 	 */
1290 	qplist_indx = mcg->mcg_num_qps;
1291 	for (i = 0; i < qplist_indx; i++) {
1292 		if (mcg_qplist[i].qpn == qp->qp_qpnum) {
1293 			mcg_qplist[i] = mcg_qplist[qplist_indx - 1];
1294 			mcg_qplist[qplist_indx - 1].q = TAVOR_MCG_QPN_INVALID;
1295 			mcg_qplist[qplist_indx - 1].qpn = 0;
1296 
1297 			TAVOR_TNF_EXIT(tavor_mcg_qplist_remove);
1298 			return (DDI_SUCCESS);
1299 		}
1300 	}
1301 
1302 	TNF_PROBE_0(tavor_mcg_qplist_remove_invqphdl_fail, TAVOR_TNF_ERROR, "");
1303 	TAVOR_TNF_EXIT(tavor_mcg_qplist_remove);
1304 	return (IBT_QP_HDL_INVALID);
1305 }
1306 
1307 
1308 /*
1309  * tavor_mcg_walk_mgid_hash()
1310  *    Context: Can be called from interrupt or base context.
1311  */
1312 static uint_t
1313 tavor_mcg_walk_mgid_hash(tavor_state_t *state, uint64_t start_indx,
1314     ib_gid_t mgid, uint_t *p_indx)
1315 {
1316 	tavor_mcghdl_t	curr_mcghdl;
1317 	uint_t		curr_indx, prev_indx;
1318 
1319 	TAVOR_TNF_ENTER(tavor_mcg_walk_mgid_hash);
1320 
1321 	ASSERT(MUTEX_HELD(&state->ts_mcglock));
1322 
1323 	/* Start at the head of the hash chain */
1324 	curr_indx   = start_indx;
1325 	prev_indx   = curr_indx;
1326 	curr_mcghdl = &state->ts_mcghdl[curr_indx];
1327 
1328 	/* If the first entry in the chain has MGID == 0, then stop */
1329 	if ((curr_mcghdl->mcg_mgid_h == 0) &&
1330 	    (curr_mcghdl->mcg_mgid_l == 0)) {
1331 		goto end_mgid_hash_walk;
1332 	}
1333 
1334 	/* If the first entry in the chain matches the MGID, then stop */
1335 	if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1336 	    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1337 		goto end_mgid_hash_walk;
1338 	}
1339 
1340 	/* Otherwise, walk the hash chain looking for a match */
1341 	while (curr_mcghdl->mcg_next_indx != 0) {
1342 		prev_indx = curr_indx;
1343 		curr_indx = curr_mcghdl->mcg_next_indx;
1344 		curr_mcghdl = &state->ts_mcghdl[curr_indx];
1345 
1346 		if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1347 		    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1348 			break;
1349 		}
1350 	}
1351 
1352 end_mgid_hash_walk:
1353 	/*
1354 	 * If necessary, return the index of the previous entry too.  This
1355 	 * is primarily used for detaching a QP from a multicast group.  It
1356 	 * may be necessary, in that case, to delete an MCG entry from the
1357 	 * hash chain and having the index of the previous entry is helpful.
1358 	 */
1359 	if (p_indx != NULL) {
1360 		*p_indx = prev_indx;
1361 	}
1362 	TAVOR_TNF_EXIT(tavor_mcg_walk_mgid_hash);
1363 	return (curr_indx);
1364 }
1365 
1366 
1367 /*
1368  * tavor_mcg_setup_new_hdr()
1369  *    Context: Can be called from interrupt or base context.
1370  */
1371 static void
1372 tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg, tavor_hw_mcg_t *mcg_hdr,
1373     ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc)
1374 {
1375 	TAVOR_TNF_ENTER(tavor_mcg_setup_new_hdr);
1376 
1377 	/*
1378 	 * Fill in the fields of the "shadow" entry used by software
1379 	 * to track MCG hardware entry
1380 	 */
1381 	mcg->mcg_mgid_h	   = mgid.gid_prefix;
1382 	mcg->mcg_mgid_l	   = mgid.gid_guid;
1383 	mcg->mcg_rsrcp	   = mcg_rsrc;
1384 	mcg->mcg_next_indx = 0;
1385 	mcg->mcg_num_qps   = 0;
1386 
1387 	/*
1388 	 * Fill the header fields of the MCG entry (in the temporary copy)
1389 	 */
1390 	mcg_hdr->mgid_h		= mgid.gid_prefix;
1391 	mcg_hdr->mgid_l		= mgid.gid_guid;
1392 	mcg_hdr->next_gid_indx	= 0;
1393 
1394 	TAVOR_TNF_EXIT(tavor_mcg_setup_new_hdr);
1395 }
1396 
1397 
1398 /*
1399  * tavor_mcg_hash_list_remove()
1400  *    Context: Can be called only from user or kernel context.
1401  */
1402 static int
1403 tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx,
1404     uint_t prev_indx, tavor_hw_mcg_t *mcg_entry)
1405 {
1406 	tavor_mcghdl_t		curr_mcg, prev_mcg, next_mcg;
1407 	uint_t			next_indx;
1408 	int			status;
1409 
1410 	/* Get the pointer to "shadow" list for current entry */
1411 	curr_mcg = &state->ts_mcghdl[curr_indx];
1412 
1413 	/*
1414 	 * If this is the first entry on a hash chain, then attempt to replace
1415 	 * the entry with the next entry on the chain.  If there are no
1416 	 * subsequent entries on the chain, then this is the only entry and
1417 	 * should be invalidated.
1418 	 */
1419 	if (curr_indx == prev_indx) {
1420 
1421 		/*
1422 		 * If this is the only entry on the chain, then invalidate it.
1423 		 * Note:  Invalidating an MCG entry means writing all zeros
1424 		 * to the entry.  This is only necessary for those MCG
1425 		 * entries that are the "head" entries of the individual hash
1426 		 * chains.  Regardless of whether this operation returns
1427 		 * success or failure, return that result to the caller.
1428 		 */
1429 		next_indx = curr_mcg->mcg_next_indx;
1430 		if (next_indx == 0) {
1431 			status = tavor_mcg_entry_invalidate(state, mcg_entry,
1432 			    curr_indx);
1433 			bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1434 			TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1435 			return (status);
1436 		}
1437 
1438 		/*
1439 		 * Otherwise, this is just the first entry on the chain, so
1440 		 * grab the next one
1441 		 */
1442 		next_mcg = &state->ts_mcghdl[next_indx];
1443 
1444 		/*
1445 		 * Read the next MCG entry into the temporary MCG.  Note:
1446 		 * In general, this operation shouldn't fail.  If it does,
1447 		 * then it is an indication that something (probably in HW,
1448 		 * but maybe in SW) has gone seriously wrong.
1449 		 */
1450 		status = tavor_read_mgm_cmd_post(state, mcg_entry, next_indx,
1451 		    TAVOR_CMD_NOSLEEP_SPIN);
1452 		if (status != TAVOR_CMD_SUCCESS) {
1453 			TAVOR_WARNING(state, "failed to read MCG entry");
1454 			cmn_err(CE_CONT, "Tavor: READ_MGM command failed: "
1455 			    "%08x\n", status);
1456 			TNF_PROBE_2(tavor_mcg_hash_list_rem_read_mgm_cmd_fail,
1457 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1458 			    tnf_uint, indx, next_indx);
1459 			TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1460 			return (ibc_get_ci_failure(0));
1461 		}
1462 
1463 		/*
1464 		 * Copy/Write the temporary MCG back to the hardware MCG list
1465 		 * using the current index.  This essentially removes the
1466 		 * current MCG entry from the list by writing over it with
1467 		 * the next one.  If this is successful, then we can do the
1468 		 * same operation for the "shadow" list.  And we can also
1469 		 * free up the Tavor MCG entry resource that was associated
1470 		 * with the (old) next entry.  Note:  In general, this
1471 		 * operation shouldn't fail.  If it does, then it is an
1472 		 * indication that something (probably in HW, but maybe in SW)
1473 		 * has gone seriously wrong.
1474 		 */
1475 		status = tavor_write_mgm_cmd_post(state, mcg_entry, curr_indx,
1476 		    TAVOR_CMD_NOSLEEP_SPIN);
1477 		if (status != TAVOR_CMD_SUCCESS) {
1478 			TAVOR_WARNING(state, "failed to write MCG entry");
1479 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
1480 			    "%08x\n", status);
1481 			TNF_PROBE_2(tavor_mcg_hash_list_rem_write_mgm_cmd_fail,
1482 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1483 			    tnf_uint, indx, curr_indx);
1484 			TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1485 			return (ibc_get_ci_failure(0));
1486 		}
1487 
1488 		/*
1489 		 * Copy all the software tracking information from the next
1490 		 * entry on the "shadow" MCG list into the current entry on
1491 		 * the list.  Then invalidate (zero out) the other "shadow"
1492 		 * list entry.
1493 		 */
1494 		bcopy(next_mcg, curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1495 		bzero(next_mcg, sizeof (struct tavor_sw_mcg_list_s));
1496 
1497 		/*
1498 		 * Free up the Tavor MCG entry resource used by the "next"
1499 		 * MCG entry.  That resource is no longer needed by any
1500 		 * MCG entry which is first on a hash chain (like the "next"
1501 		 * entry has just become).
1502 		 */
1503 		tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1504 
1505 		TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1506 		return (DDI_SUCCESS);
1507 	}
1508 
1509 	/*
1510 	 * Else if this is the last entry on the hash chain (or a middle
1511 	 * entry, then we update the previous entry's "next_gid_index" field
1512 	 * to make it point instead to the next entry on the chain.  By
1513 	 * skipping over the removed entry in this way, we can then free up
1514 	 * any resources associated with the current entry.  Note:  We don't
1515 	 * need to invalidate the "skipped over" hardware entry because it
1516 	 * will no be longer connected to any hash chains, and if/when it is
1517 	 * finally re-used, it will be written with entirely new values.
1518 	 */
1519 
1520 	/*
1521 	 * Read the next MCG entry into the temporary MCG.  Note:  In general,
1522 	 * this operation shouldn't fail.  If it does, then it is an
1523 	 * indication that something (probably in HW, but maybe in SW) has
1524 	 * gone seriously wrong.
1525 	 */
1526 	status = tavor_read_mgm_cmd_post(state, mcg_entry, prev_indx,
1527 	    TAVOR_CMD_NOSLEEP_SPIN);
1528 	if (status != TAVOR_CMD_SUCCESS) {
1529 		TAVOR_WARNING(state, "failed to read MCG entry");
1530 		cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
1531 		    status);
1532 		TNF_PROBE_2(tavor_mcg_hash_list_rem_read_mgm_cmd_fail,
1533 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1534 		    tnf_uint, indx, prev_indx);
1535 		TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1536 		return (ibc_get_ci_failure(0));
1537 	}
1538 
1539 	/*
1540 	 * Finally, we update the "next_gid_indx" field in the temporary MCG
1541 	 * and attempt to write the entry back into the Tavor MCG table.  If
1542 	 * this succeeds, then we update the "shadow" list to reflect the
1543 	 * change, free up the Tavor MCG entry resource that was associated
1544 	 * with the current entry, and return success.  Note:  In general,
1545 	 * this operation shouldn't fail.  If it does, then it is an indication
1546 	 * that something (probably in HW, but maybe in SW) has gone seriously
1547 	 * wrong.
1548 	 */
1549 	mcg_entry->next_gid_indx = curr_mcg->mcg_next_indx;
1550 	status = tavor_write_mgm_cmd_post(state, mcg_entry, prev_indx,
1551 	    TAVOR_CMD_NOSLEEP_SPIN);
1552 	if (status != TAVOR_CMD_SUCCESS) {
1553 		TAVOR_WARNING(state, "failed to write MCG entry");
1554 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
1555 		    status);
1556 		TNF_PROBE_2(tavor_mcg_hash_list_rem_write_mgm_cmd_fail,
1557 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1558 		    tnf_uint, indx, prev_indx);
1559 		TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1560 		return (ibc_get_ci_failure(0));
1561 	}
1562 
1563 	/*
1564 	 * Get the pointer to the "shadow" MCG list entry for the previous
1565 	 * MCG.  Update its "mcg_next_indx" to point to the next entry
1566 	 * the one after the current entry. Note:  This next index may be
1567 	 * zero, indicating the end of the list.
1568 	 */
1569 	prev_mcg = &state->ts_mcghdl[prev_indx];
1570 	prev_mcg->mcg_next_indx = curr_mcg->mcg_next_indx;
1571 
1572 	/*
1573 	 * Free up the Tavor MCG entry resource used by the current entry.
1574 	 * This resource is no longer needed because the chain now skips over
1575 	 * the current entry.  Then invalidate (zero out) the current "shadow"
1576 	 * list entry.
1577 	 */
1578 	tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1579 	bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1580 
1581 	TAVOR_TNF_EXIT(tavor_mcg_hash_list_remove);
1582 	return (DDI_SUCCESS);
1583 }
1584 
1585 
1586 /*
1587  * tavor_mcg_entry_invalidate()
1588  *    Context: Can be called only from user or kernel context.
1589  */
1590 static int
1591 tavor_mcg_entry_invalidate(tavor_state_t *state, tavor_hw_mcg_t *mcg_entry,
1592     uint_t indx)
1593 {
1594 	int		status;
1595 
1596 	TAVOR_TNF_ENTER(tavor_mcg_entry_invalidate);
1597 
1598 	/*
1599 	 * Invalidate the hardware MCG entry by zeroing out this temporary
1600 	 * MCG and writing it the the hardware.  Note: In general, this
1601 	 * operation shouldn't fail.  If it does, then it is an indication
1602 	 * that something (probably in HW, but maybe in SW) has gone seriously
1603 	 * wrong.
1604 	 */
1605 	bzero(mcg_entry, TAVOR_MCGMEM_SZ(state));
1606 	status = tavor_write_mgm_cmd_post(state, mcg_entry, indx,
1607 	    TAVOR_CMD_NOSLEEP_SPIN);
1608 	if (status != TAVOR_CMD_SUCCESS) {
1609 		TAVOR_WARNING(state, "failed to write MCG entry");
1610 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
1611 		    status);
1612 		TNF_PROBE_2(tavor_mcg_entry_invalidate_write_mgm_cmd_fail,
1613 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status,
1614 		    tnf_uint, indx, indx);
1615 		TAVOR_TNF_EXIT(tavor_mcg_entry_invalidate);
1616 		return (ibc_get_ci_failure(0));
1617 	}
1618 
1619 	TAVOR_TNF_EXIT(tavor_mcg_entry_invalidate);
1620 	return (DDI_SUCCESS);
1621 }
1622 
1623 
1624 /*
1625  * tavor_mgid_is_valid()
1626  *    Context: Can be called from interrupt or base context.
1627  */
1628 static int
1629 tavor_mgid_is_valid(ib_gid_t gid)
1630 {
1631 	uint_t		topbits, flags, scope;
1632 
1633 	TAVOR_TNF_ENTER(tavor_mgid_is_valid);
1634 
1635 	/*
1636 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1637 	 * "multicast GID" must have its top eight bits set to all ones
1638 	 */
1639 	topbits = (gid.gid_prefix >> TAVOR_MCG_TOPBITS_SHIFT) &
1640 	    TAVOR_MCG_TOPBITS_MASK;
1641 	if (topbits != TAVOR_MCG_TOPBITS) {
1642 		TNF_PROBE_0(tavor_mgid_is_valid_invbits_fail, TAVOR_TNF_ERROR,
1643 		    "");
1644 		TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1645 		return (0);
1646 	}
1647 
1648 	/*
1649 	 * The next 4 bits are the "flag" bits.  These are valid only
1650 	 * if they are "0" (which correspond to permanently assigned/
1651 	 * "well-known" multicast GIDs) or "1" (for so-called "transient"
1652 	 * multicast GIDs).  All other values are reserved.
1653 	 */
1654 	flags = (gid.gid_prefix >> TAVOR_MCG_FLAGS_SHIFT) &
1655 	    TAVOR_MCG_FLAGS_MASK;
1656 	if (!((flags == TAVOR_MCG_FLAGS_PERM) ||
1657 	    (flags == TAVOR_MCG_FLAGS_NONPERM))) {
1658 		TNF_PROBE_1(tavor_mgid_is_valid_invflags_fail, TAVOR_TNF_ERROR,
1659 		    "", tnf_uint, flags, flags);
1660 		TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1661 		return (0);
1662 	}
1663 
1664 	/*
1665 	 * The next 4 bits are the "scope" bits.  These are valid only
1666 	 * if they are "2" (Link-local), "5" (Site-local), "8"
1667 	 * (Organization-local) or "E" (Global).  All other values
1668 	 * are reserved (or currently unassigned).
1669 	 */
1670 	scope = (gid.gid_prefix >> TAVOR_MCG_SCOPE_SHIFT) &
1671 	    TAVOR_MCG_SCOPE_MASK;
1672 	if (!((scope == TAVOR_MCG_SCOPE_LINKLOC) ||
1673 	    (scope == TAVOR_MCG_SCOPE_SITELOC)	 ||
1674 	    (scope == TAVOR_MCG_SCOPE_ORGLOC)	 ||
1675 	    (scope == TAVOR_MCG_SCOPE_GLOBAL))) {
1676 		TNF_PROBE_1(tavor_mgid_is_valid_invscope_fail, TAVOR_TNF_ERROR,
1677 		    "", tnf_uint, scope, scope);
1678 		TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1679 		return (0);
1680 	}
1681 
1682 	/*
1683 	 * If it passes all of the above checks, then we will consider it
1684 	 * a valid multicast GID.
1685 	 */
1686 	TAVOR_TNF_EXIT(tavor_mgid_is_valid);
1687 	return (1);
1688 }
1689 
1690 
1691 /*
1692  * tavor_mlid_is_valid()
1693  *    Context: Can be called from interrupt or base context.
1694  */
1695 static int
1696 tavor_mlid_is_valid(ib_lid_t lid)
1697 {
1698 	TAVOR_TNF_ENTER(tavor_mlid_is_valid);
1699 
1700 	/*
1701 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1702 	 * "multicast DLID" must be between 0xC000 and 0xFFFE.
1703 	 */
1704 	if ((lid < IB_LID_MC_FIRST) || (lid > IB_LID_MC_LAST)) {
1705 		TNF_PROBE_1(tavor_mlid_is_valid_invdlid_fail, TAVOR_TNF_ERROR,
1706 		    "", tnf_uint, mlid, lid);
1707 		TAVOR_TNF_EXIT(tavor_mlid_is_valid);
1708 		return (0);
1709 	}
1710 
1711 	TAVOR_TNF_EXIT(tavor_mlid_is_valid);
1712 	return (1);
1713 }
1714 
1715 
1716 /*
1717  * tavor_pd_alloc()
1718  *    Context: Can be called only from user or kernel context.
1719  */
1720 int
1721 tavor_pd_alloc(tavor_state_t *state, tavor_pdhdl_t *pdhdl, uint_t sleepflag)
1722 {
1723 	tavor_rsrc_t	*rsrc;
1724 	tavor_pdhdl_t	pd;
1725 	int		status;
1726 
1727 	TAVOR_TNF_ENTER(tavor_pd_alloc);
1728 
1729 	/*
1730 	 * Allocate the software structure for tracking the protection domain
1731 	 * (i.e. the Tavor Protection Domain handle).  By default each PD
1732 	 * structure will have a unique PD number assigned to it.  All that
1733 	 * is necessary is for software to initialize the PD reference count
1734 	 * (to zero) and return success.
1735 	 */
1736 	status = tavor_rsrc_alloc(state, TAVOR_PDHDL, 1, sleepflag, &rsrc);
1737 	if (status != DDI_SUCCESS) {
1738 		TNF_PROBE_0(tavor_pd_alloc_rsrcalloc_fail, TAVOR_TNF_ERROR, "");
1739 		TAVOR_TNF_EXIT(tavor_pd_alloc);
1740 		return (IBT_INSUFF_RESOURCE);
1741 	}
1742 	pd = (tavor_pdhdl_t)rsrc->tr_addr;
1743 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1744 
1745 	pd->pd_refcnt = 0;
1746 	*pdhdl = pd;
1747 
1748 	TAVOR_TNF_EXIT(tavor_pd_alloc);
1749 	return (DDI_SUCCESS);
1750 }
1751 
1752 
1753 /*
1754  * tavor_pd_free()
1755  *    Context: Can be called only from user or kernel context.
1756  */
1757 int
1758 tavor_pd_free(tavor_state_t *state, tavor_pdhdl_t *pdhdl)
1759 {
1760 	tavor_rsrc_t	*rsrc;
1761 	tavor_pdhdl_t	pd;
1762 
1763 	TAVOR_TNF_ENTER(tavor_pd_free);
1764 
1765 	/*
1766 	 * Pull all the necessary information from the Tavor Protection Domain
1767 	 * handle.  This is necessary here because the resource for the
1768 	 * PD is going to be freed up as part of this operation.
1769 	 */
1770 	pd   = *pdhdl;
1771 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1772 	rsrc = pd->pd_rsrcp;
1773 
1774 	/*
1775 	 * Check the PD reference count.  If the reference count is non-zero,
1776 	 * then it means that this protection domain is still referenced by
1777 	 * some memory region, queue pair, address handle, or other IB object
1778 	 * If it is non-zero, then return an error.  Otherwise, free the
1779 	 * Tavor resource and return success.
1780 	 */
1781 	if (pd->pd_refcnt != 0) {
1782 		TNF_PROBE_1(tavor_pd_free_refcnt_fail, TAVOR_TNF_ERROR, "",
1783 		    tnf_int, refcnt, pd->pd_refcnt);
1784 		TAVOR_TNF_EXIT(tavor_pd_free);
1785 		return (IBT_PD_IN_USE);
1786 	}
1787 
1788 	/* Free the Tavor Protection Domain handle */
1789 	tavor_rsrc_free(state, &rsrc);
1790 
1791 	/* Set the pdhdl pointer to NULL and return success */
1792 	*pdhdl = (tavor_pdhdl_t)NULL;
1793 
1794 	TAVOR_TNF_EXIT(tavor_pd_free);
1795 	return (DDI_SUCCESS);
1796 }
1797 
1798 
1799 /*
1800  * tavor_pd_refcnt_inc()
1801  *    Context: Can be called from interrupt or base context.
1802  */
1803 void
1804 tavor_pd_refcnt_inc(tavor_pdhdl_t pd)
1805 {
1806 	/* Increment the protection domain's reference count */
1807 	mutex_enter(&pd->pd_lock);
1808 	TNF_PROBE_1_DEBUG(tavor_pd_refcnt_inc, TAVOR_TNF_TRACE, "",
1809 	    tnf_uint, refcnt, pd->pd_refcnt);
1810 	pd->pd_refcnt++;
1811 	mutex_exit(&pd->pd_lock);
1812 
1813 }
1814 
1815 
1816 /*
1817  * tavor_pd_refcnt_dec()
1818  *    Context: Can be called from interrupt or base context.
1819  */
1820 void
1821 tavor_pd_refcnt_dec(tavor_pdhdl_t pd)
1822 {
1823 	/* Decrement the protection domain's reference count */
1824 	mutex_enter(&pd->pd_lock);
1825 	pd->pd_refcnt--;
1826 	TNF_PROBE_1_DEBUG(tavor_pd_refcnt_dec, TAVOR_TNF_TRACE, "",
1827 	    tnf_uint, refcnt, pd->pd_refcnt);
1828 	mutex_exit(&pd->pd_lock);
1829 
1830 }
1831 
1832 
1833 /*
1834  * tavor_port_query()
1835  *    Context: Can be called only from user or kernel context.
1836  */
1837 int
1838 tavor_port_query(tavor_state_t *state, uint_t port, ibt_hca_portinfo_t *pi)
1839 {
1840 	sm_portinfo_t		portinfo;
1841 	sm_guidinfo_t		guidinfo;
1842 	sm_pkey_table_t		pkeytable;
1843 	ib_gid_t		*sgid;
1844 	uint_t			sgid_max, pkey_max, tbl_size;
1845 	int			i, j, indx, status;
1846 
1847 	TAVOR_TNF_ENTER(tavor_port_query);
1848 
1849 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pi))
1850 
1851 	/* Validate that specified port number is legal */
1852 	if (!tavor_portnum_is_valid(state, port)) {
1853 		TNF_PROBE_1(tavor_port_query_inv_portnum_fail,
1854 		    TAVOR_TNF_ERROR, "", tnf_uint, port, port);
1855 		TAVOR_TNF_EXIT(tavor_port_query);
1856 		return (IBT_HCA_PORT_INVALID);
1857 	}
1858 
1859 	/*
1860 	 * We use the Tavor MAD_IFC command to post a GetPortInfo MAD
1861 	 * to the firmware (for the specified port number).  This returns
1862 	 * a full PortInfo MAD (in "portinfo") which we subsequently
1863 	 * parse to fill in the "ibt_hca_portinfo_t" structure returned
1864 	 * to the IBTF.
1865 	 */
1866 	status = tavor_getportinfo_cmd_post(state, port,
1867 	    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
1868 	if (status != TAVOR_CMD_SUCCESS) {
1869 		cmn_err(CE_CONT, "Tavor: GetPortInfo (port %02d) command "
1870 		    "failed: %08x\n", port, status);
1871 		TNF_PROBE_1(tavor_port_query_getportinfo_cmd_fail,
1872 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1873 		TAVOR_TNF_EXIT(tavor_port_query);
1874 		return (ibc_get_ci_failure(0));
1875 	}
1876 
1877 	/*
1878 	 * Parse the PortInfo MAD and fill in the IBTF structure
1879 	 */
1880 	pi->p_base_lid		= portinfo.LID;
1881 	pi->p_qkey_violations	= portinfo.Q_KeyViolations;
1882 	pi->p_pkey_violations	= portinfo.P_KeyViolations;
1883 	pi->p_sm_sl		= portinfo.MasterSMSL;
1884 	pi->p_sm_lid		= portinfo.MasterSMLID;
1885 	pi->p_linkstate		= portinfo.PortState;
1886 	pi->p_port_num		= portinfo.LocalPortNum;
1887 	pi->p_mtu		= portinfo.MTUCap;
1888 	pi->p_lmc		= portinfo.LMC;
1889 	pi->p_max_vl		= portinfo.VLCap;
1890 	pi->p_subnet_timeout	= portinfo.SubnetTimeOut;
1891 	pi->p_msg_sz		= ((uint32_t)1 << TAVOR_QP_LOG_MAX_MSGSZ);
1892 	tbl_size = state->ts_cfg_profile->cp_log_max_gidtbl;
1893 	pi->p_sgid_tbl_sz	= (1 << tbl_size);
1894 	tbl_size = state->ts_cfg_profile->cp_log_max_pkeytbl;
1895 	pi->p_pkey_tbl_sz	= (1 << tbl_size);
1896 
1897 	/*
1898 	 * Convert InfiniBand-defined port capability flags to the format
1899 	 * specified by the IBTF
1900 	 */
1901 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM)
1902 		pi->p_capabilities |= IBT_PORT_CAP_SM;
1903 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM_DISABLED)
1904 		pi->p_capabilities |= IBT_PORT_CAP_SM_DISABLED;
1905 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SNMP_SUPPD)
1906 		pi->p_capabilities |= IBT_PORT_CAP_SNMP_TUNNEL;
1907 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_DM_SUPPD)
1908 		pi->p_capabilities |= IBT_PORT_CAP_DM;
1909 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_VM_SUPPD)
1910 		pi->p_capabilities |= IBT_PORT_CAP_VENDOR;
1911 
1912 	/*
1913 	 * Fill in the SGID table.  Since the only access to the Tavor
1914 	 * GID tables is through the firmware's MAD_IFC interface, we
1915 	 * post as many GetGUIDInfo MADs as necessary to read in the entire
1916 	 * contents of the SGID table (for the specified port).  Note:  The
1917 	 * GetGUIDInfo command only gets eight GUIDs per operation.  These
1918 	 * GUIDs are then appended to the GID prefix for the port (from the
1919 	 * GetPortInfo above) to form the entire SGID table.
1920 	 */
1921 	for (i = 0; i < pi->p_sgid_tbl_sz; i += 8) {
1922 		status = tavor_getguidinfo_cmd_post(state, port, i >> 3,
1923 		    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &guidinfo);
1924 		if (status != TAVOR_CMD_SUCCESS) {
1925 			cmn_err(CE_CONT, "Tavor: GetGUIDInfo (port %02d) "
1926 			    "command failed: %08x\n", port, status);
1927 			TNF_PROBE_1(tavor_port_query_getguidinfo_cmd_fail,
1928 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1929 			TAVOR_TNF_EXIT(tavor_port_query);
1930 			return (ibc_get_ci_failure(0));
1931 		}
1932 
1933 		/* Figure out how many of the entries are valid */
1934 		sgid_max = min((pi->p_sgid_tbl_sz - i), 8);
1935 		for (j = 0; j < sgid_max; j++) {
1936 			indx = (i + j);
1937 			sgid = &pi->p_sgid_tbl[indx];
1938 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sgid))
1939 			sgid->gid_prefix = portinfo.GidPrefix;
1940 			sgid->gid_guid	 = guidinfo.GUIDBlocks[j];
1941 		}
1942 	}
1943 
1944 	/*
1945 	 * Fill in the PKey table.  Just as for the GID tables above, the
1946 	 * only access to the Tavor PKey tables is through the firmware's
1947 	 * MAD_IFC interface.  We post as many GetPKeyTable MADs as necessary
1948 	 * to read in the entire contents of the PKey table (for the specified
1949 	 * port).  Note:  The GetPKeyTable command only gets 32 PKeys per
1950 	 * operation.
1951 	 */
1952 	for (i = 0; i < pi->p_pkey_tbl_sz; i += 32) {
1953 		status = tavor_getpkeytable_cmd_post(state, port, i,
1954 		    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &pkeytable);
1955 		if (status != TAVOR_CMD_SUCCESS) {
1956 			cmn_err(CE_CONT, "Tavor: GetPKeyTable (port %02d) "
1957 			    "command failed: %08x\n", port, status);
1958 			TNF_PROBE_1(tavor_port_query_getpkeytable_cmd_fail,
1959 			    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
1960 			TAVOR_TNF_EXIT(tavor_port_query);
1961 			return (ibc_get_ci_failure(0));
1962 		}
1963 
1964 		/* Figure out how many of the entries are valid */
1965 		pkey_max = min((pi->p_pkey_tbl_sz - i), 32);
1966 		for (j = 0; j < pkey_max; j++) {
1967 			indx = (i + j);
1968 			pi->p_pkey_tbl[indx] = pkeytable.P_KeyTableBlocks[j];
1969 		}
1970 	}
1971 
1972 	TAVOR_TNF_EXIT(tavor_port_query);
1973 	return (DDI_SUCCESS);
1974 }
1975 
1976 
1977 /*
1978  * tavor_port_modify()
1979  *    Context: Can be called only from user or kernel context.
1980  */
1981 /* ARGSUSED */
1982 int
1983 tavor_port_modify(tavor_state_t *state, uint8_t port,
1984     ibt_port_modify_flags_t flags, uint8_t init_type)
1985 {
1986 	sm_portinfo_t	portinfo;
1987 	uint32_t	capmask, reset_qkey;
1988 	int		status;
1989 
1990 	TAVOR_TNF_ENTER(tavor_port_modify);
1991 
1992 	/*
1993 	 * Return an error if either of the unsupported flags are set
1994 	 */
1995 	if ((flags & IBT_PORT_SHUTDOWN) ||
1996 	    (flags & IBT_PORT_SET_INIT_TYPE)) {
1997 		TNF_PROBE_1(tavor_port_modify_inv_flags_fail,
1998 		    TAVOR_TNF_ERROR, "", tnf_uint, flags, flags);
1999 		TAVOR_TNF_EXIT(tavor_port_modify);
2000 		return (IBT_NOT_SUPPORTED);
2001 	}
2002 
2003 	/*
2004 	 * Determine whether we are trying to reset the QKey counter
2005 	 */
2006 	reset_qkey = (flags & IBT_PORT_RESET_QKEY) ? 1 : 0;
2007 
2008 	/* Validate that specified port number is legal */
2009 	if (!tavor_portnum_is_valid(state, port)) {
2010 		TNF_PROBE_1(tavor_port_modify_inv_portnum_fail,
2011 		    TAVOR_TNF_ERROR, "", tnf_uint, port, port);
2012 		TAVOR_TNF_EXIT(tavor_port_modify);
2013 		return (IBT_HCA_PORT_INVALID);
2014 	}
2015 
2016 	/*
2017 	 * Use the Tavor MAD_IFC command to post a GetPortInfo MAD to the
2018 	 * firmware (for the specified port number).  This returns a full
2019 	 * PortInfo MAD (in "portinfo") from which we pull the current
2020 	 * capability mask.  We then modify the capability mask as directed
2021 	 * by the "pmod_flags" field, and write the updated capability mask
2022 	 * using the Tavor SET_IB command (below).
2023 	 */
2024 	status = tavor_getportinfo_cmd_post(state, port,
2025 	    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
2026 	if (status != TAVOR_CMD_SUCCESS) {
2027 		TNF_PROBE_1(tavor_port_modify_getportinfo_cmd_fail,
2028 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
2029 		TAVOR_TNF_EXIT(tavor_port_modify);
2030 		return (ibc_get_ci_failure(0));
2031 	}
2032 
2033 	/*
2034 	 * Convert InfiniBand-defined port capability flags to the format
2035 	 * specified by the IBTF.  Specifically, we modify the capability
2036 	 * mask based on the specified values.
2037 	 */
2038 	capmask = portinfo.CapabilityMask;
2039 
2040 	if (flags & IBT_PORT_RESET_SM)
2041 		capmask &= ~SM_CAP_MASK_IS_SM;
2042 	else if (flags & IBT_PORT_SET_SM)
2043 		capmask |= SM_CAP_MASK_IS_SM;
2044 
2045 	if (flags & IBT_PORT_RESET_SNMP)
2046 		capmask &= ~SM_CAP_MASK_IS_SNMP_SUPPD;
2047 	else if (flags & IBT_PORT_SET_SNMP)
2048 		capmask |= SM_CAP_MASK_IS_SNMP_SUPPD;
2049 
2050 	if (flags & IBT_PORT_RESET_DEVMGT)
2051 		capmask &= ~SM_CAP_MASK_IS_DM_SUPPD;
2052 	else if (flags & IBT_PORT_SET_DEVMGT)
2053 		capmask |= SM_CAP_MASK_IS_DM_SUPPD;
2054 
2055 	if (flags & IBT_PORT_RESET_VENDOR)
2056 		capmask &= ~SM_CAP_MASK_IS_VM_SUPPD;
2057 	else if (flags & IBT_PORT_SET_VENDOR)
2058 		capmask |= SM_CAP_MASK_IS_VM_SUPPD;
2059 
2060 	/*
2061 	 * Use the Tavor SET_IB command to update the capability mask and
2062 	 * (possibly) reset the QKey violation counter for the specified port.
2063 	 * Note: In general, this operation shouldn't fail.  If it does, then
2064 	 * it is an indication that something (probably in HW, but maybe in
2065 	 * SW) has gone seriously wrong.
2066 	 */
2067 	status = tavor_set_ib_cmd_post(state, capmask, port, reset_qkey,
2068 	    TAVOR_SLEEPFLAG_FOR_CONTEXT());
2069 	if (status != TAVOR_CMD_SUCCESS) {
2070 		TAVOR_WARNING(state, "failed to modify port capabilities");
2071 		cmn_err(CE_CONT, "Tavor: SET_IB (port %02d) command failed: "
2072 		    "%08x\n", port, status);
2073 		TNF_PROBE_1(tavor_port_modify_set_ib_cmd_fail,
2074 		    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status, status);
2075 		TAVOR_TNF_EXIT(tavor_port_modify);
2076 		return (ibc_get_ci_failure(0));
2077 	}
2078 
2079 	TAVOR_TNF_EXIT(tavor_port_modify);
2080 	return (DDI_SUCCESS);
2081 }
2082 
2083 
2084 /*
2085  * tavor_set_addr_path()
2086  *    Context: Can be called from interrupt or base context.
2087  *
2088  * Note: This routine is used for two purposes.  It is used to fill in the
2089  * Tavor UDAV fields, and it is used to fill in the address path information
2090  * for QPs.  Because the two Tavor structures are similar, common fields can
2091  * be filled in here.  Because they are slightly different, however, we pass
2092  * an additional flag to indicate which type is being filled.
2093  */
2094 int
2095 tavor_set_addr_path(tavor_state_t *state, ibt_adds_vect_t *av,
2096     tavor_hw_addr_path_t *path, uint_t type, tavor_qphdl_t qp)
2097 {
2098 	uint_t		gidtbl_sz;
2099 
2100 	TAVOR_TNF_ENTER(tavor_set_addr_path);
2101 
2102 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2103 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
2104 
2105 	path->ml_path	= av->av_src_path;
2106 	path->rlid	= av->av_dlid;
2107 	path->sl	= av->av_srvl;
2108 
2109 	/* Port number only valid (in "av_port_num") if this is a UDAV */
2110 	if (type == TAVOR_ADDRPATH_UDAV) {
2111 		path->portnum = av->av_port_num;
2112 	}
2113 
2114 	/*
2115 	 * Validate (and fill in) static rate.
2116 	 *
2117 	 * The stat_rate_sup is used to decide how to set the rate and
2118 	 * if it is zero, the driver uses the old interface.
2119 	 */
2120 	if (state->ts_devlim.stat_rate_sup) {
2121 		if (av->av_srate == IBT_SRATE_20) {
2122 			path->max_stat_rate = 0; /* 4x@DDR injection rate */
2123 		} else if (av->av_srate == IBT_SRATE_5) {
2124 			path->max_stat_rate = 3; /* 1x@DDR injection rate */
2125 		} else if (av->av_srate == IBT_SRATE_10) {
2126 			path->max_stat_rate = 2; /* 4x@SDR injection rate */
2127 		} else if (av->av_srate == IBT_SRATE_2) {
2128 			path->max_stat_rate = 1; /* 1x@SDR injection rate */
2129 		} else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) {
2130 			path->max_stat_rate = 0; /* Max */
2131 		} else {
2132 			TNF_PROBE_1(tavor_set_addr_path_inv_srate_fail,
2133 			    TAVOR_TNF_ERROR, "", tnf_uint, srate, av->av_srate);
2134 			TAVOR_TNF_EXIT(tavor_set_addr_path);
2135 			return (IBT_STATIC_RATE_INVALID);
2136 		}
2137 	} else {
2138 		if (av->av_srate == IBT_SRATE_10) {
2139 			path->max_stat_rate = 0; /* 4x@SDR injection rate */
2140 		} else if (av->av_srate == IBT_SRATE_2) {
2141 			path->max_stat_rate = 1; /* 1x@SDR injection rate */
2142 		} else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) {
2143 			path->max_stat_rate = 0; /* Max */
2144 		} else {
2145 			TNF_PROBE_1(tavor_set_addr_path_inv_srate_fail,
2146 			    TAVOR_TNF_ERROR, "", tnf_uint, srate, av->av_srate);
2147 			TAVOR_TNF_EXIT(tavor_set_addr_path);
2148 			return (IBT_STATIC_RATE_INVALID);
2149 		}
2150 	}
2151 
2152 	/*
2153 	 * If this is a QP operation save asoft copy.
2154 	 */
2155 	if (qp) {
2156 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(qp->qp_save_srate))
2157 		qp->qp_save_srate = av->av_srate;
2158 	}
2159 
2160 	/* If "grh" flag is set, then check for valid SGID index too */
2161 	gidtbl_sz = (1 << state->ts_devlim.log_max_gid);
2162 	if ((av->av_send_grh) && (av->av_sgid_ix > gidtbl_sz)) {
2163 		TNF_PROBE_1(tavor_set_addr_path_inv_sgid_ix_fail,
2164 		    TAVOR_TNF_ERROR, "", tnf_uint, sgid_ix, av->av_sgid_ix);
2165 		TAVOR_TNF_EXIT(tavor_set_addr_path);
2166 		return (IBT_SGID_INVALID);
2167 	}
2168 
2169 	/*
2170 	 * Fill in all "global" values regardless of the value in the GRH
2171 	 * flag.  Because "grh" is not set unless "av_send_grh" is set, the
2172 	 * hardware will ignore the other "global" values as necessary.  Note:
2173 	 * SW does this here to enable later query operations to return
2174 	 * exactly the same params that were passed when the addr path was
2175 	 * last written.
2176 	 */
2177 	path->grh = av->av_send_grh;
2178 	if (type == TAVOR_ADDRPATH_QP) {
2179 		path->mgid_index = av->av_sgid_ix;
2180 	} else {
2181 		/*
2182 		 * For Tavor UDAV, the "mgid_index" field is the index into
2183 		 * a combined table (not a per-port table). So some extra
2184 		 * calculations are necessary.
2185 		 */
2186 		path->mgid_index = ((av->av_port_num - 1) * gidtbl_sz) +
2187 		    av->av_sgid_ix;
2188 	}
2189 	path->flow_label = av->av_flow;
2190 	path->tclass	 = av->av_tclass;
2191 	path->hop_limit	 = av->av_hop;
2192 	path->rgid_h	 = av->av_dgid.gid_prefix;
2193 
2194 	/*
2195 	 * According to Tavor PRM, the (31:0) part of rgid_l must be set to
2196 	 * "0x2" if the 'grh' or 'g' bit is cleared.  It also says that we
2197 	 * only need to do it for UDAV's.  So we enforce that here.
2198 	 *
2199 	 * NOTE: The entire 64 bits worth of GUID info is actually being
2200 	 * preserved (for UDAVs) by the callers of this function
2201 	 * (tavor_ah_alloc() and tavor_ah_modify()) and as long as the
2202 	 * 'grh' bit is not set, the upper 32 bits (63:32) of rgid_l are
2203 	 * "don't care".
2204 	 */
2205 	if ((path->grh) || (type == TAVOR_ADDRPATH_QP)) {
2206 		path->rgid_l = av->av_dgid.gid_guid;
2207 	} else {
2208 		path->rgid_l = 0x2;
2209 	}
2210 
2211 	TAVOR_TNF_EXIT(tavor_set_addr_path);
2212 	return (DDI_SUCCESS);
2213 }
2214 
2215 
2216 /*
2217  * tavor_get_addr_path()
2218  *    Context: Can be called from interrupt or base context.
2219  *
2220  * Note: Just like tavor_set_addr_path() above, this routine is used for two
2221  * purposes.  It is used to read in the Tavor UDAV fields, and it is used to
2222  * read in the address path information for QPs.  Because the two Tavor
2223  * structures are similar, common fields can be read in here.  But because
2224  * they are slightly different, we pass an additional flag to indicate which
2225  * type is being read.
2226  */
2227 void
2228 tavor_get_addr_path(tavor_state_t *state, tavor_hw_addr_path_t *path,
2229     ibt_adds_vect_t *av, uint_t type, tavor_qphdl_t qp)
2230 {
2231 	uint_t		gidtbl_sz;
2232 
2233 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
2234 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2235 
2236 	av->av_src_path	= path->ml_path;
2237 	av->av_port_num	= path->portnum;
2238 	av->av_dlid	= path->rlid;
2239 	av->av_srvl	= path->sl;
2240 
2241 	/*
2242 	 * Set "av_ipd" value from max_stat_rate.
2243 	 */
2244 	if (qp) {
2245 		/*
2246 		 * If a QP operation use the soft copy
2247 		 */
2248 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(qp->qp_save_srate))
2249 		av->av_srate = qp->qp_save_srate;
2250 	} else {
2251 		/*
2252 		 * The stat_rate_sup is used to decide how the srate value is
2253 		 * set and
2254 		 * if it is zero, the driver uses the old interface.
2255 		 */
2256 		if (state->ts_devlim.stat_rate_sup) {
2257 			if (path->max_stat_rate	== 0) {
2258 				av->av_srate = IBT_SRATE_20; /* 4x@DDR rate */
2259 			} else if (path->max_stat_rate	== 1) {
2260 				av->av_srate = IBT_SRATE_2;  /* 1x@SDR rate */
2261 			} else if (path->max_stat_rate	== 2) {
2262 				av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
2263 			} else if (path->max_stat_rate	== 3) {
2264 				av->av_srate = IBT_SRATE_5;  /* 1xDDR rate */
2265 			}
2266 		} else {
2267 			if (path->max_stat_rate	== 0) {
2268 				av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
2269 			} else if (path->max_stat_rate	== 1) {
2270 				av->av_srate = IBT_SRATE_2;  /* 1x@SDR rate */
2271 			}
2272 		}
2273 	}
2274 
2275 	/*
2276 	 * Extract all "global" values regardless of the value in the GRH
2277 	 * flag.  Because "av_send_grh" is set only if "grh" is set, software
2278 	 * knows to ignore the other "global" values as necessary.  Note: SW
2279 	 * does it this way to enable these query operations to return exactly
2280 	 * the same params that were passed when the addr path was last written.
2281 	 */
2282 	av->av_send_grh		= path->grh;
2283 	if (type == TAVOR_ADDRPATH_QP) {
2284 		av->av_sgid_ix  = path->mgid_index;
2285 	} else {
2286 		/*
2287 		 * For Tavor UDAV, the "mgid_index" field is the index into
2288 		 * a combined table (not a per-port table). So some extra
2289 		 * calculations are necessary.
2290 		 */
2291 		gidtbl_sz = (1 << state->ts_devlim.log_max_gid);
2292 		av->av_sgid_ix = path->mgid_index - ((av->av_port_num - 1) *
2293 		    gidtbl_sz);
2294 	}
2295 	av->av_flow		= path->flow_label;
2296 	av->av_tclass		= path->tclass;
2297 	av->av_hop		= path->hop_limit;
2298 	av->av_dgid.gid_prefix	= path->rgid_h;
2299 	av->av_dgid.gid_guid	= path->rgid_l;
2300 }
2301 
2302 
2303 /*
2304  * tavor_portnum_is_valid()
2305  *    Context: Can be called from interrupt or base context.
2306  */
2307 int
2308 tavor_portnum_is_valid(tavor_state_t *state, uint_t portnum)
2309 {
2310 	uint_t	max_port;
2311 
2312 	max_port = state->ts_cfg_profile->cp_num_ports;
2313 	if ((portnum <= max_port) && (portnum != 0)) {
2314 		return (1);
2315 	} else {
2316 		return (0);
2317 	}
2318 }
2319 
2320 
2321 /*
2322  * tavor_pkeyindex_is_valid()
2323  *    Context: Can be called from interrupt or base context.
2324  */
2325 int
2326 tavor_pkeyindex_is_valid(tavor_state_t *state, uint_t pkeyindx)
2327 {
2328 	uint_t	max_pkeyindx;
2329 
2330 	max_pkeyindx = 1 << state->ts_cfg_profile->cp_log_max_pkeytbl;
2331 	if (pkeyindx < max_pkeyindx) {
2332 		return (1);
2333 	} else {
2334 		return (0);
2335 	}
2336 }
2337 
2338 
2339 /*
2340  * tavor_queue_alloc()
2341  *    Context: Can be called from interrupt or base context.
2342  */
2343 int
2344 tavor_queue_alloc(tavor_state_t *state, tavor_qalloc_info_t *qa_info,
2345     uint_t sleepflag)
2346 {
2347 	ddi_dma_attr_t		dma_attr;
2348 	int			(*callback)(caddr_t);
2349 	uint64_t		realsize, alloc_mask;
2350 	uint_t			dma_xfer_mode, type;
2351 	int			flag, status;
2352 
2353 	TAVOR_TNF_ENTER(tavor_queue_alloc);
2354 
2355 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2356 
2357 	/* Set the callback flag appropriately */
2358 	callback = (sleepflag == TAVOR_SLEEP) ? DDI_DMA_SLEEP :
2359 	    DDI_DMA_DONTWAIT;
2360 
2361 	/*
2362 	 * Initialize many of the default DMA attributes.  Then set additional
2363 	 * alignment restrictions as necessary for the queue memory.  Also
2364 	 * respect the configured value for IOMMU bypass
2365 	 */
2366 	tavor_dma_attr_init(&dma_attr);
2367 	dma_attr.dma_attr_align = qa_info->qa_bind_align;
2368 	type = state->ts_cfg_profile->cp_iommu_bypass;
2369 	if (type == TAVOR_BINDMEM_BYPASS) {
2370 		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2371 	}
2372 
2373 	/* Allocate a DMA handle */
2374 	status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr, callback, NULL,
2375 	    &qa_info->qa_dmahdl);
2376 	if (status != DDI_SUCCESS) {
2377 		TNF_PROBE_0(tavor_queue_alloc_dmahdl_fail, TAVOR_TNF_ERROR, "");
2378 		TAVOR_TNF_EXIT(tavor_queue_alloc);
2379 		return (DDI_FAILURE);
2380 	}
2381 
2382 	/*
2383 	 * Determine the amount of memory to allocate, depending on the values
2384 	 * in "qa_bind_align" and "qa_alloc_align".  The problem we are trying
2385 	 * to solve here is that allocating a DMA handle with IOMMU bypass
2386 	 * (DDI_DMA_FORCE_PHYSICAL) constrains us to only requesting alignments
2387 	 * that are less than the page size.  Since we may need stricter
2388 	 * alignments on the memory allocated by ddi_dma_mem_alloc() (e.g. in
2389 	 * Tavor QP work queue memory allocation), we use the following method
2390 	 * to calculate how much additional memory to request, and we enforce
2391 	 * our own alignment on the allocated result.
2392 	 */
2393 	alloc_mask = qa_info->qa_alloc_align - 1;
2394 	if (qa_info->qa_bind_align == qa_info->qa_alloc_align) {
2395 		realsize = qa_info->qa_size;
2396 	} else {
2397 		realsize = qa_info->qa_size + alloc_mask;
2398 	}
2399 
2400 	/*
2401 	 * If we are to allocate the queue from system memory, then use
2402 	 * ddi_dma_mem_alloc() to find the space.  Otherwise, if we are to
2403 	 * allocate the queue from locally-attached DDR memory, then use the
2404 	 * vmem allocator to find the space.  In either case, return a pointer
2405 	 * to the memory range allocated (including any necessary alignment
2406 	 * adjustments), the "real" memory pointer, the "real" size, and a
2407 	 * ddi_acc_handle_t to use when reading from/writing to the memory.
2408 	 */
2409 	if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {
2410 
2411 		/*
2412 		 * Determine whether to map STREAMING or CONSISTENT.  This is
2413 		 * based on the value set in the configuration profile at
2414 		 * attach time.
2415 		 */
2416 		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
2417 
2418 		/* Allocate system memory for the queue */
2419 		status = ddi_dma_mem_alloc(qa_info->qa_dmahdl, realsize,
2420 		    &state->ts_reg_accattr, dma_xfer_mode, callback, NULL,
2421 		    (caddr_t *)&qa_info->qa_buf_real,
2422 		    (size_t *)&qa_info->qa_buf_realsz, &qa_info->qa_acchdl);
2423 		if (status != DDI_SUCCESS) {
2424 			ddi_dma_free_handle(&qa_info->qa_dmahdl);
2425 			TNF_PROBE_0(tavor_queue_alloc_dma_memalloc_fail,
2426 			    TAVOR_TNF_ERROR, "");
2427 			TAVOR_TNF_EXIT(tavor_queue_alloc);
2428 			return (DDI_FAILURE);
2429 		}
2430 
2431 		/*
2432 		 * Save temporary copy of the real pointer.  (This may be
2433 		 * modified in the last step below).
2434 		 */
2435 		qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2436 
2437 	} else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {
2438 
2439 		/* Allocate userland mappable memory for the queue */
2440 		flag = (sleepflag == TAVOR_SLEEP) ? DDI_UMEM_SLEEP :
2441 		    DDI_UMEM_NOSLEEP;
2442 		qa_info->qa_buf_real = ddi_umem_alloc(realsize, flag,
2443 		    &qa_info->qa_umemcookie);
2444 		if (qa_info->qa_buf_real == NULL) {
2445 			ddi_dma_free_handle(&qa_info->qa_dmahdl);
2446 			TNF_PROBE_0(tavor_queue_alloc_umem_fail,
2447 			    TAVOR_TNF_ERROR, "");
2448 			TAVOR_TNF_EXIT(tavor_queue_alloc);
2449 			return (DDI_FAILURE);
2450 		}
2451 
2452 		/*
2453 		 * Save temporary copy of the real pointer.  (This may be
2454 		 * modified in the last step below).
2455 		 */
2456 		qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2457 
2458 	} else {  /* TAVOR_QUEUE_LOCATION_INDDR */
2459 
2460 		/* Allocate DDR memory for the queue */
2461 		flag = (sleepflag == TAVOR_SLEEP) ? VM_SLEEP : VM_NOSLEEP;
2462 		qa_info->qa_buf_real = (uint32_t *)vmem_xalloc(
2463 		    state->ts_ddrvmem, realsize, qa_info->qa_bind_align, 0, 0,
2464 		    NULL, NULL, flag);
2465 		if (qa_info->qa_buf_real == NULL) {
2466 			ddi_dma_free_handle(&qa_info->qa_dmahdl);
2467 			TNF_PROBE_0(tavor_queue_alloc_vmxa_fail,
2468 			    TAVOR_TNF_ERROR, "");
2469 			TAVOR_TNF_EXIT(tavor_queue_alloc);
2470 			return (DDI_FAILURE);
2471 		}
2472 
2473 		/*
2474 		 * Since "qa_buf_real" will be a PCI address (the offset into
2475 		 * the DDR memory), we first need to do some calculations to
2476 		 * convert it to its kernel mapped address.  (Note: This may
2477 		 * be modified again below, when any additional "alloc"
2478 		 * alignment constraint is applied).
2479 		 */
2480 		qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
2481 		    state->ts_reg_ddr_baseaddr) + ((uintptr_t)
2482 		    qa_info->qa_buf_real - state->ts_ddr.ddr_baseaddr));
2483 		qa_info->qa_buf_realsz	= realsize;
2484 		qa_info->qa_acchdl	= state->ts_reg_ddrhdl;
2485 	}
2486 
2487 	/*
2488 	 * The last step is to ensure that the final address ("qa_buf_aligned")
2489 	 * has the appropriate "alloc" alignment restriction applied to it
2490 	 * (if necessary).
2491 	 */
2492 	if (qa_info->qa_bind_align != qa_info->qa_alloc_align) {
2493 		qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
2494 		    qa_info->qa_buf_aligned + alloc_mask) & ~alloc_mask);
2495 	}
2496 
2497 	TAVOR_TNF_EXIT(tavor_queue_alloc);
2498 	return (DDI_SUCCESS);
2499 }
2500 
2501 
2502 /*
2503  * tavor_queue_free()
2504  *    Context: Can be called from interrupt or base context.
2505  */
2506 void
2507 tavor_queue_free(tavor_state_t *state, tavor_qalloc_info_t *qa_info)
2508 {
2509 	TAVOR_TNF_ENTER(tavor_queue_free);
2510 
2511 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2512 
2513 	/*
2514 	 * Depending on how (i.e. from where) we allocated the memory for
2515 	 * this queue, we choose the appropriate method for releasing the
2516 	 * resources.
2517 	 */
2518 	if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {
2519 
2520 		ddi_dma_mem_free(&qa_info->qa_acchdl);
2521 
2522 	} else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {
2523 
2524 		ddi_umem_free(qa_info->qa_umemcookie);
2525 
2526 	} else {  /* TAVOR_QUEUE_LOCATION_INDDR */
2527 
2528 		vmem_xfree(state->ts_ddrvmem, qa_info->qa_buf_real,
2529 		    qa_info->qa_buf_realsz);
2530 	}
2531 
2532 	/* Always free the dma handle */
2533 	ddi_dma_free_handle(&qa_info->qa_dmahdl);
2534 
2535 	TAVOR_TNF_EXIT(tavor_queue_free);
2536 }
2537 
2538 
2539 /*
2540  * tavor_dmaattr_get()
2541  *    Context: Can be called from interrupt or base context.
2542  */
2543 void
2544 tavor_dma_attr_init(ddi_dma_attr_t *dma_attr)
2545 {
2546 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*dma_attr))
2547 
2548 	dma_attr->dma_attr_version	= DMA_ATTR_V0;
2549 	dma_attr->dma_attr_addr_lo	= 0;
2550 	dma_attr->dma_attr_addr_hi	= 0xFFFFFFFFFFFFFFFFull;
2551 	dma_attr->dma_attr_count_max	= 0xFFFFFFFFFFFFFFFFull;
2552 	dma_attr->dma_attr_align	= 1;
2553 	dma_attr->dma_attr_burstsizes	= 0x3FF;
2554 	dma_attr->dma_attr_minxfer	= 1;
2555 	dma_attr->dma_attr_maxxfer	= 0xFFFFFFFFFFFFFFFFull;
2556 	dma_attr->dma_attr_seg		= 0xFFFFFFFFFFFFFFFFull;
2557 	dma_attr->dma_attr_sgllen	= 0x7FFFFFFF;
2558 	dma_attr->dma_attr_granular	= 1;
2559 	dma_attr->dma_attr_flags	= 0;
2560 }
2561 
2562 /*
2563  * tavor_destroy_fmr_pool()
2564  * Create a pool of FMRs.
2565  *     Context: Can be called from kernel context only.
2566  */
2567 int
2568 tavor_create_fmr_pool(tavor_state_t *state, tavor_pdhdl_t pd,
2569     ibt_fmr_pool_attr_t *fmr_attr, tavor_fmrhdl_t *fmrpoolp)
2570 {
2571 	tavor_fmrhdl_t	fmrpool;
2572 	tavor_fmr_list_t *fmr, *fmr_next;
2573 	tavor_mrhdl_t   mr;
2574 	char		taskqname[48];
2575 	char		*errormsg;
2576 	int		status;
2577 	int		sleep;
2578 	int		i;
2579 
2580 	TAVOR_TNF_ENTER(tavor_create_fmr_pool);
2581 
2582 	sleep = (fmr_attr->fmr_flags & IBT_MR_SLEEP) ? TAVOR_SLEEP :
2583 	    TAVOR_NOSLEEP;
2584 	if ((sleep == TAVOR_SLEEP) &&
2585 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
2586 		TNF_PROBE_0(tavor_create_fmr_pool_invalid_flags,
2587 		    TAVOR_TNF_ERROR, "");
2588 		TAVOR_TNF_EXIT(tavor_create_fmr_pool);
2589 		return (IBT_INVALID_PARAM);
2590 	}
2591 
2592 	fmrpool = (tavor_fmrhdl_t)kmem_zalloc(sizeof (*fmrpool), sleep);
2593 	if (fmrpool == NULL) {
2594 		/* Set "status" and "errormsg" and goto failure */
2595 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed FMR Pool handle");
2596 		goto fail;
2597 	}
2598 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmrpool))
2599 
2600 	mutex_init(&fmrpool->fmr_lock, NULL, MUTEX_DRIVER,
2601 	    DDI_INTR_PRI(state->ts_intrmsi_pri));
2602 
2603 	fmrpool->fmr_state	    = state;
2604 	fmrpool->fmr_flush_function = fmr_attr->fmr_func_hdlr;
2605 	fmrpool->fmr_flush_arg	    = fmr_attr->fmr_func_arg;
2606 	fmrpool->fmr_pool_size	    = 0;
2607 	fmrpool->fmr_cache	    = 0;
2608 	fmrpool->fmr_max_pages	    = fmr_attr->fmr_max_pages_per_fmr;
2609 	fmrpool->fmr_page_sz	    = fmr_attr->fmr_page_sz;
2610 	fmrpool->fmr_dirty_watermark = fmr_attr->fmr_dirty_watermark;
2611 	fmrpool->fmr_dirty_len	    = 0;
2612 	fmrpool->fmr_flags	    = fmr_attr->fmr_flags;
2613 
2614 	/* Create taskq to handle cleanup and flush processing */
2615 	(void) snprintf(taskqname, 50, "fmrpool/%d/%d @ 0x%" PRIx64,
2616 	    fmr_attr->fmr_pool_size, tavor_debug_fmrpool_cnt,
2617 	    (uint64_t)(uintptr_t)fmrpool);
2618 	fmrpool->fmr_taskq = ddi_taskq_create(state->ts_dip, taskqname,
2619 	    TAVOR_TASKQ_NTHREADS, TASKQ_DEFAULTPRI, 0);
2620 	if (fmrpool->fmr_taskq == NULL) {
2621 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed task queue");
2622 		goto fail1;
2623 	}
2624 
2625 	fmrpool->fmr_free_list = NULL;
2626 	fmrpool->fmr_dirty_list = NULL;
2627 
2628 	if (fmr_attr->fmr_cache) {
2629 		tavor_fmr_cache_init(fmrpool);
2630 	}
2631 
2632 	for (i = 0; i < fmr_attr->fmr_pool_size; i++) {
2633 		status = tavor_mr_alloc_fmr(state, pd, fmrpool, &mr);
2634 		if (status != DDI_SUCCESS) {
2635 			TAVOR_TNF_FAIL(status, "failed fmr alloc");
2636 			goto fail2;
2637 		}
2638 
2639 		fmr = (tavor_fmr_list_t *)kmem_zalloc(
2640 		    sizeof (tavor_fmr_list_t), sleep);
2641 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2642 
2643 		fmr->fmr = mr;
2644 		fmr->fmr_refcnt = 0;
2645 		fmr->fmr_remaps = 0;
2646 		fmr->fmr_pool = fmrpool;
2647 		fmr->fmr_in_cache = 0;
2648 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
2649 		mr->mr_fmr = fmr;
2650 
2651 		fmr->fmr_next = fmrpool->fmr_free_list;
2652 		fmrpool->fmr_free_list = fmr;
2653 		fmrpool->fmr_pool_size++;
2654 	}
2655 
2656 	/* Set to return pool */
2657 	*fmrpoolp = fmrpool;
2658 
2659 	TAVOR_TNF_EXIT(tavor_create_fmr_pool);
2660 	return (IBT_SUCCESS);
2661 fail2:
2662 	tavor_fmr_cache_fini(fmrpool);
2663 	for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
2664 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2665 		fmr_next = fmr->fmr_next;
2666 		(void) tavor_mr_dealloc_fmr(state, &fmr->fmr);
2667 		kmem_free(fmr, sizeof (tavor_fmr_list_t));
2668 	}
2669 	ddi_taskq_destroy(fmrpool->fmr_taskq);
2670 fail1:
2671 	kmem_free(fmrpool, sizeof (*fmrpool));
2672 fail:
2673 	TNF_PROBE_1(tavor_create_fmr_pool_fail, TAVOR_TNF_ERROR, "",
2674 	    tnf_string, msg, errormsg);
2675 	TAVOR_TNF_EXIT(tavor_create_fmr_pool);
2676 	if (status == DDI_FAILURE) {
2677 		return (ibc_get_ci_failure(0));
2678 	} else {
2679 		return (status);
2680 	}
2681 }
2682 
2683 /*
2684  * tavor_destroy_fmr_pool()
2685  * Destroy an FMR pool and free all associated resources.
2686  *     Context: Can be called from kernel context only.
2687  */
2688 int
2689 tavor_destroy_fmr_pool(tavor_state_t *state, tavor_fmrhdl_t fmrpool)
2690 {
2691 	tavor_fmr_list_t	*fmr, *fmr_next;
2692 	char			*errormsg;
2693 	int			status;
2694 
2695 	TAVOR_TNF_ENTER(tavor_destroy_fmr_pool);
2696 
2697 	mutex_enter(&fmrpool->fmr_lock);
2698 	status = tavor_fmr_cleanup(state, fmrpool);
2699 	if (status != DDI_SUCCESS) {
2700 		mutex_exit(&fmrpool->fmr_lock);
2701 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed fmr cleanup");
2702 		goto fail;
2703 	}
2704 
2705 	if (fmrpool->fmr_cache) {
2706 		tavor_fmr_cache_fini(fmrpool);
2707 	}
2708 
2709 	for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
2710 		fmr_next = fmr->fmr_next;
2711 
2712 		(void) tavor_mr_dealloc_fmr(state, &fmr->fmr);
2713 		kmem_free(fmr, sizeof (tavor_fmr_list_t));
2714 	}
2715 	mutex_exit(&fmrpool->fmr_lock);
2716 
2717 	ddi_taskq_destroy(fmrpool->fmr_taskq);
2718 	mutex_destroy(&fmrpool->fmr_lock);
2719 
2720 	kmem_free(fmrpool, sizeof (*fmrpool));
2721 
2722 	TAVOR_TNF_EXIT(tavor_destroy_fmr_pool);
2723 	return (DDI_SUCCESS);
2724 fail:
2725 	TNF_PROBE_1(tavor_destroy_fmr_pool_fail, TAVOR_TNF_ERROR, "",
2726 	    tnf_string, msg, errormsg);
2727 	TAVOR_TNF_EXIT(tavor_destroy_fmr_pool);
2728 	return (status);
2729 }
2730 
2731 /*
2732  * tavor_flush_fmr_pool()
2733  * Ensure that all unmapped FMRs are fully invalidated.
2734  *     Context: Can be called from kernel context only.
2735  */
2736 int
2737 tavor_flush_fmr_pool(tavor_state_t *state, tavor_fmrhdl_t fmrpool)
2738 {
2739 	char		*errormsg;
2740 	int		status;
2741 
2742 	TAVOR_TNF_ENTER(tavor_flush_fmr_pool);
2743 
2744 	/*
2745 	 * Force the unmapping of all entries on the dirty list, regardless of
2746 	 * whether the watermark has been hit yet.
2747 	 */
2748 	/* grab the pool lock */
2749 	mutex_enter(&fmrpool->fmr_lock);
2750 	status = tavor_fmr_cleanup(state, fmrpool);
2751 	if (status != DDI_SUCCESS) {
2752 		mutex_exit(&fmrpool->fmr_lock);
2753 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed fmr cleanup");
2754 		goto fail;
2755 	}
2756 	/* release the pool lock */
2757 	mutex_exit(&fmrpool->fmr_lock);
2758 
2759 	TAVOR_TNF_EXIT(tavor_flush_fmr_pool);
2760 	return (DDI_SUCCESS);
2761 fail:
2762 	TNF_PROBE_1(tavor_flush_fmr_pool_fail, TAVOR_TNF_ERROR, "",
2763 	    tnf_string, msg, errormsg);
2764 	TAVOR_TNF_EXIT(tavor_flush_fmr_pool);
2765 	return (status);
2766 }
2767 
2768 /*
2769  * tavor_deregister_fmr()
2770  * Map memory into FMR
2771  *    Context: Can be called from interrupt or base context.
2772  */
2773 int
2774 tavor_register_physical_fmr(tavor_state_t *state, tavor_fmrhdl_t fmrpool,
2775     ibt_pmr_attr_t *mem_pattr, tavor_mrhdl_t *mr,
2776     ibt_pmr_desc_t *mem_desc_p)
2777 {
2778 	tavor_fmr_list_t	*fmr;
2779 	tavor_fmr_list_t	query;
2780 	avl_index_t		where;
2781 	int			status;
2782 
2783 	TAVOR_TNF_ENTER(tavor_register_physical_fmr);
2784 
2785 	/* Check length */
2786 	mutex_enter(&fmrpool->fmr_lock);
2787 	if (mem_pattr->pmr_len < 1 || (mem_pattr->pmr_num_buf >
2788 	    fmrpool->fmr_max_pages)) {
2789 		mutex_exit(&fmrpool->fmr_lock);
2790 		TNF_PROBE_0(tavor_register_physical_fmr_length_fail,
2791 		    TAVOR_TNF_ERROR, "");
2792 		TAVOR_TNF_EXIT(tavor_register_physical_fmr);
2793 		return (IBT_MR_LEN_INVALID);
2794 	}
2795 
2796 	mutex_enter(&fmrpool->fmr_cachelock);
2797 	/* lookup in fmr cache */
2798 	/* if exists, grab it, and return it */
2799 	if (fmrpool->fmr_cache) {
2800 		query.fmr_desc.pmd_iova = mem_pattr->pmr_iova;
2801 		query.fmr_desc.pmd_phys_buf_list_sz = mem_pattr->pmr_len;
2802 		fmr = (tavor_fmr_list_t *)avl_find(&fmrpool->fmr_cache_avl,
2803 		    &query, &where);
2804 
2805 		/*
2806 		 * If valid FMR was found in cache, return that fmr info
2807 		 */
2808 		if (fmr != NULL) {
2809 			fmr->fmr_refcnt++;
2810 			/* Store pmr desc for use in cache */
2811 			(void) memcpy(mem_desc_p, &fmr->fmr_desc,
2812 			    sizeof (ibt_pmr_desc_t));
2813 			*mr = (tavor_mrhdl_t)fmr->fmr;
2814 			mutex_exit(&fmrpool->fmr_cachelock);
2815 			mutex_exit(&fmrpool->fmr_lock);
2816 			TAVOR_TNF_EXIT(tavor_register_physical_fmr);
2817 			return (DDI_SUCCESS);
2818 		}
2819 	}
2820 
2821 	/* FMR does not exist in cache, proceed with registration */
2822 
2823 	/* grab next free entry */
2824 	fmr = fmrpool->fmr_free_list;
2825 	if (fmr == NULL) {
2826 		mutex_exit(&fmrpool->fmr_cachelock);
2827 		mutex_exit(&fmrpool->fmr_lock);
2828 		TNF_PROBE_0(tavor_register_physical_fmr_none_free,
2829 		    TAVOR_TNF_ERROR, "");
2830 		TAVOR_TNF_EXIT(tavor_register_physical_fmr);
2831 		return (IBT_INSUFF_RESOURCE);
2832 	}
2833 
2834 	fmrpool->fmr_free_list = fmrpool->fmr_free_list->fmr_next;
2835 	fmr->fmr_next = NULL;
2836 
2837 	status = tavor_mr_register_physical_fmr(state, mem_pattr, fmr->fmr,
2838 	    mem_desc_p);
2839 	if (status != DDI_SUCCESS) {
2840 		mutex_exit(&fmrpool->fmr_cachelock);
2841 		mutex_exit(&fmrpool->fmr_lock);
2842 		TNF_PROBE_0(tavor_register_physical_fmr_reg_fail,
2843 		    TAVOR_TNF_ERROR, "");
2844 		TAVOR_TNF_EXIT(tavor_register_physical_fmr);
2845 		return (status);
2846 	}
2847 
2848 	fmr->fmr_refcnt = 1;
2849 	fmr->fmr_remaps++;
2850 
2851 	/* Store pmr desc for use in cache */
2852 	(void) memcpy(&fmr->fmr_desc, mem_desc_p, sizeof (ibt_pmr_desc_t));
2853 	*mr = (tavor_mrhdl_t)fmr->fmr;
2854 
2855 	/* Store in cache */
2856 	if (fmrpool->fmr_cache) {
2857 		if (!fmr->fmr_in_cache) {
2858 			avl_insert(&fmrpool->fmr_cache_avl, fmr, where);
2859 			fmr->fmr_in_cache = 1;
2860 		}
2861 	}
2862 
2863 	mutex_exit(&fmrpool->fmr_cachelock);
2864 	mutex_exit(&fmrpool->fmr_lock);
2865 	TAVOR_TNF_EXIT(tavor_register_physical_fmr);
2866 	return (DDI_SUCCESS);
2867 }
2868 
2869 /*
2870  * tavor_deregister_fmr()
2871  * Unmap FMR
2872  *    Context: Can be called from kernel context only.
2873  */
2874 int
2875 tavor_deregister_fmr(tavor_state_t *state, tavor_mrhdl_t mr)
2876 {
2877 	tavor_fmr_list_t	*fmr;
2878 	tavor_fmrhdl_t		fmrpool;
2879 	int			status;
2880 
2881 	fmr = mr->mr_fmr;
2882 	fmrpool = fmr->fmr_pool;
2883 
2884 	/* Grab pool lock */
2885 	mutex_enter(&fmrpool->fmr_lock);
2886 	fmr->fmr_refcnt--;
2887 
2888 	if (fmr->fmr_refcnt == 0) {
2889 		/*
2890 		 * First, do some bit of invalidation, reducing our exposure to
2891 		 * having this region still registered in hardware.
2892 		 */
2893 		(void) tavor_mr_invalidate_fmr(state, mr);
2894 
2895 		/*
2896 		 * If we've exhausted our remaps then add the FMR to the dirty
2897 		 * list, not allowing it to be re-used until we have done a
2898 		 * flush.  Otherwise, simply add it back to the free list for
2899 		 * re-mapping.
2900 		 */
2901 		if (fmr->fmr_remaps <
2902 		    state->ts_cfg_profile->cp_fmr_max_remaps) {
2903 			/* add to free list */
2904 			fmr->fmr_next = fmrpool->fmr_free_list;
2905 			fmrpool->fmr_free_list = fmr;
2906 		} else {
2907 			/* add to dirty list */
2908 			fmr->fmr_next = fmrpool->fmr_dirty_list;
2909 			fmrpool->fmr_dirty_list = fmr;
2910 			fmrpool->fmr_dirty_len++;
2911 
2912 			status = ddi_taskq_dispatch(fmrpool->fmr_taskq,
2913 			    tavor_fmr_processing, fmrpool, DDI_NOSLEEP);
2914 			if (status == DDI_FAILURE) {
2915 				mutex_exit(&fmrpool->fmr_lock);
2916 				TNF_PROBE_0(tavor_agent_request_cb_taskq_fail,
2917 				    TAVOR_TNF_ERROR, "");
2918 				return (IBT_INSUFF_RESOURCE);
2919 			}
2920 		}
2921 	}
2922 	/* Release pool lock */
2923 	mutex_exit(&fmrpool->fmr_lock);
2924 
2925 	return (DDI_SUCCESS);
2926 }
2927 
2928 
2929 /*
2930  * tavor_fmr_processing()
2931  * If required, perform cleanup.
2932  *     Context: Called from taskq context only.
2933  */
2934 static void
2935 tavor_fmr_processing(void *fmr_args)
2936 {
2937 	tavor_fmrhdl_t		fmrpool;
2938 	char			*errormsg;
2939 	int			status;
2940 
2941 	TAVOR_TNF_ENTER(tavor_fmr_processing);
2942 
2943 	ASSERT(fmr_args != NULL);
2944 
2945 	fmrpool = (tavor_fmrhdl_t)fmr_args;
2946 
2947 	/* grab pool lock */
2948 	mutex_enter(&fmrpool->fmr_lock);
2949 	if (fmrpool->fmr_dirty_len >= fmrpool->fmr_dirty_watermark) {
2950 		status = tavor_fmr_cleanup(fmrpool->fmr_state, fmrpool);
2951 		if (status != DDI_SUCCESS) {
2952 			mutex_exit(&fmrpool->fmr_lock);
2953 			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
2954 			    "failed fmr cleanup");
2955 			goto fail;
2956 		}
2957 
2958 		if (fmrpool->fmr_flush_function != NULL) {
2959 			(void) fmrpool->fmr_flush_function(
2960 			    (ibc_fmr_pool_hdl_t)fmrpool,
2961 			    fmrpool->fmr_flush_arg);
2962 		}
2963 	}
2964 
2965 	/* let pool lock go */
2966 	mutex_exit(&fmrpool->fmr_lock);
2967 
2968 	TAVOR_TNF_EXIT(tavor_fmr_processing);
2969 	return;
2970 fail:
2971 	TNF_PROBE_1(tavor_fmr_processing, TAVOR_TNF_ERROR, "",
2972 	    tnf_string, msg, errormsg);
2973 	TAVOR_TNF_EXIT(tavor_fmr_processing);
2974 }
2975 
2976 /*
2977  * tavor_fmr_cleanup()
2978  * Perform cleaning processing, walking the list and performing the MTT sync
2979  * operation if required.
2980  *    Context: can be called from taskq or base context.
2981  */
2982 static int
2983 tavor_fmr_cleanup(tavor_state_t *state, tavor_fmrhdl_t fmrpool)
2984 {
2985 	tavor_fmr_list_t	*fmr;
2986 	tavor_fmr_list_t	*fmr_next;
2987 	int			sync_needed;
2988 	int			status;
2989 
2990 	TAVOR_TNF_ENTER(tavor_fmr_cleanup);
2991 
2992 	ASSERT(MUTEX_HELD(&fmrpool->fmr_lock));
2993 
2994 	sync_needed = 0;
2995 	for (fmr = fmrpool->fmr_dirty_list; fmr; fmr = fmr_next) {
2996 		fmr_next = fmr->fmr_next;
2997 		fmr->fmr_remaps = 0;
2998 
2999 		(void) tavor_mr_deregister_fmr(state, fmr->fmr);
3000 
3001 		/*
3002 		 * Update lists.
3003 		 * - add fmr back to free list
3004 		 * - remove fmr from dirty list
3005 		 */
3006 		fmr->fmr_next = fmrpool->fmr_free_list;
3007 		fmrpool->fmr_free_list = fmr;
3008 
3009 
3010 		/*
3011 		 * Because we have updated the dirty list, and deregistered the
3012 		 * FMR entry, we do need to sync the TPT, so we set the
3013 		 * 'sync_needed' flag here so we sync once we finish dirty_list
3014 		 * processing.
3015 		 */
3016 		sync_needed = 1;
3017 	}
3018 
3019 	fmrpool->fmr_dirty_list = NULL;
3020 	fmrpool->fmr_dirty_len = 0;
3021 
3022 	if (sync_needed) {
3023 		status = tavor_sync_tpt_cmd_post(state, TAVOR_CMD_NOSLEEP_SPIN);
3024 		if (status != TAVOR_CMD_SUCCESS) {
3025 			TNF_PROBE_0(tavor_fmr_cleanup, TAVOR_TNF_ERROR, "");
3026 			TAVOR_TNF_EXIT(tavor_fmr_cleanup);
3027 			return (status);
3028 		}
3029 	}
3030 
3031 	TAVOR_TNF_EXIT(tavor_fmr_cleanup);
3032 	return (DDI_SUCCESS);
3033 }
3034 
3035 /*
3036  * tavor_fmr_avl_compare()
3037  *    Context: Can be called from user or kernel context.
3038  */
3039 static int
3040 tavor_fmr_avl_compare(const void *q, const void *e)
3041 {
3042 	tavor_fmr_list_t *entry, *query;
3043 
3044 	TAVOR_TNF_ENTER(tavor_qpn_avl_compare);
3045 
3046 	entry = (tavor_fmr_list_t *)e;
3047 	query = (tavor_fmr_list_t *)q;
3048 
3049 	if (query->fmr_desc.pmd_iova < entry->fmr_desc.pmd_iova) {
3050 		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
3051 		return (-1);
3052 	} else if (query->fmr_desc.pmd_iova > entry->fmr_desc.pmd_iova) {
3053 		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
3054 		return (+1);
3055 	} else {
3056 		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
3057 		return (0);
3058 	}
3059 }
3060 
3061 
3062 /*
3063  * tavor_fmr_cache_init()
3064  *    Context: Can be called from user or kernel context.
3065  */
3066 static void
3067 tavor_fmr_cache_init(tavor_fmrhdl_t fmr)
3068 {
3069 	TAVOR_TNF_ENTER(tavor_fmr_cache_init);
3070 
3071 	/* Initialize the lock used for FMR cache AVL tree access */
3072 	mutex_init(&fmr->fmr_cachelock, NULL, MUTEX_DRIVER,
3073 	    DDI_INTR_PRI(fmr->fmr_state->ts_intrmsi_pri));
3074 
3075 	/* Initialize the AVL tree for the FMR cache */
3076 	avl_create(&fmr->fmr_cache_avl, tavor_fmr_avl_compare,
3077 	    sizeof (tavor_fmr_list_t),
3078 	    offsetof(tavor_fmr_list_t, fmr_avlnode));
3079 
3080 	fmr->fmr_cache = 1;
3081 
3082 	TAVOR_TNF_EXIT(tavor_fmr_cache_init);
3083 }
3084 
3085 
3086 /*
3087  * tavor_fmr_cache_fini()
3088  *    Context: Can be called from user or kernel context.
3089  */
3090 static void
3091 tavor_fmr_cache_fini(tavor_fmrhdl_t fmr)
3092 {
3093 	void			*cookie;
3094 
3095 	TAVOR_TNF_ENTER(tavor_fmr_cache_fini);
3096 
3097 	/*
3098 	 * Empty all entries (if necessary) and destroy the AVL tree.
3099 	 * The FMRs themselves are freed as part of destroy_pool()
3100 	 */
3101 	cookie = NULL;
3102 	while (((void *)(tavor_fmr_list_t *)avl_destroy_nodes(
3103 	    &fmr->fmr_cache_avl, &cookie)) != NULL) {
3104 		/* loop through */
3105 	}
3106 	avl_destroy(&fmr->fmr_cache_avl);
3107 
3108 	/* Destroy the lock used for FMR cache */
3109 	mutex_destroy(&fmr->fmr_cachelock);
3110 
3111 	TAVOR_TNF_EXIT(tavor_fmr_cache_fini);
3112 }
3113 
3114 /*
3115  * tavor_get_dma_cookies()
3116  * Return DMA cookies in the pre-allocated paddr_list_p based on the length
3117  * needed.
3118  *    Context: Can be called from interrupt or base context.
3119  */
3120 int
3121 tavor_get_dma_cookies(tavor_state_t *state, ibt_phys_buf_t *paddr_list_p,
3122     ibt_va_attr_t *va_attrs, uint_t list_len, uint_t *cookiecnt,
3123     ibc_ma_hdl_t *ibc_ma_hdl_p)
3124 {
3125 	ddi_dma_handle_t	dma_hdl;
3126 	ddi_dma_attr_t		dma_attr;
3127 	ddi_dma_cookie_t	dmacookie;
3128 	uint_t			dma_xfer_mode;
3129 	int			(*callback)(caddr_t);
3130 	int			status;
3131 	int			i;
3132 
3133 	TAVOR_TNF_ENTER(tavor_get_dma_cookies);
3134 
3135 	/* Set the callback flag appropriately */
3136 	callback = (va_attrs->va_flags & IBT_VA_NOSLEEP) ? DDI_DMA_DONTWAIT :
3137 	    DDI_DMA_SLEEP;
3138 	if ((callback == DDI_DMA_SLEEP) &&
3139 	    (TAVOR_SLEEP != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
3140 		TNF_PROBE_0(tavor_ci_map_mem_area_invalid_flags,
3141 		    TAVOR_TNF_ERROR, "");
3142 		TAVOR_TNF_EXIT(tavor_ci_map_mem_area);
3143 		return (IBT_INVALID_PARAM);
3144 	}
3145 
3146 	/*
3147 	 * Initialize many of the default DMA attributes and allocate the DMA
3148 	 * handle.  Then, if we're bypassing the IOMMU, set the
3149 	 * DDI_DMA_FORCE_PHYSICAL flag.
3150 	 */
3151 	tavor_dma_attr_init(&dma_attr);
3152 
3153 #ifdef __x86
3154 	/*
3155 	 * On x86 we can specify a maximum segment length for our returned
3156 	 * cookies.
3157 	 */
3158 	if (va_attrs->va_flags & IBT_VA_FMR) {
3159 		dma_attr.dma_attr_seg = PAGESIZE - 1;
3160 	}
3161 #endif
3162 
3163 	/* Determine whether to map STREAMING or CONSISTENT */
3164 	dma_xfer_mode = (va_attrs->va_flags & IBT_VA_NONCOHERENT) ?
3165 	    DDI_DMA_STREAMING : DDI_DMA_CONSISTENT;
3166 
3167 #ifdef	__sparc
3168 	/*
3169 	 * First, disable streaming and switch to consistent if
3170 	 * configured to do so and IOMMU BYPASS is enabled.
3171 	 */
3172 	if (state->ts_cfg_profile->cp_disable_streaming_on_bypass &&
3173 	    dma_xfer_mode == DDI_DMA_STREAMING &&
3174 	    state->ts_cfg_profile->cp_iommu_bypass == TAVOR_BINDMEM_BYPASS) {
3175 		dma_xfer_mode = DDI_DMA_CONSISTENT;
3176 	}
3177 
3178 	/*
3179 	 * Then, if streaming is still specified, then "bypass" is not
3180 	 * allowed.
3181 	 */
3182 	if ((dma_xfer_mode == DDI_DMA_CONSISTENT) &&
3183 	    (state->ts_cfg_profile->cp_iommu_bypass == TAVOR_BINDMEM_BYPASS)) {
3184 		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
3185 	}
3186 #endif
3187 
3188 	status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr,
3189 	    callback, NULL, &dma_hdl);
3190 	if (status != DDI_SUCCESS) {
3191 		TNF_PROBE_1(tavor_ci_map_mem_area_alloc_handle_fail,
3192 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
3193 		TAVOR_TNF_EXIT(tavor_ci_map_mem_area);
3194 
3195 		switch (status) {
3196 		case DDI_DMA_NORESOURCES:
3197 			return (IBT_INSUFF_RESOURCE);
3198 		case DDI_DMA_BADATTR:
3199 		default:
3200 			return (ibc_get_ci_failure(0));
3201 		}
3202 	}
3203 
3204 	/*
3205 	 * Now bind the handle with the correct DMA attributes.
3206 	 */
3207 	if (va_attrs->va_flags & IBT_VA_BUF) {
3208 		status = ddi_dma_buf_bind_handle(dma_hdl, va_attrs->va_buf,
3209 		    DDI_DMA_RDWR | dma_xfer_mode, DDI_DMA_DONTWAIT,
3210 		    NULL, &dmacookie, cookiecnt);
3211 	} else {
3212 		status = ddi_dma_addr_bind_handle(dma_hdl, NULL,
3213 		    (caddr_t)(uintptr_t)va_attrs->va_vaddr, va_attrs->va_len,
3214 		    DDI_DMA_RDWR | dma_xfer_mode, DDI_DMA_DONTWAIT,
3215 		    NULL, &dmacookie, cookiecnt);
3216 	}
3217 	if (status != DDI_SUCCESS) {
3218 		ddi_dma_free_handle(&dma_hdl);
3219 		TNF_PROBE_0(tavor_ci_map_mem_area_bind_handle_fail,
3220 		    TAVOR_TNF_ERROR, "");
3221 		TAVOR_TNF_EXIT(tavor_ci_map_mem_area);
3222 
3223 		switch (status) {
3224 		case DDI_DMA_NORESOURCES:
3225 			return (IBT_INSUFF_RESOURCE);
3226 		case DDI_DMA_TOOBIG:
3227 			return (IBT_INVALID_PARAM);
3228 		case DDI_DMA_PARTIAL_MAP:
3229 		case DDI_DMA_INUSE:
3230 		case DDI_DMA_NOMAPPING:
3231 		default:
3232 			return (ibc_get_ci_failure(0));
3233 		}
3234 	}
3235 
3236 	/*
3237 	 * Verify our physical buffer list (PBL) is large enough to handle the
3238 	 * number of cookies that were returned.
3239 	 */
3240 	if (*cookiecnt > list_len) {
3241 		(void) ddi_dma_unbind_handle(dma_hdl);
3242 		ddi_dma_free_handle(&dma_hdl);
3243 		TNF_PROBE_0(tavor_ci_map_mem_area_toomany_cookie_fail,
3244 		    TAVOR_TNF_ERROR, "");
3245 		TAVOR_TNF_EXIT(tavor_ci_map_mem_area);
3246 		return (IBT_PBL_TOO_SMALL);
3247 	}
3248 
3249 	/*
3250 	 * We store the cookies returned by the DDI into our own PBL.  This
3251 	 * sets the cookies up for later processing (for example, if we want to
3252 	 * split up the cookies into smaller chunks).  We use the laddr and
3253 	 * size fields in each cookie to create each individual entry (PBE).
3254 	 */
3255 
3256 	/*
3257 	 * Store first cookie info first
3258 	 */
3259 	paddr_list_p[0].p_laddr = dmacookie.dmac_laddress;
3260 	paddr_list_p[0].p_size = dmacookie.dmac_size;
3261 
3262 	/*
3263 	 * Loop through each cookie, storing each cookie into our physical
3264 	 * buffer list.
3265 	 */
3266 	for (i = 1; i < *cookiecnt; i++) {
3267 		ddi_dma_nextcookie(dma_hdl, &dmacookie);
3268 
3269 		paddr_list_p[i].p_laddr = dmacookie.dmac_laddress;
3270 		paddr_list_p[i].p_size  = dmacookie.dmac_size;
3271 	}
3272 
3273 	/* return handle */
3274 	*ibc_ma_hdl_p = (ibc_ma_hdl_t)dma_hdl;
3275 	TAVOR_TNF_EXIT(tavor_get_dma_cookies);
3276 	return (DDI_SUCCESS);
3277 }
3278 
3279 /*
3280  * tavor_split_dma_cookies()
3281  * Split up cookies passed in from paddr_list_p, returning the new list in the
3282  * same buffers, based on the pagesize to split the cookies into.
3283  *    Context: Can be called from interrupt or base context.
3284  */
3285 /* ARGSUSED */
3286 int
3287 tavor_split_dma_cookies(tavor_state_t *state, ibt_phys_buf_t *paddr_list,
3288     ib_memlen_t *paddr_offset, uint_t list_len, uint_t *cookiecnt,
3289     uint_t pagesize)
3290 {
3291 	uint64_t	pageoffset;
3292 	uint64_t	pagemask;
3293 	uint_t		pageshift;
3294 	uint_t		current_cookiecnt;
3295 	uint_t		cookies_needed;
3296 	uint64_t	last_size, extra_cookie;
3297 	int		i_increment;
3298 	int		i, k;
3299 	int		status;
3300 
3301 	TAVOR_TNF_ENTER(tavor_split_dma_cookies);
3302 
3303 	/* Setup pagesize calculations */
3304 	pageoffset = pagesize - 1;
3305 	pagemask = (~pageoffset);
3306 	pageshift = highbit(pagesize) - 1;
3307 
3308 	/*
3309 	 * Setup first cookie offset based on pagesize requested.
3310 	 */
3311 	*paddr_offset = paddr_list[0].p_laddr & pageoffset;
3312 	paddr_list[0].p_laddr &= pagemask;
3313 
3314 	/* Save away the current number of cookies that are passed in */
3315 	current_cookiecnt = *cookiecnt;
3316 
3317 	/* Perform splitting up of current cookies into pagesize blocks */
3318 	for (i = 0; i < current_cookiecnt; i += i_increment) {
3319 		/*
3320 		 * If the cookie is smaller than pagesize, or already is
3321 		 * pagesize, then we are already within our limits, so we skip
3322 		 * it.
3323 		 */
3324 		if (paddr_list[i].p_size <= pagesize) {
3325 			i_increment = 1;
3326 			continue;
3327 		}
3328 
3329 		/*
3330 		 * If this is our first cookie, then we have to deal with the
3331 		 * offset that may be present in the first address.  So add
3332 		 * that to our size, to calculate potential change to the last
3333 		 * cookie's size.
3334 		 *
3335 		 * Also, calculate the number of cookies that we'll need to
3336 		 * split up this block into.
3337 		 */
3338 		if (i == 0) {
3339 			last_size = (paddr_list[i].p_size + *paddr_offset) &
3340 			    pageoffset;
3341 			cookies_needed = (paddr_list[i].p_size +
3342 			    *paddr_offset) >> pageshift;
3343 		} else {
3344 			last_size = 0;
3345 			cookies_needed = paddr_list[i].p_size >> pageshift;
3346 		}
3347 
3348 		/*
3349 		 * If our size is not a multiple of pagesize, we need one more
3350 		 * cookie.
3351 		 */
3352 		if (last_size) {
3353 			extra_cookie = 1;
3354 		} else {
3355 			extra_cookie = 0;
3356 		}
3357 
3358 		/*
3359 		 * Split cookie into pagesize chunks, shifting list of cookies
3360 		 * down, using more cookie slots in the PBL if necessary.
3361 		 */
3362 		status = tavor_dma_cookie_shift(paddr_list, i, list_len,
3363 		    current_cookiecnt - i, cookies_needed + extra_cookie);
3364 		if (status != 0) {
3365 			TNF_PROBE_0(tavor_split_cookies_toomany_fail,
3366 			    TAVOR_TNF_ERROR, "");
3367 			TAVOR_TNF_EXIT(tavor_dma_split_cookies);
3368 			return (status);
3369 		}
3370 
3371 		/*
3372 		 * If the very first cookie, we must take possible offset into
3373 		 * account.
3374 		 */
3375 		if (i == 0) {
3376 			paddr_list[i].p_size = pagesize - *paddr_offset;
3377 		} else {
3378 			paddr_list[i].p_size = pagesize;
3379 		}
3380 
3381 		/*
3382 		 * We have shifted the existing cookies down the PBL, now fill
3383 		 * in the blank entries by splitting up our current block.
3384 		 */
3385 		for (k = 1; k < cookies_needed; k++) {
3386 			paddr_list[i + k].p_laddr =
3387 			    paddr_list[i + k - 1].p_laddr + pagesize;
3388 			paddr_list[i + k].p_size = pagesize;
3389 		}
3390 
3391 		/* If we have one extra cookie (of less than pagesize...) */
3392 		if (extra_cookie) {
3393 			paddr_list[i + k].p_laddr =
3394 			    paddr_list[i + k - 1].p_laddr + pagesize;
3395 			paddr_list[i + k].p_size = last_size;
3396 		}
3397 
3398 		/* Increment cookiecnt appropriately based on cookies used */
3399 		i_increment = cookies_needed + extra_cookie;
3400 		current_cookiecnt += i_increment - 1;
3401 	}
3402 
3403 	/* Update to new cookie count */
3404 	*cookiecnt = current_cookiecnt;
3405 	TAVOR_TNF_EXIT(tavor_dma_split_cookies);
3406 	return (DDI_SUCCESS);
3407 }
3408 
3409 /*
3410  * tavor_dma_cookie_shift()
3411  *    Context: Can be called from interrupt or base context.
3412  */
3413 int
3414 tavor_dma_cookie_shift(ibt_phys_buf_t *paddr_list, int start, int end,
3415     int cookiecnt, int num_shift)
3416 {
3417 	int shift_start;
3418 	int i;
3419 
3420 	TAVOR_TNF_ENTER(tavor_dma_cookie_shift);
3421 
3422 	/* Calculating starting point in the PBL list */
3423 	shift_start = start + cookiecnt - 1;
3424 
3425 	/* Check if we're at the end of our PBL list */
3426 	if ((shift_start + num_shift - 1) >= end) {
3427 		TNF_PROBE_0(tavor_dma_cookie_shift_toomany_fail,
3428 		    TAVOR_TNF_ERROR, "");
3429 		TAVOR_TNF_EXIT(tavor_dma_cookie_shift);
3430 		return (IBT_PBL_TOO_SMALL);
3431 	}
3432 
3433 	for (i = shift_start; i > start; i--) {
3434 		paddr_list[i + num_shift - 1] = paddr_list[i];
3435 	}
3436 
3437 	TAVOR_TNF_EXIT(tavor_dma_cookie_shift);
3438 	return (DDI_SUCCESS);
3439 }
3440 
3441 
3442 /*
3443  * tavor_free_dma_cookies()
3444  *    Context: Can be called from interrupt or base context.
3445  */
3446 int
3447 tavor_free_dma_cookies(ibc_ma_hdl_t ma_hdl)
3448 {
3449 	ddi_dma_handle_t	dma_hdl;
3450 	int			status;
3451 
3452 	dma_hdl = (ddi_dma_handle_t)ma_hdl;
3453 
3454 	status = ddi_dma_unbind_handle(dma_hdl);
3455 	if (status != DDI_SUCCESS) {
3456 		TNF_PROBE_0(tavor_ci_free_dma_unbind_fail,
3457 		    TAVOR_TNF_ERROR, "");
3458 		TAVOR_TNF_EXIT(tavor_ci_unmap_mem_area);
3459 		return (ibc_get_ci_failure(0));
3460 	}
3461 
3462 	ddi_dma_free_handle(&dma_hdl);
3463 
3464 	return (DDI_SUCCESS);
3465 }
3466