/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_mr.c
 *    Tavor Memory Region/Window Routines
 *
 *    Implements all the routines necessary to provide the requisite memory
 *    registration verbs.  These include operations like RegisterMemRegion(),
 *    DeregisterMemRegion(), ReregisterMemRegion(), RegisterSharedMemRegion(),
 *    etc., that affect Memory Regions.  It also includes the verbs that
 *    affect Memory Windows, including AllocMemWindow(), FreeMemWindow(),
 *    and QueryMemWindow().
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/esunddi.h>

#include <sys/ib/adapters/tavor/tavor.h>


/*
 * Used by tavor_mr_keycalc() below to fill in the "unconstrained" portion
 * of Tavor memory keys (LKeys and RKeys)
 */
static uint_t tavor_debug_memkey_cnt = 0x00000000;

static int tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
    tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op);
static int tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
    tavor_mr_options_t *op);
static int tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
    uint_t sleep, uint_t *dereg_level);
static uint64_t tavor_mr_nummtt_needed(tavor_state_t *state,
    tavor_bind_info_t *bind, uint_t *mtt_pgsize);
static int tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
    ddi_dma_handle_t dmahdl, uint_t sleep);
static void tavor_mr_mem_unbind(tavor_state_t *state,
    tavor_bind_info_t *bind);
static int tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
    uint32_t mtt_pgsize_bits);
static int tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc);
static int tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc);

/*
 * The Tavor umem_lockmemory() callback ops.  When userland memory is
 * registered, these callback ops are specified.  The tavor_umap_umemlock_cb()
 * callback will be called whenever the memory for the corresponding
 * ddi_umem_cookie_t is being freed.
 */
static struct umem_callback_ops tavor_umem_cbops = {
	UMEM_CALLBACK_VERSION,
	tavor_umap_umemlock_cb,
};


/*
 * tavor_mr_register()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_register(tavor_state_t *state, tavor_pdhdl_t pd,
    ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	TAVOR_TNF_ENTER(tavor_mr_register);

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.
	 */
	bind.bi_type  = TAVOR_BINDHDL_VADDR;
	bind.bi_addr  = mr_attr->mr_vaddr;
	bind.bi_len   = mr_attr->mr_len;
	bind.bi_as    = mr_attr->mr_as;
	bind.bi_flags = mr_attr->mr_flags;
	status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_mr_register_cmnreg_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_register);
		return (status);
	}

	TAVOR_TNF_EXIT(tavor_mr_register);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_register_buf()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_register_buf(tavor_state_t *state, tavor_pdhdl_t pd,
    ibt_smr_attr_t *mr_attr, struct buf *buf, tavor_mrhdl_t *mrhdl,
    tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	TAVOR_TNF_ENTER(tavor_mr_register_buf);

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (see above) and a "buf" binding (as is the case
	 * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.  Note: We have chosen to provide
	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
	 * not set).  It is not critical what value we choose here as it need
	 * only be unique for the given RKey (which will happen by default),
	 * so the choice here is somewhat arbitrary.
	 */
	bind.bi_type  = TAVOR_BINDHDL_BUF;
	bind.bi_buf   = buf;
	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
		bind.bi_addr  = mr_attr->mr_vaddr;
	} else {
		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
	}
	bind.bi_as    = NULL;
	bind.bi_len   = (uint64_t)buf->b_bcount;
	bind.bi_flags = mr_attr->mr_flags;
	status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_mr_register_buf_cmnreg_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_register_buf);
		return (status);
	}

	TAVOR_TNF_EXIT(tavor_mr_register_buf);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_register_shared()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_register_shared(tavor_state_t *state, tavor_mrhdl_t mrhdl,
    tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*mpt, *mtt, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mrhdl_t		mr;
	tavor_bind_info_t	*bind;
	ddi_umem_cookie_t	umem_cookie;
	size_t			umem_len;
	caddr_t			umem_addr;
	uint64_t		mtt_addr, mtt_ddrbaseaddr, pgsize_msk;
	uint_t			sleep, mr_is_umem;
	int			status, umem_flags;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_register_shared);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (mr_attr->mr_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP :
	    TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mrshared_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry.  This will be filled in with all the
	 * necessary parameters to define the shared memory region.
	 * Specifically, it will be made to reference the currently existing
	 * MTT entries and ownership of the MPT will be passed to the hardware
	 * in the last step below.  If we fail here, we must undo the
	 * protection domain reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
		goto mrshared_fail1;
	}

	/*
	 * Allocate the software structure for tracking the shared memory
	 * region (i.e. the Tavor Memory Region handle).  If we fail here, we
	 * must undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
		goto mrshared_fail2;
	}
	mr = (tavor_mrhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/*
	 * Setup and validate the memory region access flags.  This means
	 * translating the IBTF's enable flags into the access flags that
	 * will be used in later operations.
	 */
	mr->mr_accflag = 0;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_WINDOW_BIND)
		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_READ)
		mr->mr_accflag |= IBT_MR_REMOTE_READ;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/* Grab the MR lock for the current memory region */
	mutex_enter(&mrhdl->mr_lock);

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
	 * If so, this is an error, return failure.
	 */
	if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
		mutex_exit(&mrhdl->mr_lock);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
		goto mrshared_fail3;
	}

	/*
	 * Determine if the original memory was from userland and, if so, pin
	 * the pages (again) with umem_lockmemory().  This will guarantee a
	 * separate callback for each of this shared region's MR handles.
	 * If this is userland memory, then allocate an entry in the
	 * "userland resources database".  This will later be added to
	 * the database (after all further memory registration operations are
	 * successful).  If we fail here, we must undo all the above setup.
	 */
	mr_is_umem = mrhdl->mr_is_umem;
	if (mr_is_umem) {
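		/*
		 * Note: the length is grown by the offset into the first
		 * page and then rounded up to a whole number of pages,
		 * while the address is rounded down to a page boundary,
		 * so the lock always covers complete pages.
		 */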
		umem_len   = ptob(btopr(mrhdl->mr_bindinfo.bi_len +
		    ((uintptr_t)mrhdl->mr_bindinfo.bi_addr & PAGEOFFSET)));
		umem_addr  = (caddr_t)((uintptr_t)mrhdl->mr_bindinfo.bi_addr &
		    ~PAGEOFFSET);
		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
		    DDI_UMEMLOCK_LONGTERM);
		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
		    &umem_cookie, &tavor_umem_cbops, NULL);
		if (status != 0) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
			goto mrshared_fail3;
		}

		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto mrshared_fail4;
		}
	}

	/*
	 * Copy the MTT resource pointer (and additional parameters) from
	 * the original Tavor Memory Region handle.  Note: this is normally
	 * where the tavor_mr_mem_bind() routine would be called, but because
	 * we already have bound and filled-in MTT entries it is simply a
	 * matter here of managing the MTT reference count and grabbing the
	 * address of the MTT table entries (for filling in the shared region's
	 * MPT entry).
	 */
	mr->mr_mttrsrcp	  = mrhdl->mr_mttrsrcp;
	mr->mr_logmttpgsz = mrhdl->mr_logmttpgsz;
	mr->mr_bindinfo	  = mrhdl->mr_bindinfo;
	mr->mr_mttrefcntp = mrhdl->mr_mttrefcntp;
	mutex_exit(&mrhdl->mr_lock);
	bind = &mr->mr_bindinfo;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
	mtt = mr->mr_mttrsrcp;

	/*
	 * Increment the MTT reference count (to reflect the fact that
	 * the MTT is now shared)
	 */
	(void) tavor_mtt_refcnt_inc(mr->mr_mttrefcntp);

	/*
	 * Update the new "bind" virtual address.  Do some extra work here
	 * to ensure proper alignment.  That is, make sure that the page
	 * offset for the beginning of the old range is the same as the
	 * offset for this new mapping
	 */
	pgsize_msk = (((uint64_t)1 << mr->mr_logmttpgsz) - 1);
	bind->bi_addr = ((mr_attr->mr_vaddr & ~pgsize_msk) |
	    (mr->mr_bindinfo.bi_addr & pgsize_msk));
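	/*
	 * Illustration (hypothetical values): with 4KB MTT pages
	 * (pgsize_msk == 0xFFF), an original binding at 0x10001A2C and a
	 * new "mr_vaddr" of 0x2000F000 would combine to 0x2000FA2C,
	 * preserving the original offset (0xA2C) within the page.
	 */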

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * in the next step when we are setting up the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.lr	  = 1;
	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
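	/*
	 * Note: "page_sz" encodes the MTT page size relative to 4KB
	 * (2^12); hence the 0xC (i.e. 12) subtracted from the log2 page
	 * size below, with a value of zero meaning 4KB pages.
	 */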
	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
	mpt_entry.mem_key	= mr->mr_lkey;
	mpt_entry.pd		= pd->pd_pdnum;
	mpt_entry.start_addr	= bind->bi_addr;
	mpt_entry.reg_win_len	= bind->bi_len;
	mpt_entry.win_cnt_limit	= TAVOR_UNLIMITED_WIN_BIND;
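	/*
	 * Note: MTT segments are 64-byte aligned (complete cachelines),
	 * so only the upper 32 bits and bits 6 through 31 of the MTT
	 * address need to be stored; the low six bits are implicitly zero.
	 */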
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mr_register_shared_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "tavor SW2HW_MPT command");
		goto mrshared_fail5;
	}

	/*
	 * Fill in the rest of the Tavor Memory Region handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MR.
	 */
	mr->mr_mptrsrcp	  = mpt;
	mr->mr_mttrsrcp	  = mtt;
	mr->mr_pdhdl	  = pd;
	mr->mr_rsrcp	  = rsrc;
	mr->mr_is_umem	  = mr_is_umem;
	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
	mr->mr_umem_cbfunc = NULL;
	mr->mr_umem_cbarg1 = NULL;
	mr->mr_umem_cbarg2 = NULL;

	/*
	 * If this is userland memory, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later coordination between the tavor_umap_umemlock_cb()
	 * callback and tavor_mr_deregister().
	 */
	if (mr_is_umem) {
		tavor_umap_db_add(umapdb);
	}

	*mrhdl_new = mr;

	TAVOR_TNF_EXIT(tavor_mr_register_shared);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
mrshared_fail5:
	(void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
	if (mr_is_umem) {
		tavor_umap_db_free(umapdb);
	}
mrshared_fail4:
	if (mr_is_umem) {
		ddi_umem_unlock(umem_cookie);
	}
mrshared_fail3:
	tavor_rsrc_free(state, &rsrc);
mrshared_fail2:
	tavor_rsrc_free(state, &mpt);
mrshared_fail1:
	tavor_pd_refcnt_dec(pd);
mrshared_fail:
	TNF_PROBE_1(tavor_mr_register_shared_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mr_register_shared);
	return (status);
}


/*
 * tavor_mr_deregister()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_deregister(tavor_state_t *state, tavor_mrhdl_t *mrhdl, uint_t level,
    uint_t sleep)
{
	tavor_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
	tavor_umap_db_entry_t	*umapdb;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_bind_info_t	*bind;
	uint64_t		value;
	int			status, shared_mtt;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_deregister);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
		TNF_PROBE_1(tavor_mr_deregister_fail, TAVOR_TNF_ERROR, "",
		    tnf_string, msg, errormsg);
		TAVOR_TNF_EXIT(tavor_mr_deregister);
		return (status);
	}

	/*
	 * Pull all the necessary information from the Tavor Memory Region
	 * handle.  This is necessary here because the resource for the
	 * MR handle is going to be freed up as part of this
	 * deregistration.
	 */
	mr	= *mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt	= mr->mr_mptrsrcp;
	mtt	= mr->mr_mttrsrcp;
	mtt_refcnt = mr->mr_mttrefcntp;
	rsrc	= mr->mr_rsrcp;
	pd	= mr->mr_pdhdl;
	bind	= &mr->mr_bindinfo;

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of the tavor_umap_umemlock_cb() callback.
	 * If so, then jump to the end and free the remaining resources.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		goto mrdereg_finish_cleanup;
	}

	/*
	 * We must drop the "mr_lock" here to ensure that both SLEEP and
	 * NOSLEEP calls into the firmware work as expected.  Also, if two
	 * threads are attempting to access this MR (via de-register,
	 * re-register, or otherwise), then we allow the firmware to
	 * enforce that only one deregister is valid.
	 */
	mutex_exit(&mr->mr_lock);

	/*
	 * Reclaim MPT entry from hardware (if necessary).  Since the
	 * tavor_mr_deregister() routine is used in the memory region
	 * reregistration process as well, it is possible that we will
	 * not always wish to reclaim ownership of the MPT.  Check the
	 * "level" arg and, if necessary, attempt to reclaim it.  If
	 * the ownership transfer fails for any reason, we check to see
	 * what command status was returned from the hardware.  The only
	 * "expected" error status is the one that indicates an attempt to
	 * deregister a memory region that has memory windows bound to it
	 */
	if (level >= TAVOR_MR_DEREG_ALL) {
		status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT,
		    NULL, 0, mpt->tr_indx, sleep);
		if (status != TAVOR_CMD_SUCCESS) {
			if (status == TAVOR_CMD_REG_BOUND) {
				TAVOR_TNF_EXIT(tavor_mr_deregister);
				return (IBT_MR_IN_USE);
			} else {
				cmn_err(CE_CONT, "Tavor: HW2SW_MPT command "
				    "failed: %08x\n", status);
				TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_mr_deregister);
				return (IBT_INVALID_PARAM);
			}
		}
	}

	/*
	 * Re-grab the mr_lock here.  Since further access to the protected
	 * 'mr' structure is needed, and we would have returned previously for
	 * the multiple deregistration case, we can safely grab the lock here.
	 */
	mutex_enter(&mr->mr_lock);

	/*
	 * If the memory had come from userland, then we do a lookup in the
	 * "userland resources database".  On success, we free the entry, call
	 * ddi_umem_unlock(), and continue the cleanup.  On failure (which is
	 * an indication that the umem_lockmemory() callback has called
	 * tavor_mr_deregister()), we call ddi_umem_unlock() and invalidate
	 * the "mr_umemcookie" field in the MR handle (this will be used
	 * later to detect that only partial cleanup still remains to be done
	 * on the MR handle).
	 */
	if (mr->mr_is_umem) {
		status = tavor_umap_db_find(state->ts_instance,
		    (uint64_t)(uintptr_t)mr->mr_umemcookie,
		    MLNX_UMAP_MRMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status == DDI_SUCCESS) {
			tavor_umap_db_free(umapdb);
			ddi_umem_unlock(mr->mr_umemcookie);
		} else {
			ddi_umem_unlock(mr->mr_umemcookie);
			mr->mr_umemcookie = NULL;
		}
	}

	/*
	 * Decrement the MTT reference count.  Since the MTT resource
	 * may be shared between multiple memory regions (as a result
	 * of a "RegisterSharedMR" verb) it is important that we not
	 * free up or unbind resources prematurely.  If it's not shared (as
	 * indicated by the return status), then free the resource.
	 */
	shared_mtt = tavor_mtt_refcnt_dec(mtt_refcnt);
	if (!shared_mtt) {
		tavor_rsrc_free(state, &mtt_refcnt);
	}

	/*
	 * Free up the MTT entries and unbind the memory.  Here, as above, we
	 * attempt to free these resources only if it is appropriate to do so.
	 */
	if (!shared_mtt) {
		if (level >= TAVOR_MR_DEREG_NO_HW2SW_MPT) {
			tavor_mr_mem_unbind(state, bind);
		}
		tavor_rsrc_free(state, &mtt);
	}

	/*
	 * If the MR handle has been invalidated, then drop the
	 * lock and return success.  Note: This only happens because
	 * the umem_lockmemory() callback has been triggered.  The
	 * cleanup here is partial, and further cleanup (in a
	 * subsequent tavor_mr_deregister() call) will be necessary.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		mutex_exit(&mr->mr_lock);
		TAVOR_TNF_EXIT(tavor_mr_deregister);
		return (DDI_SUCCESS);
	}

mrdereg_finish_cleanup:
	mutex_exit(&mr->mr_lock);

	/* Free the Tavor Memory Region handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the MPT entry resource */
	tavor_rsrc_free(state, &mpt);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the mrhdl pointer to NULL and return success */
	*mrhdl = NULL;

	TAVOR_TNF_EXIT(tavor_mr_deregister);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_query()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_query(tavor_state_t *state, tavor_mrhdl_t mr,
    ibt_mr_query_attr_t *attr)
{
	TAVOR_TNF_ENTER(tavor_mr_query);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr))

	mutex_enter(&mr->mr_lock);

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
	 * If so, this is an error, return failure.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		mutex_exit(&mr->mr_lock);
		TNF_PROBE_0(tavor_mr_query_inv_mrhdl_fail, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_query);
		return (IBT_MR_HDL_INVALID);
	}

	/* Fill in the queried attributes */
	attr->mr_attr_flags = mr->mr_accflag;
	attr->mr_pd	= (ibt_pd_hdl_t)mr->mr_pdhdl;

	/* Fill in the "local" attributes */
	attr->mr_lkey = (ibt_lkey_t)mr->mr_lkey;
	attr->mr_lbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
	attr->mr_lbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;

	/*
	 * Fill in the "remote" attributes (if necessary).  Note: the
	 * remote attributes are only valid if the memory region has one
	 * or more of the remote access flags set.
	 */
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		attr->mr_rkey = (ibt_rkey_t)mr->mr_rkey;
		attr->mr_rbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
		attr->mr_rbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
	}

	/*
	 * If the region is mapped for streaming (i.e. noncoherent), then
	 * report that a sync is required.
	 */
	attr->mr_sync_required = (mr->mr_bindinfo.bi_flags &
	    IBT_MR_NONCOHERENT) ? B_TRUE : B_FALSE;

	mutex_exit(&mr->mr_lock);
	TAVOR_TNF_EXIT(tavor_mr_query);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_reregister()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_reregister(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new,
    tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	TAVOR_TNF_ENTER(tavor_mr_reregister);

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration (and reregistration) routines.
	 */
	bind.bi_type  = TAVOR_BINDHDL_VADDR;
	bind.bi_addr  = mr_attr->mr_vaddr;
	bind.bi_len   = mr_attr->mr_len;
	bind.bi_as    = mr_attr->mr_as;
	bind.bi_flags = mr_attr->mr_flags;
	status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_mr_reregister_cmnreg_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_reregister);
		return (status);
	}

	TAVOR_TNF_EXIT(tavor_mr_reregister);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_reregister_buf()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_reregister_buf(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, struct buf *buf,
    tavor_mrhdl_t *mrhdl_new, tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	TAVOR_TNF_ENTER(tavor_mr_reregister_buf);

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (see above) and a "buf" binding (as is the case
	 * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.  Note: We have chosen to provide
	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
	 * not set).  It is not critical what value we choose here as it need
	 * only be unique for the given RKey (which will happen by default),
	 * so the choice here is somewhat arbitrary.
	 */
	bind.bi_type  = TAVOR_BINDHDL_BUF;
	bind.bi_buf   = buf;
	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
		bind.bi_addr  = mr_attr->mr_vaddr;
	} else {
		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
	}
	bind.bi_len   = (uint64_t)buf->b_bcount;
	bind.bi_flags = mr_attr->mr_flags;
	bind.bi_as = NULL;
	status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_mr_reregister_buf_cmnreg_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
		return (status);
	}

	TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_sync()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_sync(tavor_state_t *state, ibt_mr_sync_t *mr_segs, size_t num_segs)
{
	tavor_mrhdl_t		mrhdl;
	uint64_t		seg_vaddr, seg_len, seg_end;
	uint64_t		mr_start, mr_end;
	uint_t			type;
	int			status, i;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_sync);

	/* Process each of the ibt_mr_sync_t's */
	for (i = 0; i < num_segs; i++) {
		mrhdl = (tavor_mrhdl_t)mr_segs[i].ms_handle;

		/* Check for valid memory region handle */
		if (mrhdl == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
			goto mrsync_fail;
		}

		mutex_enter(&mrhdl->mr_lock);

		/*
		 * Check here to see if the memory region has already been
		 * partially deregistered as a result of a
		 * tavor_umap_umemlock_cb() callback.  If so, this is an
		 * error, return failure.
		 */
		if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl2");
			goto mrsync_fail;
		}

		/* Check for valid bounds on sync request */
		seg_vaddr = mr_segs[i].ms_vaddr;
		seg_len	  = mr_segs[i].ms_len;
		seg_end	  = seg_vaddr + seg_len - 1;
		mr_start  = mrhdl->mr_bindinfo.bi_addr;
		mr_end	  = mr_start + mrhdl->mr_bindinfo.bi_len - 1;
		if ((seg_vaddr < mr_start) || (seg_vaddr > mr_end)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_VA_INVALID, "invalid vaddr");
			goto mrsync_fail;
		}
		if ((seg_end < mr_start) || (seg_end > mr_end)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
			goto mrsync_fail;
		}

		/* Determine what type (i.e. direction) for sync */
		if (mr_segs[i].ms_flags & IBT_SYNC_READ) {
			type = DDI_DMA_SYNC_FORDEV;
		} else if (mr_segs[i].ms_flags & IBT_SYNC_WRITE) {
			type = DDI_DMA_SYNC_FORCPU;
		} else {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sync type");
			goto mrsync_fail;
		}

		(void) ddi_dma_sync(mrhdl->mr_bindinfo.bi_dmahdl,
		    (off_t)(seg_vaddr - mr_start), (size_t)seg_len, type);
		mutex_exit(&mrhdl->mr_lock);
	}

	TAVOR_TNF_EXIT(tavor_mr_sync);
	return (DDI_SUCCESS);

mrsync_fail:
	TNF_PROBE_1(tavor_mr_sync_fail, TAVOR_TNF_ERROR, "", tnf_string, msg,
	    errormsg);
	TAVOR_TNF_EXIT(tavor_mr_sync);
	return (status);
}


/*
 * tavor_mw_alloc()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mw_alloc(tavor_state_t *state, tavor_pdhdl_t pd, ibt_mw_flags_t flags,
    tavor_mwhdl_t *mwhdl)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mwhdl_t		mw;
	uint_t			sleep;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mw_alloc);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (flags & IBT_MW_NOSLEEP) ? TAVOR_NOSLEEP : TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mwalloc_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry (for use as a memory window).  Since the
	 * Tavor hardware uses the MPT entry for memory regions and for
	 * memory windows, we will fill in this MPT with all the necessary
	 * parameters for the memory window.  And then (just as we do for
	 * memory regions) ownership will be passed to the hardware in the
	 * final step below.  If we fail here, we must undo the protection
	 * domain reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
		goto mwalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the memory window (i.e.
	 * the Tavor Memory Window handle).  Note: This is actually the same
	 * software structure used for tracking memory regions, but since many
	 * of the same properties are needed, only a single structure is
	 * necessary.  If we fail here, we must undo the protection domain
	 * reference count and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
		goto mwalloc_fail2;
	}
	mw = (tavor_mwhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))

	/*
	 * Calculate an "unbound" RKey from MPT index.  In much the same way
	 * as we do for memory regions (above), this key is constructed from
	 * a "constrained" (which depends on the MPT index) and an
	 * "unconstrained" portion (which may be arbitrarily chosen).
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mw->mr_rkey);

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.  Note: fewer fields in the MPT
	 * entry need to be filled in to allocate a memory window.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win	= TAVOR_MPT_IS_WINDOW;
	mpt_entry.mem_key	= mw->mr_rkey;
	mpt_entry.pd		= pd->pd_pdnum;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mw_alloc_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "tavor SW2HW_MPT command");
		goto mwalloc_fail3;
	}

	/*
	 * Fill in the rest of the Tavor Memory Window handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MW.
	 */
	mw->mr_mptrsrcp	= mpt;
	mw->mr_pdhdl	= pd;
	mw->mr_rsrcp	= rsrc;
	*mwhdl = mw;

	TAVOR_TNF_EXIT(tavor_mw_alloc);
	return (DDI_SUCCESS);

mwalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
mwalloc_fail2:
	tavor_rsrc_free(state, &mpt);
mwalloc_fail1:
	tavor_pd_refcnt_dec(pd);
mwalloc_fail:
	TNF_PROBE_1(tavor_mw_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mw_alloc);
	return (status);
}


/*
 * tavor_mw_free()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mw_free(tavor_state_t *state, tavor_mwhdl_t *mwhdl, uint_t sleep)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_mwhdl_t		mw;
	int			status;
	char			*errormsg;
	tavor_pdhdl_t		pd;

	TAVOR_TNF_ENTER(tavor_mw_free);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
		TNF_PROBE_1(tavor_mw_free_fail, TAVOR_TNF_ERROR, "",
		    tnf_string, msg, errormsg);
		TAVOR_TNF_EXIT(tavor_mw_free);
		return (status);
	}

	/*
	 * Pull all the necessary information from the Tavor Memory Window
	 * handle.  This is necessary here because the resource for the
	 * MW handle is going to be freed up as part of this operation.
	 */
	mw	= *mwhdl;
	mutex_enter(&mw->mr_lock);
	mpt	= mw->mr_mptrsrcp;
	rsrc	= mw->mr_rsrcp;
	pd	= mw->mr_pdhdl;
	mutex_exit(&mw->mr_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))

	/*
	 * Reclaim the MPT entry from hardware.  Note: in general, it is
	 * unexpected for this operation to return an error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, NULL,
	    0, mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail, TAVOR_TNF_ERROR, "",
		    tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_mw_free);
		return (IBT_INVALID_PARAM);
	}

	/* Free the Tavor Memory Window handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the MPT entry resource */
	tavor_rsrc_free(state, &mpt);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the mwhdl pointer to NULL and return success */
	*mwhdl = NULL;

	TAVOR_TNF_EXIT(tavor_mw_free);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_keycalc()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_mr_keycalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
{
	uint32_t	tmp, log_num_mpt;

	/*
	 * Generate a simple key from counter.  Note:  We increment this
	 * static variable _intentionally_ without any kind of mutex around
	 * it.  First, single-threading all operations through a single lock
	 * would be a bad idea (from a performance point-of-view).  Second,
	 * the upper "unconstrained" bits don't really have to be unique
	 * because the lower bits are guaranteed to be (although we do make a
	 * best effort to ensure that they are).  Third, the window for the
	 * race (where both threads read and update the counter at the same
	 * time) is incredibly small.
	 * And, lastly, we'd like to make this into a "random" key XXX
	 */
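	/*
	 * Illustration (hypothetical values): if "cp_log_num_mpt" were 17,
	 * then a counter value of 0x5 and an MPT index of 0x1234 would
	 * yield a key of (0x5 << 17) | 0x1234 = 0x000A1234.  The low
	 * "constrained" bits always equal the MPT index.
	 */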
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_memkey_cnt))
	log_num_mpt = state->ts_cfg_profile->cp_log_num_mpt;
	tmp = (tavor_debug_memkey_cnt++) << log_num_mpt;
	*key = tmp | indx;
}


/*
 * tavor_mr_common_reg()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
    tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
	tavor_umap_db_entry_t	*umapdb;
	tavor_sw_refcnt_t	*swrc_tmp;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mrhdl_t		mr;
	ibt_mr_flags_t		flags;
	tavor_bind_info_t	*bh;
	ddi_dma_handle_t	bind_dmahdl;
	ddi_umem_cookie_t	umem_cookie;
	size_t			umem_len;
	caddr_t			umem_addr;
	uint64_t		mtt_addr, mtt_ddrbaseaddr, max_sz;
	uint_t			sleep, mtt_pgsize_bits, bind_type, mr_is_umem;
	int			status, umem_flags, bind_override_addr;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_common_reg);

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the region should be bound normally (i.e. with
	 * entries written into the PCI IOMMU), whether it should be
	 * registered to bypass the IOMMU, and whether or not the resulting
	 * address should be "zero-based" (to aid the alignment restrictions
	 * for QPs).
	 */
	if (op == NULL) {
		bind_type   = TAVOR_BINDMEM_NORMAL;
		bind_dmahdl = NULL;
		bind_override_addr = 0;
	} else {
		bind_type	   = op->mro_bind_type;
		bind_dmahdl	   = op->mro_bind_dmahdl;
		bind_override_addr = op->mro_bind_override_addr;
	}

	/* Extract the flags field from the tavor_bind_info_t */
	flags = bind->bi_flags;

	/*
	 * Check for invalid length.  Check if the length is zero or if the
	 * length is larger than the maximum configured value.  Return error
	 * if it is.
	 */
	max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
		goto mrcommon_fail;
	}

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mrcommon_fail;
	}

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * below when we are setting up the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry.  This will be filled in with all the
	 * necessary parameters to define the memory region.  And then
	 * ownership will be passed to the hardware in the final step
	 * below.  If we fail here, we must undo the protection domain
	 * reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
		goto mrcommon_fail1;
	}

	/*
	 * Allocate the software structure for tracking the memory region (i.e.
	 * the Tavor Memory Region handle).  If we fail here, we must undo
	 * the protection domain reference count and the previous resource
	 * allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
		goto mrcommon_fail2;
	}
	mr = (tavor_mrhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/*
	 * Setup and validate the memory region access flags.  This means
	 * translating the IBTF's enable flags into the access flags that
	 * will be used in later operations.
	 */
	mr->mr_accflag = 0;
	if (flags & IBT_MR_ENABLE_WINDOW_BIND)
		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
	if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
	if (flags & IBT_MR_ENABLE_REMOTE_READ)
		mr->mr_accflag |= IBT_MR_REMOTE_READ;
	if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
	if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/*
	 * Determine if the memory is from userland and pin the pages
	 * with umem_lockmemory() if necessary.
	 * Then, if this is userland memory, allocate an entry in the
	 * "userland resources database".  This will later be added to
	 * the database (after all further memory registration operations are
	 * successful).  If we fail here, we must undo the reference counts
	 * and the previous resource allocations.
	 */
	mr_is_umem = (((bind->bi_as != NULL) && (bind->bi_as != &kas)) ? 1 : 0);
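	/*
	 * Note: a non-NULL address space pointer that is not the kernel's
	 * own address space (&kas) indicates a userland mapping.
	 */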
	if (mr_is_umem) {
		umem_len   = ptob(btopr(bind->bi_len +
		    ((uintptr_t)bind->bi_addr & PAGEOFFSET)));
		umem_addr  = (caddr_t)((uintptr_t)bind->bi_addr & ~PAGEOFFSET);
		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
		    DDI_UMEMLOCK_LONGTERM);
		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
		    &umem_cookie, &tavor_umem_cbops, NULL);
		if (status != 0) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
			goto mrcommon_fail3;
		}

		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind->bi_buf))

		bind->bi_buf = ddi_umem_iosetup(umem_cookie, 0, umem_len,
		    B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
		if (bind->bi_buf == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed iosetup");
			goto mrcommon_fail3;
		}
		bind->bi_type = TAVOR_BINDHDL_UBUF;
		bind->bi_buf->b_flags |= B_READ;

		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))

		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto mrcommon_fail4;
		}
	}

	/*
	 * Setup the bindinfo for the mtt bind call
	 */
	bh = &mr->mr_bindinfo;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bh))
	bcopy(bind, bh, sizeof (tavor_bind_info_t));
	bh->bi_bypass = bind_type;
	status = tavor_mr_mtt_bind(state, bh, bind_dmahdl, &mtt,
	    &mtt_pgsize_bits);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(status, "failed mtt bind");
		/*
		 * When mtt_bind fails, freerbuf has already been done,
		 * so make sure not to call it again.
		 */
		bind->bi_type = bh->bi_type;
		goto mrcommon_fail5;
	}
	mr->mr_logmttpgsz = mtt_pgsize_bits;

	/*
	 * Allocate MTT reference count (to track shared memory regions).
	 * This reference count resource may never be used on the given
	 * memory region, but if it is ever later registered as "shared"
	 * memory region then this resource will be necessary.  If we fail
	 * here, we do pretty much the same as above to clean up.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1, sleep,
	    &mtt_refcnt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed reference count");
		goto mrcommon_fail6;
	}
	mr->mr_mttrefcntp = mtt_refcnt;
	swrc_tmp = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_tmp))
	TAVOR_MTT_REFCNT_INIT(swrc_tmp);

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.lr	  = 1;
	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
	mpt_entry.mem_key	= mr->mr_lkey;
	mpt_entry.pd		= pd->pd_pdnum;
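	/*
	 * If a "zero-based" address was requested (the override case
	 * below), the start address is reduced to just its offset within
	 * the first MTT page, so the registered region effectively begins
	 * at (nearly) zero.
	 */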
	if (bind_override_addr == 0) {
		mpt_entry.start_addr = bh->bi_addr;
	} else {
		bh->bi_addr = bh->bi_addr & ((1 << mr->mr_logmttpgsz) - 1);
		mpt_entry.start_addr = bh->bi_addr;
	}
	mpt_entry.reg_win_len	= bh->bi_len;
	mpt_entry.win_cnt_limit	= TAVOR_UNLIMITED_WIN_BIND;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "tavor SW2HW_MPT command");
		goto mrcommon_fail7;
	}

	/*
	 * Fill in the rest of the Tavor Memory Region handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MR.
	 */
	mr->mr_mptrsrcp	  = mpt;
	mr->mr_mttrsrcp	  = mtt;
	mr->mr_pdhdl	  = pd;
	mr->mr_rsrcp	  = rsrc;
	mr->mr_is_umem	  = mr_is_umem;
	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
	mr->mr_umem_cbfunc = NULL;
	mr->mr_umem_cbarg1 = NULL;
	mr->mr_umem_cbarg2 = NULL;

	/*
	 * If this is userland memory, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later coordination between the tavor_umap_umemlock_cb()
	 * callback and tavor_mr_deregister().
	 */
	if (mr_is_umem) {
		tavor_umap_db_add(umapdb);
	}

	*mrhdl = mr;

	TAVOR_TNF_EXIT(tavor_mr_common_reg);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
mrcommon_fail7:
	tavor_rsrc_free(state, &mtt_refcnt);
mrcommon_fail6:
	tavor_rsrc_free(state, &mtt);
	tavor_mr_mem_unbind(state, bh);
	bind->bi_type = bh->bi_type;
mrcommon_fail5:
	if (mr_is_umem) {
		tavor_umap_db_free(umapdb);
	}
mrcommon_fail4:
	if (mr_is_umem) {
		/*
		 * Free up the memory ddi_umem_iosetup() allocates
		 * internally.
		 */
		if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
			freerbuf(bind->bi_buf);
			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
			bind->bi_type = TAVOR_BINDHDL_NONE;
			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
		}
		ddi_umem_unlock(umem_cookie);
	}
mrcommon_fail3:
	tavor_rsrc_free(state, &rsrc);
mrcommon_fail2:
	tavor_rsrc_free(state, &mpt);
mrcommon_fail1:
	tavor_pd_refcnt_dec(pd);
mrcommon_fail:
	TNF_PROBE_1(tavor_mr_common_reg_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mr_common_reg);
	return (status);
}

/*
 * tavor_mr_mtt_bind()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_mtt_bind(tavor_state_t *state, tavor_bind_info_t *bind,
    ddi_dma_handle_t bind_dmahdl, tavor_rsrc_t **mtt, uint_t *mtt_pgsize_bits)
{
	uint64_t		nummtt;
	uint_t			sleep;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_mtt_bind);
1500 
1501 	/*
1502 	 * Check the sleep flag.  Ensure that it is consistent with the
1503 	 * current thread context (i.e. if we are currently in the interrupt
1504 	 * context, then we shouldn't be attempting to sleep).
1505 	 */
1506 	sleep = (bind->bi_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1507 	if ((sleep == TAVOR_SLEEP) &&
1508 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1509 		/* Set "status" and "errormsg" and goto failure */
1510 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1511 		goto mrmttbind_fail;
1512 	}
1513 
1514 	/*
1515 	 * Bind the memory and determine the mapped addresses.  This is
1516 	 * the first of two routines that do all the "heavy lifting" for
1517 	 * the Tavor memory registration routines.  The tavor_mr_mem_bind()
1518 	 * routine takes the "bind" struct with all its fields filled
1519 	 * in and returns a list of DMA cookies (for the PCI mapped addresses
1520 	 * corresponding to the specified address region) which are used by
1521 	 * the tavor_mr_fast_mtt_write() routine below.  If we fail here, we
1522 	 * must undo all the previous resource allocation (and PD reference
1523 	 * count).
1524 	 */
1525 	status = tavor_mr_mem_bind(state, bind, bind_dmahdl, sleep);
1526 	if (status != DDI_SUCCESS) {
1527 		/* Set "status" and "errormsg" and goto failure */
1528 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
1529 		goto mrmttbind_fail;
1530 	}
1531 
1532 	/*
1533 	 * Determine number of pages spanned.  This routine uses the
1534 	 * information in the "bind" struct to determine the number of MTT
1535 	 * entries required (and returns the suggested page size - as a
1536 	 * "power-of-2" - for each MTT entry).
1537 	 */
1538 	nummtt = tavor_mr_nummtt_needed(state, bind, mtt_pgsize_bits);
1539 
1540 	/*
1541 	 * Allocate the MTT entries.  Use the calculations performed above to
1542 	 * allocate the required number of MTT entries.  Note: MTT entries are
1543 	 * allocated in "MTT segments" which consist of complete cachelines
1544 	 * (i.e. 8 entries, 16 entries, etc.)  So the TAVOR_NUMMTT_TO_MTTSEG()
1545 	 * macro is used to do the proper conversion.  If we fail here, we
1546 	 * must not only undo all the previous resource allocation (and PD
1547 	 * reference count), but we must also unbind the memory.
1548 	 */
1549 	status = tavor_rsrc_alloc(state, TAVOR_MTT,
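	/*
	 * For example, assuming the smallest (8-entry) MTT segment size
	 * described above, a region needing 20 MTT entries would consume
	 * ceil(20 / 8) == 3 segments; the exact rounding is hidden behind
	 * the TAVOR_NUMMTT_TO_MTTSEG() macro.
	 */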
1550 	    TAVOR_NUMMTT_TO_MTTSEG(nummtt), sleep, mtt);
1551 	if (status != DDI_SUCCESS) {
1552 		/* Set "status" and "errormsg" and goto failure */
1553 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
1554 		goto mrmttbind_fail2;
1555 	}
1556 
1557 	/*
1558 	 * Write the mapped addresses into the MTT entries.  This is part two
1559 	 * of the "heavy lifting" routines that we talked about above.  Note:
1560 	 * we pass the suggested page size from the earlier operation here.
1561 	 * If we fail here, we must free the MTT entries and unbind the memory.
1562 	 */
1563 	status = tavor_mr_fast_mtt_write(*mtt, bind, *mtt_pgsize_bits);
1564 	if (status != DDI_SUCCESS) {
1565 		/* Set "status" and "errormsg" and goto failure */
1566 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed write mtt");
1567 		goto mrmttbind_fail3;
1568 	}
1569 	TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
1570 	return (DDI_SUCCESS);
1571 
1572 /*
1573  * The following is cleanup for all possible failure cases in this routine
1574  */
1575 mrmttbind_fail3:
1576 	tavor_rsrc_free(state, mtt);
1577 mrmttbind_fail2:
1578 	tavor_mr_mem_unbind(state, bind);
1579 mrmttbind_fail:
1580 	TNF_PROBE_1(tavor_mr_mtt_bind_fail, TAVOR_TNF_ERROR, "",
1581 	    tnf_string, msg, errormsg);
1582 	TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
1583 	return (status);
1584 }
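/*
 * A minimal sketch of how a caller is expected to pair the two routines
 * above (hypothetical caller, error handling elided):
 *
 *	tavor_rsrc_t	*mtt;
 *	uint_t		pgsize_bits;
 *
 *	if (tavor_mr_mtt_bind(state, bind, NULL, &mtt, &pgsize_bits) ==
 *	    DDI_SUCCESS) {
 *		... program the MPT entry using "mtt" ...
 *		(void) tavor_mr_mtt_unbind(state, bind, mtt);
 *	}
 */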
1585 
1586 
1587 /*
1588  * tavor_mr_mtt_unbind()
1589  *    Context: Can be called from interrupt or base context.
1590  */
1591 int
1592 tavor_mr_mtt_unbind(tavor_state_t *state, tavor_bind_info_t *bind,
1593     tavor_rsrc_t *mtt)
1594 {
1595 	TAVOR_TNF_ENTER(tavor_mr_mtt_unbind);
1596 
1597 	/*
1598 	 * Free up the MTT entries and unbind the memory.  Here, as above, we
1599 	 * attempt to free these resources only if it is appropriate to do so.
1600 	 */
1601 	tavor_mr_mem_unbind(state, bind);
1602 	tavor_rsrc_free(state, &mtt);
1603 
1604 	TAVOR_TNF_EXIT(tavor_mr_mtt_unbind);
1605 	return (DDI_SUCCESS);
1606 }
1607 
1608 
1609 /*
1610  * tavor_mr_common_rereg()
1611  *    Context: Can be called from interrupt or base context.
1612  */
1613 static int
1614 tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
1615     tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
1616     tavor_mr_options_t *op)
1617 {
1618 	tavor_rsrc_t		*mpt;
1619 	ibt_mr_attr_flags_t	acc_flags_to_use;
1620 	ibt_mr_flags_t		flags;
1621 	tavor_pdhdl_t		pd_to_use;
1622 	tavor_hw_mpt_t		mpt_entry;
1623 	uint64_t		mtt_addr_to_use, vaddr_to_use, len_to_use;
1624 	uint_t			sleep, dereg_level;
1625 	int			status;
1626 	char			*errormsg;
1627 
1628 	TAVOR_TNF_ENTER(tavor_mr_common_rereg);
1629 
1630 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1631 
1632 	/*
1633 	 * Check here to see if the memory region corresponds to a userland
1634 	 * mapping.  Reregistration of userland memory regions is not
1635 	 * currently supported.  Return failure. XXX
1636 	 */
1637 	if (mr->mr_is_umem) {
1638 		/* Set "status" and "errormsg" and goto failure */
1639 		TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
1640 		goto mrrereg_fail;
1641 	}
1642 
1643 	mutex_enter(&mr->mr_lock);
1644 
1645 	/* Pull MPT resource pointer from the Tavor Memory Region handle */
1646 	mpt = mr->mr_mptrsrcp;
1647 
1648 	/* Extract the flags field from the tavor_bind_info_t */
1649 	flags = bind->bi_flags;
1650 
1651 	/*
1652 	 * Check the sleep flag.  Ensure that it is consistent with the
1653 	 * current thread context (i.e. if we are currently in the interrupt
1654 	 * context, then we shouldn't be attempting to sleep).
1655 	 */
1656 	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1657 	if ((sleep == TAVOR_SLEEP) &&
1658 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1659 		mutex_exit(&mr->mr_lock);
1660 		/* Set "status" and "errormsg" and goto failure */
1661 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1662 		goto mrrereg_fail;
1663 	}
1664 
1665 	/*
1666 	 * First step is to temporarily invalidate the MPT entry.  This
1667 	 * regains ownership from the hardware, and gives us the opportunity
1668 	 * to modify the entry.  Note: The HW2SW_MPT command returns the
1669 	 * current MPT entry contents.  These are saved away here because
1670 	 * they will be reused in a later step below.  If the region has
1671 	 * bound memory windows that we fail returning an "in use" error code.
1672 	 * bound memory windows, then we fail, returning an "in use" error
1673 	 * code.  Otherwise, an unexpected error causes us to deregister the
1674 	 *
1675 	 * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
1676 	 * against holding the lock around this rereg call in all contexts.
1677 	 */
1678 	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, &mpt_entry,
1679 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
1680 	if (status != TAVOR_CMD_SUCCESS) {
1681 		mutex_exit(&mr->mr_lock);
1682 		if (status == TAVOR_CMD_REG_BOUND) {
1683 			TAVOR_TNF_EXIT(tavor_mr_common_rereg);
1684 			return (IBT_MR_IN_USE);
1685 		} else {
1686 			cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: "
1687 			    "%08x\n", status);
1688 
1689 			/*
1690 			 * Call deregister and ensure that all current
1691 			 * resources get freed up
1692 			 */
1693 			if (tavor_mr_deregister(state, &mr,
1694 			    TAVOR_MR_DEREG_ALL, sleep) != DDI_SUCCESS) {
1695 				TAVOR_WARNING(state, "failed to deregister "
1696 				    "memory region");
1697 			}
1698 			TNF_PROBE_1(tavor_mr_common_rereg_hw2sw_mpt_cmd_fail,
1699 			    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1700 			TAVOR_TNF_EXIT(tavor_mr_common_rereg);
1701 			return (ibc_get_ci_failure(0));
1702 		}
1703 	}
1704 
1705 	/*
1706 	 * If we're changing the protection domain, then validate the new one
1707 	 */
1708 	if (flags & IBT_MR_CHANGE_PD) {
1709 
1710 		/* Check for valid PD handle pointer */
1711 		if (pd == NULL) {
1712 			mutex_exit(&mr->mr_lock);
1713 			/*
1714 			 * Call deregister and ensure that all current
1715 			 * resources get properly freed up. Unnecessary
1716 			 * here to attempt to regain software ownership
1717 			 * of the MPT entry as that has already been
1718 			 * done above.
1719 			 */
1720 			if (tavor_mr_deregister(state, &mr,
1721 			    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
1722 			    DDI_SUCCESS) {
1723 				TAVOR_WARNING(state, "failed to deregister "
1724 				    "memory region");
1725 			}
1726 			/* Set "status" and "errormsg" and goto failure */
1727 			TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
1728 			goto mrrereg_fail;
1729 		}
1730 
1731 		/* Use the new PD handle in all operations below */
1732 		pd_to_use = pd;
1733 
1734 	} else {
1735 		/* Use the current PD handle in all operations below */
1736 		pd_to_use = mr->mr_pdhdl;
1737 	}
1738 
1739 	/*
1740 	 * If we're changing access permissions, then validate the new ones
1741 	 */
1742 	if (flags & IBT_MR_CHANGE_ACCESS) {
1743 		/*
1744 		 * Validate the access flags.  Both remote write and remote
1745 		 * atomic require the local write flag to be set
1746 		 */
1747 		if (((flags & IBT_MR_ENABLE_REMOTE_WRITE) ||
1748 		    (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)) &&
1749 		    !(flags & IBT_MR_ENABLE_LOCAL_WRITE)) {
1750 			mutex_exit(&mr->mr_lock);
1751 			/*
1752 			 * Call deregister and ensure that all current
1753 			 * resources get properly freed up. Unnecessary
1754 			 * here to attempt to regain software ownership
1755 			 * of the MPT entry as that has already been
1756 			 * done above.
1757 			 */
1758 			if (tavor_mr_deregister(state, &mr,
1759 			    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
1760 			    DDI_SUCCESS) {
1761 				TAVOR_WARNING(state, "failed to deregister "
1762 				    "memory region");
1763 			}
1764 			/* Set "status" and "errormsg" and goto failure */
1765 			TAVOR_TNF_FAIL(IBT_MR_ACCESS_REQ_INVALID,
1766 			    "invalid access flags");
1767 			goto mrrereg_fail;
1768 		}
1769 
1770 		/*
1771 		 * Setup and validate the memory region access flags.  This
1772 		 * means translating the IBTF's enable flags into the access
1773 		 * flags that will be used in later operations.
1774 		 */
1775 		acc_flags_to_use = 0;
1776 		if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1777 			acc_flags_to_use |= IBT_MR_WINDOW_BIND;
1778 		if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1779 			acc_flags_to_use |= IBT_MR_LOCAL_WRITE;
1780 		if (flags & IBT_MR_ENABLE_REMOTE_READ)
1781 			acc_flags_to_use |= IBT_MR_REMOTE_READ;
1782 		if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1783 			acc_flags_to_use |= IBT_MR_REMOTE_WRITE;
1784 		if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1785 			acc_flags_to_use |= IBT_MR_REMOTE_ATOMIC;
1786 
1787 	} else {
1788 		acc_flags_to_use = mr->mr_accflag;
1789 	}
1790 
1791 	/*
1792 	 * If we're modifying the translation, then figure out whether
1793 	 * we can reuse the current MTT resources.  This means calling
1794 	 * tavor_mr_rereg_xlat_helper() which does most of the heavy lifting
1795 	 * for the reregistration.  If the current memory region contains
1796 	 * sufficient MTT entries for the new regions, then it will be
1797 	 * reused and filled in.  Otherwise, new entries will be allocated,
1798 	 * the old ones will be freed, and the new entries will be filled
1799 	 * in.  Note:  If we're not modifying the translation, then we
1800 	 * should already have all the information we need to update the MPT.
1801 	 * Also note: If tavor_mr_rereg_xlat_helper() fails, it will return
1802 	 * a "dereg_level" which is the level of cleanup that needs to be
1803 	 * passed to tavor_mr_deregister() to finish the cleanup.
1804 	 */
1805 	if (flags & IBT_MR_CHANGE_TRANSLATION) {
1806 		status = tavor_mr_rereg_xlat_helper(state, mr, bind, op,
1807 		    &mtt_addr_to_use, sleep, &dereg_level);
1808 		if (status != DDI_SUCCESS) {
1809 			mutex_exit(&mr->mr_lock);
1810 			/*
1811 			 * Call deregister and ensure that all resources get
1812 			 * properly freed up.
1813 			 */
1814 			if (tavor_mr_deregister(state, &mr, dereg_level,
1815 			    sleep) != DDI_SUCCESS) {
1816 				TAVOR_WARNING(state, "failed to deregister "
1817 				    "memory region");
1818 			}
1819 
1820 			/* Set "status" and "errormsg" and goto failure */
1821 			TAVOR_TNF_FAIL(status, "failed rereg helper");
1822 			goto mrrereg_fail;
1823 		}
1824 		vaddr_to_use = mr->mr_bindinfo.bi_addr;
1825 		len_to_use   = mr->mr_bindinfo.bi_len;
1826 	} else {
1827 		mtt_addr_to_use = (((uint64_t)mpt_entry.mttseg_addr_h << 32) |
1828 		    ((uint64_t)mpt_entry.mttseg_addr_l << 6));
1829 		vaddr_to_use = mr->mr_bindinfo.bi_addr;
1830 		len_to_use   = mr->mr_bindinfo.bi_len;
1831 	}
1832 
1833 	/*
1834 	 * Calculate new keys (LKey, RKey) from the MPT index.  Just as when
1835 	 * the region was first registered, each key is formed from
1836 	 * "constrained" bits and "unconstrained" bits.  Note:  If no remote
1837 	 * access is required, then the RKey value is not filled in.  Otherwise
1838 	 * both RKey and LKey are given the same value.
1839 	 */
1840 	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
1841 	if ((acc_flags_to_use & IBT_MR_REMOTE_READ) ||
1842 	    (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ||
1843 	    (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC)) {
1844 		mr->mr_rkey = mr->mr_lkey;
1845 	}
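	/*
	 * Illustrative only -- one plausible composition, assuming the low
	 * bits are "constrained" by the MPT index and the remaining high
	 * bits are "unconstrained" (the exact split is private to
	 * tavor_mr_keycalc()):
	 *
	 *	key = (unconstrained << 24) | (mpt->tr_indx & 0xFFFFFF);
	 */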
1846 
1847 	/*
1848 	 * Update the MPT entry with the new information.  Some of this
1849 	 * information is retained from the previous operation, some of
1850 	 * it is new based on request.
1851 	 */
1852 	mpt_entry.en_bind = (acc_flags_to_use & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1853 	mpt_entry.atomic  = (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1854 	mpt_entry.rw	  = (acc_flags_to_use & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1855 	mpt_entry.rr	  = (acc_flags_to_use & IBT_MR_REMOTE_READ)   ? 1 : 0;
1856 	mpt_entry.lw	  = (acc_flags_to_use & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1857 	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
1858 	mpt_entry.mem_key	= mr->mr_lkey;
1859 	mpt_entry.pd		= pd_to_use->pd_pdnum;
1860 	mpt_entry.start_addr	= vaddr_to_use;
1861 	mpt_entry.reg_win_len	= len_to_use;
1862 	mpt_entry.mttseg_addr_h = mtt_addr_to_use >> 32;
1863 	mpt_entry.mttseg_addr_l = mtt_addr_to_use >> 6;
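	/*
	 * Note: the high/low split above is the inverse of the earlier
	 * reconstruction ((mttseg_addr_h << 32) | (mttseg_addr_l << 6)),
	 * so dropping the low six bits assumes a 64-byte aligned MTT
	 * segment address.  Similarly, "page_sz" is the log2 page size
	 * expressed relative to 4Kb (hence the 0xC subtraction above).
	 */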
1864 
1865 	/*
1866 	 * Write the updated MPT entry to hardware
1867 	 *
1868 	 * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
1869 	 * against holding the lock around this rereg call in all contexts.
1870 	 */
1871 	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1872 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
1873 	if (status != TAVOR_CMD_SUCCESS) {
1874 		mutex_exit(&mr->mr_lock);
1875 		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
1876 		    status);
1877 		/*
1878 		 * Call deregister and ensure that all current resources get
1879 		 * properly freed up. Unnecessary here to attempt to regain
1880 		 * software ownership of the MPT entry as that has already
1881 		 * been done above.
1882 		 */
1883 		if (tavor_mr_deregister(state, &mr,
1884 		    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) != DDI_SUCCESS) {
1885 			TAVOR_WARNING(state, "failed to deregister memory "
1886 			    "region");
1887 		}
1888 		TNF_PROBE_1(tavor_mr_common_rereg_sw2hw_mpt_cmd_fail,
1889 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1890 		TAVOR_TNF_EXIT(tavor_mr_common_rereg);
1891 		return (ibc_get_ci_failure(0));
1892 	}
1893 
1894 	/*
1895 	 * If we're changing the PD, then update the reference counts now.
1896 	 * This means decrementing the reference count on the old PD and
1897 	 * incrementing the reference count on the new PD.
1898 	 */
1899 	if (flags & IBT_MR_CHANGE_PD) {
1900 		tavor_pd_refcnt_dec(mr->mr_pdhdl);
1901 		tavor_pd_refcnt_inc(pd);
1902 	}
1903 
1904 	/*
1905 	 * Update the contents of the Tavor Memory Region handle to reflect
1906 	 * what has been changed.
1907 	 */
1908 	mr->mr_pdhdl	  = pd_to_use;
1909 	mr->mr_accflag	  = acc_flags_to_use;
1910 	mr->mr_is_umem	  = 0;
1911 	mr->mr_umemcookie = NULL;
1912 
1913 	/* New MR handle is same as the old */
1914 	*mrhdl_new = mr;
1915 	mutex_exit(&mr->mr_lock);
1916 
1917 	TAVOR_TNF_EXIT(tavor_mr_common_rereg);
1918 	return (DDI_SUCCESS);
1919 
1920 mrrereg_fail:
1921 	TNF_PROBE_1(tavor_mr_common_rereg_fail, TAVOR_TNF_ERROR, "",
1922 	    tnf_string, msg, errormsg);
1923 	TAVOR_TNF_EXIT(tavor_mr_common_rereg);
1924 	return (status);
1925 }
1926 
1927 
1928 /*
1929  * tavor_mr_rereg_xlat_helper
1930  *    Context: Can be called from interrupt or base context.
1931  *    Note: This routine expects the "mr_lock" to be held when it
1932  *    is called.  Upon returning failure, this routine passes information
1933  *    about what "dereg_level" should be passed to tavor_mr_deregister().
1934  */
1935 static int
1936 tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
1937     tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
1938     uint_t sleep, uint_t *dereg_level)
1939 {
1940 	tavor_rsrc_pool_info_t	*rsrc_pool;
1941 	tavor_rsrc_t		*mtt, *mtt_refcnt;
1942 	tavor_sw_refcnt_t	*swrc_old, *swrc_new;
1943 	ddi_dma_handle_t	dmahdl;
1944 	uint64_t		nummtt_needed, nummtt_in_currrsrc, max_sz;
1945 	uint64_t		mtt_ddrbaseaddr;
1946 	uint_t			mtt_pgsize_bits, bind_type, reuse_dmahdl;
1947 	int			status;
1948 	char			*errormsg;
1949 
1950 	TAVOR_TNF_ENTER(tavor_mr_rereg_xlat_helper);
1951 
1952 	ASSERT(MUTEX_HELD(&mr->mr_lock));
1953 
1954 	/*
1955 	 * Check the "options" flag.  Currently this flag tells the driver
1956 	 * whether or not the region should be bound normally (i.e. with
1957 	 * entries written into the PCI IOMMU) or whether it should be
1958 	 * registered to bypass the IOMMU.
1959 	 */
1960 	if (op == NULL) {
1961 		bind_type = TAVOR_BINDMEM_NORMAL;
1962 	} else {
1963 		bind_type = op->mro_bind_type;
1964 	}
1965 
1966 	/*
1967 	 * Check for invalid length.  Check whether the length is zero or
1968 	 * larger than the maximum configured value, and return an error
1969 	 * if it is.
1970 	 */
1971 	max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
1972 	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
1973 		/*
1974 		 * Deregister will be called upon returning failure from this
1975 		 * routine. This will ensure that all current resources get
1976 		 * properly freed up. Unnecessary to attempt to regain
1977 		 * software ownership of the MPT entry as that has already
1978 		 * been done above (in tavor_mr_reregister())
1979 		 */
1980 		*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT;
1981 
1982 		/* Set "status" and "errormsg" and goto failure */
1983 		TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
1984 		goto mrrereghelp_fail;
1985 	}
1986 
1987 	/*
1988 	 * Determine the number of pages necessary for new region and the
1989 	 * number of pages supported by the current MTT resources
1990 	 */
1991 	nummtt_needed = tavor_mr_nummtt_needed(state, bind, &mtt_pgsize_bits);
1992 	nummtt_in_currrsrc = mr->mr_mttrsrcp->tr_len >> TAVOR_MTT_SIZE_SHIFT;
1993 
1994 	/*
1995 	 * Depending on whether we have enough pages or not, the next step is
1996 	 * to fill in a set of MTT entries that reflect the new mapping.  In
1997 	 * the first case below, we already have enough entries.  This means
1998 	 * we need to unbind the memory from the previous mapping, bind the
1999 	 * memory for the new mapping, write the new MTT entries, and update
2000 	 * the mr to reflect the changes.
2001 	 * In the second case below, we do not have enough entries in the
2002 	 * current mapping.  So, in this case, we need not only to unbind the
2003 	 * current mapping, but we need to free up the MTT resources associated
2004 	 * with that mapping.  After we've successfully done that, we continue
2005 	 * by binding the new memory, allocating new MTT entries, writing the
2006 	 * new MTT entries, and updating the mr to reflect the changes.
2007 	 */
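	/*
	 * In short, the two cases below reduce to:
	 *
	 *	MTT not shared and new region fits:  rebind the memory and
	 *	    reuse the current MTT resources
	 *	MTT shared or new region too large:  bind the new memory and
	 *	    allocate fresh MTT (and, if shared, refcnt) resources
	 */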
2008 
2009 	/*
2010 	 * If this region is being shared (i.e. MTT refcount != 1), then we
2011 	 * can't reuse the current MTT resources regardless of their size.
2012 	 * Instead we'll need to alloc new ones (below) just as if there
2013 	 * hadn't been enough room in the current entries.
2014 	 */
2015 	swrc_old = (tavor_sw_refcnt_t *)mr->mr_mttrefcntp->tr_addr;
2016 	if (TAVOR_MTT_IS_NOT_SHARED(swrc_old) &&
2017 	    (nummtt_needed <= nummtt_in_currrsrc)) {
2018 
2019 		/*
2020 		 * Unbind the old mapping for this memory region, but retain
2021 		 * the ddi_dma_handle_t (if possible) for reuse in the bind
2022 		 * operation below.  Note:  If the original memory region was
2023 		 * bound for IOMMU bypass and the new region cannot use
2024 		 * bypass, then a new DMA handle will be necessary.
2025 		 */
2026 		if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2027 			mr->mr_bindinfo.bi_free_dmahdl = 0;
2028 			tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2029 			dmahdl = mr->mr_bindinfo.bi_dmahdl;
2030 			reuse_dmahdl = 1;
2031 		} else {
2032 			tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2033 			dmahdl = NULL;
2034 			reuse_dmahdl = 0;
2035 		}
2036 
2037 		/*
2038 		 * Bind the new memory and determine the mapped addresses.
2039 		 * As described, this routine and tavor_mr_fast_mtt_write()
2040 		 * do the majority of the work for the memory registration
2041 		 * operations.  Note:  When we successfully finish the binding,
2042 		 * we will set the "bi_free_dmahdl" flag to indicate that
2043 		 * even though we may have reused the ddi_dma_handle_t we do
2044 		 * wish it to be freed up at some later time.  Note also that
2045 		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2046 		 */
2047 		bind->bi_bypass	= bind_type;
2048 		status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
2049 		if (status != DDI_SUCCESS) {
2050 			if (reuse_dmahdl) {
2051 				ddi_dma_free_handle(&dmahdl);
2052 			}
2053 
2054 			/*
2055 			 * Deregister will be called upon returning failure
2056 			 * from this routine. This will ensure that all
2057 			 * current resources get properly freed up.
2058 			 * Unnecessary to attempt to regain software ownership
2059 			 * of the MPT entry as that has already been done
2060 			 * above (in tavor_mr_reregister()).  Also unnecessary
2061 			 * to attempt to unbind the memory.
2062 			 */
2063 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2064 
2065 			/* Set "status" and "errormsg" and goto failure */
2066 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
2067 			goto mrrereghelp_fail;
2068 		}
2069 		if (reuse_dmahdl) {
2070 			bind->bi_free_dmahdl = 1;
2071 		}
2072 
2073 		/*
2074 		 * Using the new mapping, but reusing the current MTT
2075 		 * resources, write the updated entries to MTT
2076 		 */
2077 		mtt    = mr->mr_mttrsrcp;
2078 		status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
2079 		if (status != DDI_SUCCESS) {
2080 			/*
2081 			 * Deregister will be called upon returning failure
2082 			 * from this routine. This will ensure that all
2083 			 * current resources get properly freed up.
2084 			 * Unnecessary to attempt to regain software ownership
2085 			 * of the MPT entry as that has already been done
2086 			 * above (in tavor_mr_reregister()).  Also unnecessary
2087 			 * to attempt to unbind the memory.
2088 			 *
2089 			 * But we do need to unbind the newly bound memory
2090 			 * before returning.
2091 			 */
2092 			tavor_mr_mem_unbind(state, bind);
2093 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2094 
2095 			/* Set "status" and "errormsg" and goto failure */
2096 			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
2097 			    "failed write mtt");
2098 			goto mrrereghelp_fail;
2099 		}
2100 
2101 		/* Put the updated information into the Mem Region handle */
2102 		mr->mr_bindinfo	  = *bind;
2103 		mr->mr_logmttpgsz = mtt_pgsize_bits;
2104 
2105 	} else {
2106 		/*
2107 		 * Check if the memory region MTT is shared by any other MRs.
2108 		 * Since the resource may be shared between multiple memory
2109 		 * regions (as a result of a "RegisterSharedMR()" verb) it is
2110 		 * important that we not unbind any resources prematurely.
2111 		 */
2112 		if (!TAVOR_MTT_IS_SHARED(swrc_old)) {
2113 			/*
2114 			 * Unbind the old mapping for this memory region, but
2115 			 * retain the ddi_dma_handle_t for reuse in the bind
2116 			 * operation below. Note: This can only be done here
2117 			 * because the region being reregistered is not
2118 			 * currently shared.  Also, if the original memory
2119 			 * region was bound for IOMMU bypass and the new region
2120 			 * cannot use bypass, then a new DMA handle will be
2121 			 * necessary.
2122 			 */
2123 			if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2124 				mr->mr_bindinfo.bi_free_dmahdl = 0;
2125 				tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2126 				dmahdl = mr->mr_bindinfo.bi_dmahdl;
2127 				reuse_dmahdl = 1;
2128 			} else {
2129 				tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2130 				dmahdl = NULL;
2131 				reuse_dmahdl = 0;
2132 			}
2133 		} else {
2134 			dmahdl = NULL;
2135 			reuse_dmahdl = 0;
2136 		}
2137 
2138 		/*
2139 		 * Bind the new memory and determine the mapped addresses.
2140 		 * As described, this routine and tavor_mr_fast_mtt_write()
2141 		 * do the majority of the work for the memory registration
2142 		 * operations.  Note:  When we successfully finish the binding,
2143 		 * we will set the "bi_free_dmahdl" flag to indicate that
2144 		 * even though we may have reused the ddi_dma_handle_t we do
2145 		 * wish it to be freed up at some later time.  Note also that
2146 		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2147 		 */
2148 		bind->bi_bypass	= bind_type;
2149 		status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
2150 		if (status != DDI_SUCCESS) {
2151 			if (reuse_dmahdl) {
2152 				ddi_dma_free_handle(&dmahdl);
2153 			}
2154 
2155 			/*
2156 			 * Deregister will be called upon returning failure
2157 			 * from this routine. This will ensure that all
2158 			 * current resources get properly freed up.
2159 			 * Unnecessary to attempt to regain software ownership
2160 			 * of the MPT entry as that has already been done
2161 			 * above (in tavor_mr_reregister()).  Also unnecessary
2162 			 * to attempt to unbind the memory.
2163 			 */
2164 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2165 
2166 			/* Set "status" and "errormsg" and goto failure */
2167 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
2168 			goto mrrereghelp_fail;
2169 		}
2170 		if (reuse_dmahdl) {
2171 			bind->bi_free_dmahdl = 1;
2172 		}
2173 
2174 		/*
2175 		 * Allocate the new MTT entries resource
2176 		 */
2177 		status = tavor_rsrc_alloc(state, TAVOR_MTT,
2178 		    TAVOR_NUMMTT_TO_MTTSEG(nummtt_needed), sleep, &mtt);
2179 		if (status != DDI_SUCCESS) {
2180 			/*
2181 			 * Deregister will be called upon returning failure
2182 			 * from this routine. This will ensure that all
2183 			 * current resources get properly freed up.
2184 			 * Unnecessary to attempt to regain software ownership
2185 			 * of the MPT entry as that has already been done
2186 			 * above (in tavor_mr_reregister()).  Also unnecessary
2187 			 * to attempt to unbind the memory.
2188 			 *
2189 			 * But we do need to unbind the newly bound memory
2190 			 * before returning.
2191 			 */
2192 			tavor_mr_mem_unbind(state, bind);
2193 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2194 
2195 			/* Set "status" and "errormsg" and goto failure */
2196 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
2197 			goto mrrereghelp_fail;
2198 		}
2199 
2200 		/*
2201 		 * Allocate MTT reference count (to track shared memory
2202 		 * regions).  As mentioned elsewhere above, this reference
2203 		 * count resource may never be used on the given memory region,
2204 		 * but if it is ever later registered as a "shared" memory
2205 		 * region then this resource will be necessary.  Note:  This
2206 		 * is only necessary here if the existing memory region is
2207 		 * already being shared (because otherwise we already have
2208 		 * a useable reference count resource).
2209 		 */
2210 		if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2211 			status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1,
2212 			    sleep, &mtt_refcnt);
2213 			if (status != DDI_SUCCESS) {
2214 				/*
2215 				 * Deregister will be called upon returning
2216 				 * failure from this routine. This will ensure
2217 				 * that all current resources get properly
2218 				 * freed up.  Unnecessary to attempt to regain
2219 				 * software ownership of the MPT entry as that
2220 				 * has already been done above (in
2221 				 * tavor_mr_reregister()).  Also unnecessary
2222 				 * to attempt to unbind the memory.
2223 				 *
2224 				 * But we need to unbind the newly bound
2225 				 * memory and free up the newly allocated MTT
2226 				 * entries before returning.
2227 				 */
2228 				tavor_mr_mem_unbind(state, bind);
2229 				tavor_rsrc_free(state, &mtt);
2230 				*dereg_level =
2231 				    TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2232 
2233 				/* Set "status"/"errormsg", goto failure */
2234 				TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
2235 				    "failed reference count");
2236 				goto mrrereghelp_fail;
2237 			}
2238 			swrc_new = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
2239 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_new))
2240 			TAVOR_MTT_REFCNT_INIT(swrc_new);
2241 		} else {
2242 			mtt_refcnt = mr->mr_mttrefcntp;
2243 		}
2244 
2245 		/*
2246 		 * Using the new mapping and the new MTT resources, write the
2247 		 * updated entries to MTT
2248 		 */
2249 		status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
2250 		if (status != DDI_SUCCESS) {
2251 			/*
2252 			 * Deregister will be called upon returning failure
2253 			 * from this routine. This will ensure that all
2254 			 * current resources get properly freed up.
2255 			 * Unnecessary to attempt to regain software ownership
2256 			 * of the MPT entry as that has already been done
2257 			 * above (in tavor_mr_reregister()).  Also unnecessary
2258 			 * to attempt to unbind the memory.
2259 			 *
2260 			 * But we need to unbind the newly bound memory,
2261 			 * free up the newly allocated MTT entries, and
2262 			 * (possibly) free the new MTT reference count
2263 			 * resource before returning.
2264 			 */
2265 			if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2266 				tavor_rsrc_free(state, &mtt_refcnt);
2267 			}
2268 			tavor_mr_mem_unbind(state, bind);
2269 			tavor_rsrc_free(state, &mtt);
2270 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2271 
2272 			/* Set "status" and "errormsg" and goto failure */
2273 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed write mtt");
2274 			goto mrrereghelp_fail;
2275 		}
2276 
2277 		/*
2278 		 * Check if the memory region MTT is shared by any other MRs.
2279 		 * Since the resource may be shared between multiple memory
2280 		 * regions (as a result of a "RegisterSharedMR()" verb) it is
2281 		 * important that we not free up any resources prematurely.
2282 		 */
2283 		if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2284 			/* Decrement MTT reference count for "old" region */
2285 			(void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
2286 		} else {
2287 			/* Free up the old MTT entries resource */
2288 			tavor_rsrc_free(state, &mr->mr_mttrsrcp);
2289 		}
2290 
2291 		/* Put the updated information into the mrhdl */
2292 		mr->mr_bindinfo	  = *bind;
2293 		mr->mr_logmttpgsz = mtt_pgsize_bits;
2294 		mr->mr_mttrsrcp   = mtt;
2295 		mr->mr_mttrefcntp = mtt_refcnt;
2296 	}
2297 
2298 	/*
2299 	 * Calculate and return the updated MTT address (in the DDR address
2300 	 * space).  This will be used by the caller (tavor_mr_common_rereg())
2301 	 * in the updated MPT entry.
2302 	 */
2303 	rsrc_pool	= &state->ts_rsrc_hdl[TAVOR_MTT];
2304 	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
2305 	*mtt_addr	= mtt_ddrbaseaddr + (mtt->tr_indx <<
2306 	    TAVOR_MTT_SIZE_SHIFT);
2307 
2308 	TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
2309 	return (DDI_SUCCESS);
2310 
2311 mrrereghelp_fail:
2312 	TNF_PROBE_1(tavor_mr_rereg_xlat_helper_fail, TAVOR_TNF_ERROR, "",
2313 	    tnf_string, msg, errormsg);
2314 	TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
2315 	return (status);
2316 }
2317 
2318 
2319 /*
2320  * tavor_mr_nummtt_needed()
2321  *    Context: Can be called from interrupt or base context.
2322  */
2323 /* ARGSUSED */
2324 static uint64_t
2325 tavor_mr_nummtt_needed(tavor_state_t *state, tavor_bind_info_t *bind,
2326     uint_t *mtt_pgsize_bits)
2327 {
2328 	uint64_t	pg_offset_mask;
2329 	uint64_t	pg_offset, tmp_length;
2330 
2331 	/*
2332 	 * For now we specify the page size as 8Kb (the default page size for
2333 	 * the sun4u architecture), or 4Kb for x86.  Figure out optimal page
2334 	 * size by examining the dmacookies XXX
2335 	 */
2336 	*mtt_pgsize_bits = PAGESHIFT;
2337 
2338 	pg_offset_mask = ((uint64_t)1 << *mtt_pgsize_bits) - 1;
2339 	pg_offset = bind->bi_addr & pg_offset_mask;
2340 	tmp_length = pg_offset + (bind->bi_len - 1);
2341 	return ((tmp_length >> *mtt_pgsize_bits) + 1);
2342 }
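/*
 * Worked example for the routine above (assuming PAGESHIFT == 12, i.e.
 * 4Kb pages): for bi_addr == 0x1800 and bi_len == 0x3000, pg_offset is
 * 0x800 and tmp_length is 0x37FF, so the routine returns
 * (0x37FF >> 12) + 1 == 4 -- the region straddles four 4Kb pages.
 */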
2343 
2344 
2345 /*
2346  * tavor_mr_mem_bind()
2347  *    Context: Can be called from interrupt or base context.
2348  */
2349 static int
2350 tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
2351     ddi_dma_handle_t dmahdl, uint_t sleep)
2352 {
2353 	ddi_dma_attr_t	dma_attr;
2354 	int		(*callback)(caddr_t);
2355 	uint_t		dma_xfer_mode;
2356 	int		status;
2357 
2358 	/* bi_type must be set to a meaningful value to get a bind handle */
2359 	ASSERT(bind->bi_type == TAVOR_BINDHDL_VADDR ||
2360 	    bind->bi_type == TAVOR_BINDHDL_BUF ||
2361 	    bind->bi_type == TAVOR_BINDHDL_UBUF);
2362 
2363 	TAVOR_TNF_ENTER(tavor_mr_mem_bind);
2364 
2365 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2366 
2367 	/* Set the callback flag appropriately */
2368 	callback = (sleep == TAVOR_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT;
2369 
2370 	/* Determine whether to map STREAMING or CONSISTENT */
2371 	dma_xfer_mode = (bind->bi_flags & IBT_MR_NONCOHERENT) ?
2372 	    DDI_DMA_STREAMING : DDI_DMA_CONSISTENT;
2373 
2374 	/*
2375 	 * Initialize many of the default DMA attributes.  Then, if we're
2376 	 * bypassing the IOMMU, set the DDI_DMA_FORCE_PHYSICAL flag.
2377 	 */
2378 	if (dmahdl == NULL) {
2379 		tavor_dma_attr_init(&dma_attr);
2380 #ifdef	__sparc
2381 		/*
2382 		 * First, disable streaming and switch to consistent if
2383 		 * configured to do so and IOMMU BYPASS is enabled.
2384 		 */
2385 		if (state->ts_cfg_profile->cp_disable_streaming_on_bypass &&
2386 		    dma_xfer_mode == DDI_DMA_STREAMING &&
2387 		    bind->bi_bypass == TAVOR_BINDMEM_BYPASS) {
2388 			dma_xfer_mode = DDI_DMA_CONSISTENT;
2389 		}
2390 
2391 		/*
2392 		 * Then, set DDI_DMA_FORCE_PHYSICAL only for a consistent
2393 		 * mapping; if streaming is still specified, "bypass" is not allowed.
2394 		 */
2395 		if ((dma_xfer_mode == DDI_DMA_CONSISTENT) &&
2396 		    (bind->bi_bypass == TAVOR_BINDMEM_BYPASS)) {
2397 			dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2398 		}
2399 #endif
2400 		/* Allocate a DMA handle for the binding */
2401 		status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr,
2402 		    callback, NULL, &bind->bi_dmahdl);
2403 		if (status != DDI_SUCCESS) {
2404 			TNF_PROBE_0(tavor_mr_mem_bind_dmahdl_fail,
2405 			    TAVOR_TNF_ERROR, "");
2406 			TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2407 			return (status);
2408 		}
2409 		bind->bi_free_dmahdl = 1;
2410 
2411 	} else  {
2412 		bind->bi_dmahdl = dmahdl;
2413 		bind->bi_free_dmahdl = 0;
2414 	}
2415 
2416 	/*
2417 	 * Bind the memory to get the PCI mapped addresses.  The decision
2418 	 * to call ddi_dma_addr_bind_handle() or ddi_dma_buf_bind_handle()
2419 	 * is determined by the "bi_type" flag.  Note: if the bind operation
2420 	 * fails then we have to free up the DMA handle and return error.
2421 	 */
2422 	if (bind->bi_type == TAVOR_BINDHDL_VADDR) {
2423 		status = ddi_dma_addr_bind_handle(bind->bi_dmahdl, NULL,
2424 		    (caddr_t)(uintptr_t)bind->bi_addr, bind->bi_len,
2425 		    (DDI_DMA_RDWR | dma_xfer_mode), callback, NULL,
2426 		    &bind->bi_dmacookie, &bind->bi_cookiecnt);
2427 	} else { /* TAVOR_BINDHDL_BUF || TAVOR_BINDHDL_UBUF */
2428 		status = ddi_dma_buf_bind_handle(bind->bi_dmahdl,
2429 		    bind->bi_buf, (DDI_DMA_RDWR | dma_xfer_mode), callback,
2430 		    NULL, &bind->bi_dmacookie, &bind->bi_cookiecnt);
2431 	}
2432 
2433 	if (status != DDI_DMA_MAPPED) {
2434 		if (bind->bi_free_dmahdl != 0) {
2435 			ddi_dma_free_handle(&bind->bi_dmahdl);
2436 		}
2437 		TNF_PROBE_0(tavor_mr_mem_bind_dmabind_fail, TAVOR_TNF_ERROR,
2438 		    "");
2439 		TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2440 		return (status);
2441 	}
2442 
2443 	TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2444 	return (DDI_SUCCESS);
2445 }
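/*
 * A minimal sketch of a caller-side setup for the routine above
 * (hypothetical values; real callers fill in the bind struct from their
 * registration attributes):
 *
 *	tavor_bind_info_t	bind;
 *
 *	bind.bi_type   = TAVOR_BINDHDL_VADDR;
 *	bind.bi_addr   = (uint64_t)(uintptr_t)vaddr;
 *	bind.bi_len    = (uint64_t)len;
 *	bind.bi_flags  = mr_flags;
 *	bind.bi_bypass = TAVOR_BINDMEM_NORMAL;
 *	if (tavor_mr_mem_bind(state, &bind, NULL, TAVOR_SLEEP) != DDI_SUCCESS)
 *		return (IBT_INSUFF_RESOURCE);
 */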
2446 
2447 
2448 /*
2449  * tavor_mr_mem_unbind()
2450  *    Context: Can be called from interrupt or base context.
2451  */
2452 static void
2453 tavor_mr_mem_unbind(tavor_state_t *state, tavor_bind_info_t *bind)
2454 {
2455 	int	status;
2456 
2457 	TAVOR_TNF_ENTER(tavor_mr_mem_unbind);
2458 
2459 	/*
2460 	 * In the TAVOR_BINDHDL_UBUF case, the memory that bi_buf points to
2461 	 * was allocated internally by ddi_umem_iosetup(), so it must be
2462 	 * freed here.  Reset bi_type to TAVOR_BINDHDL_NONE so that it is
2463 	 * not freed again later.
2464 	 */
2465 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2466 	if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
2467 		freerbuf(bind->bi_buf);
2468 		bind->bi_type = TAVOR_BINDHDL_NONE;
2469 	}
2470 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
2471 
2472 	/*
2473 	 * Unbind the DMA memory for the region
2474 	 *
2475 	 * Note: The only way ddi_dma_unbind_handle() currently
2476 	 * can return an error is if the handle passed in is invalid.
2477 	 * Since this should never happen, we choose to return void
2478 	 * from this function!  If this does return an error, however,
2479 	 * then we print a warning message to the console.
2480 	 */
2481 	status = ddi_dma_unbind_handle(bind->bi_dmahdl);
2482 	if (status != DDI_SUCCESS) {
2483 		TAVOR_WARNING(state, "failed to unbind DMA mapping");
2484 		TNF_PROBE_0(tavor_mr_mem_unbind_dmaunbind_fail,
2485 		    TAVOR_TNF_ERROR, "");
2486 		TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2487 		return;
2488 	}
2489 
2490 	/* Free up the DMA handle */
2491 	if (bind->bi_free_dmahdl != 0) {
2492 		ddi_dma_free_handle(&bind->bi_dmahdl);
2493 	}
2494 
2495 	TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2496 }
2497 
2498 
2499 /*
2500  * tavor_mr_fast_mtt_write()
2501  *    Context: Can be called from interrupt or base context.
2502  */
2503 static int
2504 tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
2505     uint32_t mtt_pgsize_bits)
2506 {
2507 	ddi_dma_cookie_t	dmacookie;
2508 	uint_t			cookie_cnt;
2509 	uint64_t		*mtt_table;
2510 	uint64_t		mtt_entry;
2511 	uint64_t		addr, endaddr;
2512 	uint64_t		pagesize;
2513 	int			i;
2514 
2515 	TAVOR_TNF_ENTER(tavor_mr_fast_mtt_write);
2516 
2517 	/* Calculate page size from the suggested value passed in */
2518 	pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2519 
2520 	/*
2521 	 * Walk the "cookie list" and fill in the MTT table entries
2522 	 */
2523 	i = 0;
2524 	mtt_table  = (uint64_t *)mtt->tr_addr;
2525 	dmacookie  = bind->bi_dmacookie;
2526 	cookie_cnt = bind->bi_cookiecnt;
2527 	while (cookie_cnt-- > 0) {
2528 		addr	= dmacookie.dmac_laddress;
2529 		endaddr = addr + (dmacookie.dmac_size - 1);
2530 		addr	= addr & ~((uint64_t)pagesize - 1);
2531 		while (addr <= endaddr) {
2532 			/*
2533 			 * Fill in the mapped addresses (calculated above) and
2534 			 * set TAVOR_MTT_ENTRY_PRESET flag for each MTT entry.
2535 			 */
2536 			mtt_entry = addr | TAVOR_MTT_ENTRY_PRESET;
2537 			ddi_put64(mtt->tr_acchdl, &mtt_table[i], mtt_entry);
2538 			addr += pagesize;
2539 			i++;
2540 
2541 			if (addr == 0) {
2542 				static int do_once = 1;
2543 				_NOTE(SCHEME_PROTECTS_DATA("safe sharing",
2544 				    do_once))
2545 				if (do_once) {
2546 					do_once = 0;
2547 					cmn_err(CE_NOTE, "probable error in "
2548 					    "dma_cookie address from caller\n");
2549 				}
2550 				break;
2551 			}
2552 		}
2553 
2554 		/*
2555 		 * When we've reached the end of the current DMA cookie,
2556 		 * jump to the next cookie (if there are more)
2557 		 */
2558 		if (cookie_cnt != 0) {
2559 			ddi_dma_nextcookie(bind->bi_dmahdl, &dmacookie);
2560 		}
2561 	}
2562 
2563 	TAVOR_TNF_EXIT(tavor_mr_fast_mtt_write);
2564 	return (DDI_SUCCESS);
2565 }
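/*
 * Worked example for the routine above: with an 8Kb page size
 * (mtt_pgsize_bits == 13), a single cookie with dmac_laddress == 0x103000
 * and dmac_size == 0x3000 covers addresses 0x103000-0x105FFF.  The start
 * is rounded down to 0x102000, so two MTT entries are written (for
 * 0x102000 and 0x104000), each tagged with TAVOR_MTT_ENTRY_PRESET.
 */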
2566 
2567 /*
2568  * tavor_mtt_refcnt_inc()
2569  *    Context: Can be called from interrupt or base context.
2570  */
2571 static int
2572 tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc)
2573 {
2574 	tavor_sw_refcnt_t *rc;
2575 	uint32_t	  cnt;
2576 
2577 	rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
2578 
2579 	/* Increment the MTT's reference count */
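	/* Note: the value returned is the count prior to the increment */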
2580 	mutex_enter(&rc->swrc_lock);
2581 	TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_inc, TAVOR_TNF_TRACE, "",
2582 	    tnf_uint, refcnt, rc->swrc_refcnt);
2583 	cnt = rc->swrc_refcnt++;
2584 	mutex_exit(&rc->swrc_lock);
2585 
2586 	return (cnt);
2587 }
2588 
2589 
2590 /*
2591  * tavor_mtt_refcnt_dec()
2592  *    Context: Can be called from interrupt or base context.
2593  */
2594 static int
2595 tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc)
2596 {
2597 	tavor_sw_refcnt_t *rc;
2598 	uint32_t	  cnt;
2599 
2600 	rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
2601 
2602 	/* Decrement the MTT's reference count */
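	/* Note: the value returned is the count after the decrement */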
2603 	mutex_enter(&rc->swrc_lock);
2604 	cnt = --rc->swrc_refcnt;
2605 	TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_dec, TAVOR_TNF_TRACE, "",
2606 	    tnf_uint, refcnt, rc->swrc_refcnt);
2607 	mutex_exit(&rc->swrc_lock);
2608 
2609 	return (cnt);
2610 }
2611