xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/tavor/tavor_mr.c (revision 9e39c5ba00a55fa05777cc94b148296af305e135)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * tavor_mr.c
29  *    Tavor Memory Region/Window Routines
30  *
31  *    Implements all the routines necessary to provide the requisite memory
32  *    registration verbs.  These include operations like RegisterMemRegion(),
33  *    DeregisterMemRegion(), ReregisterMemRegion, RegisterSharedMemRegion,
34  *    etc., that affect Memory Regions.  It also includes the verbs that
35  *    affect Memory Windows, including AllocMemWindow(), FreeMemWindow(),
36  *    and QueryMemWindow().
37  */
38 
39 #include <sys/types.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/modctl.h>
44 #include <sys/esunddi.h>
45 
46 #include <sys/ib/adapters/tavor/tavor.h>
47 
48 
49 /*
50  * Used by tavor_mr_keycalc() below to fill in the "unconstrained" portion
51  * of Tavor memory keys (LKeys and RKeys)
52  */
53 static uint_t tavor_debug_memkey_cnt = 0x00000000;
54 
55 static int tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
56     tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op);
57 static int tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
58     tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
59     tavor_mr_options_t *op);
60 static int tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
61     tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
62     uint_t sleep, uint_t *dereg_level);
63 static uint64_t tavor_mr_nummtt_needed(tavor_state_t *state,
64     tavor_bind_info_t *bind, uint_t *mtt_pgsize);
65 static int tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
66     ddi_dma_handle_t dmahdl, uint_t sleep);
67 static void tavor_mr_mem_unbind(tavor_state_t *state,
68     tavor_bind_info_t *bind);
69 static int tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
70     uint32_t mtt_pgsize_bits);
71 static int tavor_mr_fast_mtt_write_fmr(tavor_rsrc_t *mtt,
72     ibt_pmr_attr_t *mem_pattr, uint32_t mtt_pgsize_bits);
73 static int tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc);
74 static int tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc);
75 
76 /*
77  * The Tavor umem_lockmemory() callback ops.  When userland memory is
78  * registered, these callback ops are specified.  The tavor_umap_umemlock_cb()
79  * callback will be called whenever the memory for the corresponding
80  * ddi_umem_cookie_t is being freed.
81  */
82 static struct umem_callback_ops tavor_umem_cbops = {
83 	UMEM_CALLBACK_VERSION,
84 	tavor_umap_umemlock_cb,
85 };
86 
87 
88 /*
89  * tavor_mr_register()
90  *    Context: Can be called from interrupt or base context.
91  */
92 int
93 tavor_mr_register(tavor_state_t *state, tavor_pdhdl_t pd,
94     ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
95 {
96 	tavor_bind_info_t	bind;
97 	int			status;
98 
99 	TAVOR_TNF_ENTER(tavor_mr_register);
100 
101 	/*
102 	 * Fill in the "bind" struct.  This struct provides the majority
103 	 * of the information that will be used to distinguish between an
104 	 * "addr" binding (as is the case here) and a "buf" binding (see
105 	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
106 	 * which does most of the "heavy lifting" for the Tavor memory
107 	 * registration routines.
108 	 */
109 	bind.bi_type  = TAVOR_BINDHDL_VADDR;
110 	bind.bi_addr  = mr_attr->mr_vaddr;
111 	bind.bi_len   = mr_attr->mr_len;
112 	bind.bi_as    = mr_attr->mr_as;
113 	bind.bi_flags = mr_attr->mr_flags;
114 	status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
115 	if (status != DDI_SUCCESS) {
116 		TNF_PROBE_0(tavor_mr_register_cmnreg_fail,
117 		    TAVOR_TNF_ERROR, "");
118 		TAVOR_TNF_EXIT(tavor_mr_register);
119 		return (status);
120 	}
121 
122 	TAVOR_TNF_EXIT(tavor_mr_register);
123 	return (DDI_SUCCESS);
124 }
125 
126 
127 /*
128  * tavor_mr_register_buf()
129  *    Context: Can be called from interrupt or base context.
130  */
131 int
132 tavor_mr_register_buf(tavor_state_t *state, tavor_pdhdl_t pd,
133     ibt_smr_attr_t *mr_attr, struct buf *buf, tavor_mrhdl_t *mrhdl,
134     tavor_mr_options_t *op)
135 {
136 	tavor_bind_info_t	bind;
137 	int			status;
138 
139 	TAVOR_TNF_ENTER(tavor_mr_register_buf);
140 
141 	/*
142 	 * Fill in the "bind" struct.  This struct provides the majority
143 	 * of the information that will be used to distinguish between an
144 	 * "addr" binding (see above) and a "buf" binding (as is the case
145 	 * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
146 	 * which does most of the "heavy lifting" for the Tavor memory
147 	 * registration routines.  Note: We have chosen to provide
148 	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
149 	 * not set).  It is not critical what value we choose here as it need
150 	 * only be unique for the given RKey (which will happen by default),
151 	 * so the choice here is somewhat arbitrary.
152 	 */
153 	bind.bi_type  = TAVOR_BINDHDL_BUF;
154 	bind.bi_buf   = buf;
155 	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
156 		bind.bi_addr  = mr_attr->mr_vaddr;
157 	} else {
158 		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
159 	}
160 	bind.bi_as    = NULL;
161 	bind.bi_len   = (uint64_t)buf->b_bcount;
162 	bind.bi_flags = mr_attr->mr_flags;
163 	status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
164 	if (status != DDI_SUCCESS) {
165 		TNF_PROBE_0(tavor_mr_register_buf_cmnreg_fail,
166 		    TAVOR_TNF_ERROR, "");
167 		TAVOR_TNF_EXIT(tavor_mr_register_buf);
168 		return (status);
169 	}
170 
171 	TAVOR_TNF_EXIT(tavor_mr_register_buf);
172 	return (DDI_SUCCESS);
173 }
174 
175 
176 /*
177  * tavor_mr_register_shared()
178  *    Context: Can be called from interrupt or base context.
179  */
180 int
181 tavor_mr_register_shared(tavor_state_t *state, tavor_mrhdl_t mrhdl,
182     tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new)
183 {
184 	tavor_rsrc_pool_info_t	*rsrc_pool;
185 	tavor_rsrc_t		*mpt, *mtt, *rsrc;
186 	tavor_umap_db_entry_t	*umapdb;
187 	tavor_hw_mpt_t		mpt_entry;
188 	tavor_mrhdl_t		mr;
189 	tavor_bind_info_t	*bind;
190 	ddi_umem_cookie_t	umem_cookie;
191 	size_t			umem_len;
192 	caddr_t			umem_addr;
193 	uint64_t		mtt_addr, mtt_ddrbaseaddr, pgsize_msk;
194 	uint_t			sleep, mr_is_umem;
195 	int			status, umem_flags;
196 	char			*errormsg;
197 
198 	TAVOR_TNF_ENTER(tavor_mr_register_shared);
199 
200 	/*
201 	 * Check the sleep flag.  Ensure that it is consistent with the
202 	 * current thread context (i.e. if we are currently in the interrupt
203 	 * context, then we shouldn't be attempting to sleep).
204 	 */
205 	sleep = (mr_attr->mr_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP :
206 	    TAVOR_SLEEP;
207 	if ((sleep == TAVOR_SLEEP) &&
208 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
209 		/* Set "status" and "errormsg" and goto failure */
210 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
211 		goto mrshared_fail;
212 	}
213 
214 	/* Increment the reference count on the protection domain (PD) */
215 	tavor_pd_refcnt_inc(pd);
216 
217 	/*
218 	 * Allocate an MPT entry.  This will be filled in with all the
219 	 * necessary parameters to define the shared memory region.
220 	 * Specifically, it will be made to reference the currently existing
221 	 * MTT entries and ownership of the MPT will be passed to the hardware
222 	 * in the last step below.  If we fail here, we must undo the
223 	 * protection domain reference count.
224 	 */
225 	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
226 	if (status != DDI_SUCCESS) {
227 		/* Set "status" and "errormsg" and goto failure */
228 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
229 		goto mrshared_fail1;
230 	}
231 
232 	/*
233 	 * Allocate the software structure for tracking the shared memory
234 	 * region (i.e. the Tavor Memory Region handle).  If we fail here, we
235 	 * must undo the protection domain reference count and the previous
236 	 * resource allocation.
237 	 */
238 	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
239 	if (status != DDI_SUCCESS) {
240 		/* Set "status" and "errormsg" and goto failure */
241 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
242 		goto mrshared_fail2;
243 	}
244 	mr = (tavor_mrhdl_t)rsrc->tr_addr;
245 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
246 
247 	/*
248 	 * Setup and validate the memory region access flags.  This means
249 	 * translating the IBTF's enable flags into the access flags that
250 	 * will be used in later operations.
251 	 */
252 	mr->mr_accflag = 0;
253 	if (mr_attr->mr_flags & IBT_MR_ENABLE_WINDOW_BIND)
254 		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
255 	if (mr_attr->mr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
256 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
257 	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_READ)
258 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
259 	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
260 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
261 	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
262 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
263 
264 	/*
265 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
266 	 * from a certain number of "constrained" bits (the least significant
267 	 * bits) and some number of "unconstrained" bits.  The constrained
268 	 * bits must be set to the index of the entry in the MPT table, but
269 	 * the unconstrained bits can be set to any value we wish.  Note:
270 	 * if no remote access is required, then the RKey value is not filled
271 	 * in.  Otherwise both Rkey and LKey are given the same value.
272 	 */
273 	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
274 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
275 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
276 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
277 		mr->mr_rkey = mr->mr_lkey;
278 	}
279 
280 	/* Grab the MR lock for the current memory region */
281 	mutex_enter(&mrhdl->mr_lock);
282 
283 	/*
284 	 * Check here to see if the memory region has already been partially
285 	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
286 	 * If so, this is an error, return failure.
287 	 */
288 	if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
289 		mutex_exit(&mrhdl->mr_lock);
290 		/* Set "status" and "errormsg" and goto failure */
291 		TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
292 		goto mrshared_fail3;
293 	}
294 
295 	/*
296 	 * Determine if the original memory was from userland and, if so, pin
297 	 * the pages (again) with umem_lockmemory().  This will guarantee a
298 	 * separate callback for each of this shared region's MR handles.
299 	 * If this is userland memory, then allocate an entry in the
300 	 * "userland resources database".  This will later be added to
301 	 * the database (after all further memory registration operations are
302 	 * successful).  If we fail here, we must undo all the above setup.
303 	 */
304 	mr_is_umem = mrhdl->mr_is_umem;
305 	if (mr_is_umem) {
306 		umem_len   = ptob(btopr(mrhdl->mr_bindinfo.bi_len +
307 		    ((uintptr_t)mrhdl->mr_bindinfo.bi_addr & PAGEOFFSET)));
308 		umem_addr  = (caddr_t)((uintptr_t)mrhdl->mr_bindinfo.bi_addr &
309 		    ~PAGEOFFSET);
310 		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
311 		    DDI_UMEMLOCK_LONGTERM);
312 		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
313 		    &umem_cookie, &tavor_umem_cbops, curproc);
314 		if (status != 0) {
315 			mutex_exit(&mrhdl->mr_lock);
316 			/* Set "status" and "errormsg" and goto failure */
317 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
318 			goto mrshared_fail3;
319 		}
320 
321 		umapdb = tavor_umap_db_alloc(state->ts_instance,
322 		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
323 		    (uint64_t)(uintptr_t)rsrc);
324 		if (umapdb == NULL) {
325 			mutex_exit(&mrhdl->mr_lock);
326 			/* Set "status" and "errormsg" and goto failure */
327 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
328 			goto mrshared_fail4;
329 		}
330 	}
331 
332 	/*
333 	 * Copy the MTT resource pointer (and additional parameters) from
334 	 * the original Tavor Memory Region handle.  Note: this is normally
335 	 * where the tavor_mr_mem_bind() routine would be called, but because
336 	 * we already have bound and filled-in MTT entries it is simply a
337 	 * matter here of managing the MTT reference count and grabbing the
338 	 * address of the MTT table entries (for filling in the shared region's
339 	 * MPT entry).
340 	 */
341 	mr->mr_mttrsrcp	  = mrhdl->mr_mttrsrcp;
342 	mr->mr_logmttpgsz = mrhdl->mr_logmttpgsz;
343 	mr->mr_bindinfo	  = mrhdl->mr_bindinfo;
344 	mr->mr_mttrefcntp = mrhdl->mr_mttrefcntp;
345 	mutex_exit(&mrhdl->mr_lock);
346 	bind = &mr->mr_bindinfo;
347 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
348 	mtt = mr->mr_mttrsrcp;
349 
350 	/*
351 	 * Increment the MTT reference count (to reflect the fact that
352 	 * the MTT is now shared)
353 	 */
354 	(void) tavor_mtt_refcnt_inc(mr->mr_mttrefcntp);
355 
356 	/*
357 	 * Update the new "bind" virtual address.  Do some extra work here
358 	 * to ensure proper alignment.  That is, make sure that the page
359 	 * offset for the beginning of the old range is the same as the
360 	 * offset for this new mapping
361 	 */
362 	pgsize_msk = (((uint64_t)1 << mr->mr_logmttpgsz) - 1);
363 	bind->bi_addr = ((mr_attr->mr_vaddr & ~pgsize_msk) |
364 	    (mr->mr_bindinfo.bi_addr & pgsize_msk));
365 
366 	/*
367 	 * Get the base address for the MTT table.  This will be necessary
368 	 * in the next step when we are setting up the MPT entry.
369 	 */
370 	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
371 	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
372 
373 	/*
374 	 * Fill in the MPT entry.  This is the final step before passing
375 	 * ownership of the MPT entry to the Tavor hardware.  We use all of
376 	 * the information collected/calculated above to fill in the
377 	 * requisite portions of the MPT.
378 	 */
379 	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
380 	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
381 	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
382 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
383 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
384 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
385 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
386 	mpt_entry.lr	  = 1;
387 	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
388 	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
389 	mpt_entry.mem_key	= mr->mr_lkey;
390 	mpt_entry.pd		= pd->pd_pdnum;
391 	mpt_entry.start_addr	= bind->bi_addr;
392 	mpt_entry.reg_win_len	= bind->bi_len;
393 	mpt_entry.win_cnt_limit	= TAVOR_UNLIMITED_WIN_BIND;
394 	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
395 	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
396 	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
397 
398 	/*
399 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
400 	 * the entry to the hardware.  Note: in general, this operation
401 	 * shouldn't fail.  But if it does, we have to undo everything we've
402 	 * done above before returning error.
403 	 */
404 	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
405 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
406 	if (status != TAVOR_CMD_SUCCESS) {
407 		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
408 		    status);
409 		TNF_PROBE_1(tavor_mr_register_shared_sw2hw_mpt_cmd_fail,
410 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
411 		/* Set "status" and "errormsg" and goto failure */
412 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
413 		    "tavor SW2HW_MPT command");
414 		goto mrshared_fail5;
415 	}
416 
417 	/*
418 	 * Fill in the rest of the Tavor Memory Region handle.  Having
419 	 * successfully transferred ownership of the MPT, we can update the
420 	 * following fields for use in further operations on the MR.
421 	 */
422 	mr->mr_mptrsrcp	  = mpt;
423 	mr->mr_mttrsrcp	  = mtt;
424 	mr->mr_pdhdl	  = pd;
425 	mr->mr_rsrcp	  = rsrc;
426 	mr->mr_is_umem	  = mr_is_umem;
427 	mr->mr_is_fmr	  = 0;
428 	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
429 	mr->mr_umem_cbfunc = NULL;
430 	mr->mr_umem_cbarg1 = NULL;
431 	mr->mr_umem_cbarg2 = NULL;
432 
433 	/*
434 	 * If this is userland memory, then we need to insert the previously
435 	 * allocated entry into the "userland resources database".  This will
436 	 * allow for later coordination between the tavor_umap_umemlock_cb()
437 	 * callback and tavor_mr_deregister().
438 	 */
439 	if (mr_is_umem) {
440 		tavor_umap_db_add(umapdb);
441 	}
442 
443 	*mrhdl_new = mr;
444 
445 	TAVOR_TNF_EXIT(tavor_mr_register_shared);
446 	return (DDI_SUCCESS);
447 
448 /*
449  * The following is cleanup for all possible failure cases in this routine
450  */
451 mrshared_fail5:
452 	(void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
453 	if (mr_is_umem) {
454 		tavor_umap_db_free(umapdb);
455 	}
456 mrshared_fail4:
457 	if (mr_is_umem) {
458 		ddi_umem_unlock(umem_cookie);
459 	}
460 mrshared_fail3:
461 	tavor_rsrc_free(state, &rsrc);
462 mrshared_fail2:
463 	tavor_rsrc_free(state, &mpt);
464 mrshared_fail1:
465 	tavor_pd_refcnt_dec(pd);
466 mrshared_fail:
467 	TNF_PROBE_1(tavor_mr_register_shared_fail, TAVOR_TNF_ERROR, "",
468 	    tnf_string, msg, errormsg);
469 	TAVOR_TNF_EXIT(tavor_mr_register_shared);
470 	return (status);
471 }
472 
473 /*
474  * tavor_mr_alloc_fmr()
475  *    Context: Can be called from interrupt or base context.
476  */
477 int
478 tavor_mr_alloc_fmr(tavor_state_t *state, tavor_pdhdl_t pd,
479     tavor_fmrhdl_t fmr_pool, tavor_mrhdl_t *mrhdl)
480 {
481 	tavor_rsrc_pool_info_t	*rsrc_pool;
482 	tavor_rsrc_t		*mpt, *mtt, *rsrc;
483 	tavor_hw_mpt_t		mpt_entry;
484 	tavor_mrhdl_t		mr;
485 	tavor_bind_info_t	bind;
486 	uint64_t		mtt_addr, mtt_ddrbaseaddr;
487 	uint64_t		nummtt;
488 	uint_t			sleep, mtt_pgsize_bits;
489 	int			status;
490 	char			*errormsg;
491 
492 	TAVOR_TNF_ENTER(tavor_mr_alloc_fmr);
493 
494 	/*
495 	 * Check the sleep flag.  Ensure that it is consistent with the
496 	 * current thread context (i.e. if we are currently in the interrupt
497 	 * context, then we shouldn't be attempting to sleep).
498 	 */
499 	sleep = (fmr_pool->fmr_flags & IBT_MR_SLEEP) ? TAVOR_SLEEP :
500 	    TAVOR_NOSLEEP;
501 	if ((sleep == TAVOR_SLEEP) &&
502 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
503 		TNF_PROBE_0(tavor_mr_alloc_fmr, TAVOR_TNF_ERROR, "");
504 		TAVOR_TNF_EXIT(tavor_mr_alloc_fmr);
505 		return (IBT_INVALID_PARAM);
506 	}
507 
508 	/* Increment the reference count on the protection domain (PD) */
509 	tavor_pd_refcnt_inc(pd);
510 
511 	/*
512 	 * Allocate an MPT entry.  This will be filled in with all the
513 	 * necessary parameters to define the FMR.  Specifically, it will be
514 	 * made to reference the currently existing MTT entries and ownership
515 	 * of the MPT will be passed to the hardware in the last step below.
516 	 * If we fail here, we must undo the protection domain reference count.
517 	 */
518 
519 	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
520 	if (status != DDI_SUCCESS) {
521 		/* Set "status" and "errormsg" and goto failure */
522 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
523 		goto fmralloc_fail1;
524 	}
525 
526 	/*
527 	 * Allocate the software structure for tracking the fmr memory
528 	 * region (i.e. the Tavor Memory Region handle).  If we fail here, we
529 	 * must undo the protection domain reference count and the previous
530 	 * resource allocation.
531 	 */
532 	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
533 	if (status != DDI_SUCCESS) {
534 		/* Set "status" and "errormsg" and goto failure */
535 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
536 		goto fmralloc_fail2;
537 	}
538 	mr = (tavor_mrhdl_t)rsrc->tr_addr;
539 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
540 
541 	/*
542 	 * Setup and validate the memory region access flags.  This means
543 	 * translating the IBTF's enable flags into the access flags that
544 	 * will be used in later operations.
545 	 */
546 	mr->mr_accflag = 0;
547 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
548 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
549 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_READ)
550 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
551 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
552 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
553 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
554 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
555 
556 	/*
557 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
558 	 * from a certain number of "constrained" bits (the least significant
559 	 * bits) and some number of "unconstrained" bits.  The constrained
560 	 * bits must be set to the index of the entry in the MPT table, but
561 	 * the unconstrained bits can be set to any value we wish.  Note:
562 	 * if no remote access is required, then the RKey value is not filled
563 	 * in.  Otherwise both Rkey and LKey are given the same value.
564 	 */
565 	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
566 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
567 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
568 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
569 		mr->mr_rkey = mr->mr_lkey;
570 	}
571 
572 	/*
573 	 * Determine number of pages spanned.  This routine uses the
574 	 * information in the "bind" struct to determine the required
575 	 * number of MTT entries needed (and returns the suggested page size -
576 	 * as a "power-of-2" - for each MTT entry).
577 	 */
578 	/* Assume address will be page aligned later */
579 	bind.bi_addr = 0;
580 	/* Calculate size based on given max pages */
581 	bind.bi_len = fmr_pool->fmr_max_pages << PAGESHIFT;
582 	nummtt = tavor_mr_nummtt_needed(state, &bind, &mtt_pgsize_bits);
583 
584 	/*
585 	 * Allocate the MTT entries.  Use the calculations performed above to
586 	 * allocate the required number of MTT entries.  Note: MTT entries are
587 	 * allocated in "MTT segments" which consist of complete cachelines
588 	 * (i.e. 8 entries, 16 entries, etc.)  So the TAVOR_NUMMTT_TO_MTTSEG()
589 	 * macro is used to do the proper conversion.  If we fail here, we
590 	 * must not only undo all the previous resource allocation (and PD
591 	 * reference count), but we must also unbind the memory.
592 	 */
593 	status = tavor_rsrc_alloc(state, TAVOR_MTT,
594 	    TAVOR_NUMMTT_TO_MTTSEG(nummtt), sleep, &mtt);
595 	if (status != DDI_SUCCESS) {
596 		/* Set "status" and "errormsg" and goto failure */
597 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
598 		goto fmralloc_fail3;
599 	}
600 	mr->mr_logmttpgsz = mtt_pgsize_bits;
601 
602 	/*
603 	 * Get the base address for the MTT table.  This will be necessary
604 	 * in the next step when we are setting up the MPT entry.
605 	 */
606 	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
607 	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
608 
609 	/*
610 	 * Fill in the MPT entry.  This is the final step before passing
611 	 * ownership of the MPT entry to the Tavor hardware.  We use all of
612 	 * the information collected/calculated above to fill in the
613 	 * requisite portions of the MPT.
614 	 */
615 	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
616 	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
617 	mpt_entry.en_bind = 0;
618 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
619 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
620 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
621 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
622 	mpt_entry.lr	  = 1;
623 	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
624 	mpt_entry.pd		= pd->pd_pdnum;
625 	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
626 	mpt_entry.win_cnt_limit	= TAVOR_UNLIMITED_WIN_BIND;
627 	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
628 	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
629 	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
630 	mpt_entry.mem_key	= mr->mr_lkey;
631 
632 	/*
633 	 * FMR sets these to 0 for now.  Later during actual fmr registration
634 	 * these values are filled in.
635 	 */
636 	mpt_entry.start_addr	= 0;
637 	mpt_entry.reg_win_len	= 0;
638 
639 	/*
640 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
641 	 * the entry to the hardware.  Note: in general, this operation
642 	 * shouldn't fail.  But if it does, we have to undo everything we've
643 	 * done above before returning error.
644 	 */
645 	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
646 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
647 	if (status != TAVOR_CMD_SUCCESS) {
648 		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
649 		    status);
650 		TNF_PROBE_1(tavor_mr_register_shared_sw2hw_mpt_cmd_fail,
651 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
652 		/* Set "status" and "errormsg" and goto failure */
653 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
654 		    "tavor SW2HW_MPT command");
655 		goto fmralloc_fail4;
656 	}
657 
658 	/*
659 	 * Fill in the rest of the Tavor Memory Region handle.  Having
660 	 * successfully transferred ownership of the MPT, we can update the
661 	 * following fields for use in further operations on the MR.  Also, set
662 	 * that this is an FMR region.
663 	 */
664 	mr->mr_mptrsrcp	  = mpt;
665 	mr->mr_mttrsrcp	  = mtt;
666 	mr->mr_pdhdl	  = pd;
667 	mr->mr_rsrcp	  = rsrc;
668 	mr->mr_is_fmr	  = 1;
669 	(void) memcpy(&mr->mr_bindinfo, &bind, sizeof (tavor_bind_info_t));
670 
671 	*mrhdl = mr;
672 
673 	TAVOR_TNF_EXIT(tavor_mr_alloc_fmr);
674 	return (DDI_SUCCESS);
675 
676 /*
677  * The following is cleanup for all possible failure cases in this routine
678  */
679 fmralloc_fail4:
680 	tavor_rsrc_free(state, &mtt);
681 fmralloc_fail3:
682 	tavor_rsrc_free(state, &rsrc);
683 fmralloc_fail2:
684 	tavor_rsrc_free(state, &mpt);
685 fmralloc_fail1:
686 	tavor_pd_refcnt_dec(pd);
687 fmralloc_fail:
688 	TNF_PROBE_1(tavor_mr_alloc_fmr, TAVOR_TNF_ERROR, "",
689 	    tnf_string, msg, errormsg);
690 	TAVOR_TNF_EXIT(tavor_mr_alloc_fmr);
691 	return (status);
692 }
693 
694 /*
695  * tavor_mr_register_physical_fmr()
696  *    Context: Can be called from interrupt or base context.
697  */
698 int
699 tavor_mr_register_physical_fmr(tavor_state_t *state,
700     ibt_pmr_attr_t *mem_pattr_p, tavor_mrhdl_t mr, ibt_pmr_desc_t *mem_desc_p)
701 {
702 	tavor_rsrc_t		*mpt;
703 	uint64_t		*mpt_table;
704 	int			status;
705 	char			*errormsg;
706 
707 	TAVOR_TNF_ENTER(tavor_mr_register_physical_fmr);
708 
709 	mutex_enter(&mr->mr_lock);
710 	mpt = mr->mr_mptrsrcp;
711 	mpt_table = (uint64_t *)mpt->tr_addr;
712 
713 	/* Write MPT status to SW bit */
714 	ddi_put8(mpt->tr_acchdl, (uint8_t *)&mpt_table[0], 0xF);
715 
716 	/*
717 	 * Write the mapped addresses into the MTT entries.  FMR needs to do
718 	 * this a little differently, so we call the fmr specific fast mtt
719 	 * write here.
720 	 */
721 	status = tavor_mr_fast_mtt_write_fmr(mr->mr_mttrsrcp, mem_pattr_p,
722 	    mr->mr_logmttpgsz);
723 	if (status != DDI_SUCCESS) {
724 		mutex_exit(&mr->mr_lock);
725 		/* Set "status" and "errormsg" and goto failure */
726 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed write mtt");
727 		goto fmr_reg_fail1;
728 	}
729 
730 	/*
731 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
732 	 * from a certain number of "constrained" bits (the least significant
733 	 * bits) and some number of "unconstrained" bits.  The constrained
734 	 * bits must be set to the index of the entry in the MPT table, but
735 	 * the unconstrained bits can be set to any value we wish.  Note:
736 	 * if no remote access is required, then the RKey value is not filled
737 	 * in.  Otherwise both Rkey and LKey are given the same value.
738 	 */
739 	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
740 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
741 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
742 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
743 		mr->mr_rkey = mr->mr_lkey;
744 	}
745 
746 	/* write mem key value */
747 	ddi_put32(mpt->tr_acchdl, (uint32_t *)&mpt_table[1], mr->mr_lkey);
748 
749 	/* write length value */
750 	ddi_put64(mpt->tr_acchdl, &mpt_table[3], mem_pattr_p->pmr_len);
751 
752 	/* write start addr value */
753 	ddi_put64(mpt->tr_acchdl, &mpt_table[2], mem_pattr_p->pmr_iova);
754 
755 	/* write lkey value */
756 	ddi_put32(mpt->tr_acchdl, (uint32_t *)&mpt_table[4], mr->mr_lkey);
757 
758 	/* Write MPT status to HW bit */
759 	ddi_put8(mpt->tr_acchdl, (uint8_t *)&mpt_table[0], 0x0);
760 
761 	/* Fill in return parameters */
762 	mem_desc_p->pmd_lkey = mr->mr_lkey;
763 	mem_desc_p->pmd_rkey = mr->mr_rkey;
764 	mem_desc_p->pmd_iova = mem_pattr_p->pmr_iova;
765 	mem_desc_p->pmd_phys_buf_list_sz = mem_pattr_p->pmr_len;
766 
767 	/* Fill in MR bindinfo struct for later sync or query operations */
768 	mr->mr_bindinfo.bi_addr = mem_pattr_p->pmr_iova;
769 	mr->mr_bindinfo.bi_flags = mem_pattr_p->pmr_flags & IBT_MR_NONCOHERENT;
770 
771 	mutex_exit(&mr->mr_lock);
772 
773 	TAVOR_TNF_EXIT(tavor_mr_register_physical_fmr);
774 	return (DDI_SUCCESS);
775 
776 fmr_reg_fail1:
777 	/*
778 	 * Note, we fail here, and purposely leave the memory ownership in
779 	 * software.  The memory tables may be corrupt, so we leave the region
780 	 * unregistered.
781 	 */
782 	TNF_PROBE_1(tavor_mr_register_physical_fmr_fail, TAVOR_TNF_ERROR, "",
783 	    tnf_string, msg, errormsg);
784 	TAVOR_TNF_EXIT(tavor_mr_register_physical_fmr);
785 	return (DDI_FAILURE);
786 }
787 
788 
789 /*
790  * tavor_mr_deregister()
791  *    Context: Can be called from interrupt or base context.
792  */
793 /* ARGSUSED */
794 int
795 tavor_mr_deregister(tavor_state_t *state, tavor_mrhdl_t *mrhdl, uint_t level,
796     uint_t sleep)
797 {
798 	tavor_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
799 	tavor_umap_db_entry_t	*umapdb;
800 	tavor_pdhdl_t		pd;
801 	tavor_mrhdl_t		mr;
802 	tavor_bind_info_t	*bind;
803 	uint64_t		value;
804 	int			status, shared_mtt;
805 	char			*errormsg;
806 
807 	TAVOR_TNF_ENTER(tavor_mr_deregister);
808 
809 	/*
810 	 * Check the sleep flag.  Ensure that it is consistent with the
811 	 * current thread context (i.e. if we are currently in the interrupt
812 	 * context, then we shouldn't be attempting to sleep).
813 	 */
814 	if ((sleep == TAVOR_SLEEP) &&
815 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
816 		/* Set "status" and "errormsg" and goto failure */
817 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
818 		TNF_PROBE_1(tavor_mr_deregister_fail, TAVOR_TNF_ERROR, "",
819 		    tnf_string, msg, errormsg);
820 		TAVOR_TNF_EXIT(tavor_mr_deregister);
821 		return (status);
822 	}
823 
824 	/*
825 	 * Pull all the necessary information from the Tavor Memory Region
826 	 * handle.  This is necessary here because the resource for the
827 	 * MR handle is going to be freed up as part of the this
828 	 * deregistration
829 	 */
830 	mr	= *mrhdl;
831 	mutex_enter(&mr->mr_lock);
832 	mpt	= mr->mr_mptrsrcp;
833 	mtt	= mr->mr_mttrsrcp;
834 	mtt_refcnt = mr->mr_mttrefcntp;
835 	rsrc	= mr->mr_rsrcp;
836 	pd	= mr->mr_pdhdl;
837 	bind	= &mr->mr_bindinfo;
838 
839 	/*
840 	 * Check here if the memory region is really an FMR.  If so, this is a
841 	 * bad thing and we shouldn't be here.  Return failure.
842 	 */
843 	if (mr->mr_is_fmr) {
844 		mutex_exit(&mr->mr_lock);
845 		TNF_PROBE_0(tavor_mr_deregister_is_fmr, TAVOR_TNF_ERROR, "");
846 		TAVOR_TNF_EXIT(tavor_mr_deregister);
847 		return (IBT_INVALID_PARAM);
848 	}
849 
850 	/*
851 	 * Check here to see if the memory region has already been partially
852 	 * deregistered as a result of the tavor_umap_umemlock_cb() callback.
853 	 * If so, then jump to the end and free the remaining resources.
854 	 */
855 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
856 		goto mrdereg_finish_cleanup;
857 	}
858 
859 	/*
860 	 * We must drop the "mr_lock" here to ensure that both SLEEP and
861 	 * NOSLEEP calls into the firmware work as expected.  Also, if two
862 	 * threads are attemping to access this MR (via de-register,
863 	 * re-register, or otherwise), then we allow the firmware to enforce
864 	 * the checking, that only one deregister is valid.
865 	 */
866 	mutex_exit(&mr->mr_lock);
867 
868 	/*
869 	 * Reclaim MPT entry from hardware (if necessary).  Since the
870 	 * tavor_mr_deregister() routine is used in the memory region
871 	 * reregistration process as well, it is possible that we will
872 	 * not always wish to reclaim ownership of the MPT.  Check the
873 	 * "level" arg and, if necessary, attempt to reclaim it.  If
874 	 * the ownership transfer fails for any reason, we check to see
875 	 * what command status was returned from the hardware.  The only
876 	 * "expected" error status is the one that indicates an attempt to
877 	 * deregister a memory region that has memory windows bound to it
878 	 */
879 	if (level >= TAVOR_MR_DEREG_ALL) {
880 		status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT,
881 		    NULL, 0, mpt->tr_indx, sleep);
882 		if (status != TAVOR_CMD_SUCCESS) {
883 			if (status == TAVOR_CMD_REG_BOUND) {
884 				TAVOR_TNF_EXIT(tavor_mr_deregister);
885 				return (IBT_MR_IN_USE);
886 			} else {
887 				cmn_err(CE_CONT, "Tavor: HW2SW_MPT command "
888 				    "failed: %08x\n", status);
889 				TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail,
890 				    TAVOR_TNF_ERROR, "", tnf_uint, status,
891 				    status);
892 				TAVOR_TNF_EXIT(tavor_mr_deregister);
893 				return (IBT_INVALID_PARAM);
894 			}
895 		}
896 	}
897 
898 	/*
899 	 * Re-grab the mr_lock here.  Since further access to the protected
900 	 * 'mr' structure is needed, and we would have returned previously for
901 	 * the multiple deregistration case, we can safely grab the lock here.
902 	 */
903 	mutex_enter(&mr->mr_lock);
904 
905 	/*
906 	 * If the memory had come from userland, then we do a lookup in the
907 	 * "userland resources database".  On success, we free the entry, call
908 	 * ddi_umem_unlock(), and continue the cleanup.  On failure (which is
909 	 * an indication that the umem_lockmemory() callback has called
910 	 * tavor_mr_deregister()), we call ddi_umem_unlock() and invalidate
911 	 * the "mr_umemcookie" field in the MR handle (this will be used
912 	 * later to detect that only partial cleaup still remains to be done
913 	 * on the MR handle).
914 	 */
915 	if (mr->mr_is_umem) {
916 		status = tavor_umap_db_find(state->ts_instance,
917 		    (uint64_t)(uintptr_t)mr->mr_umemcookie,
918 		    MLNX_UMAP_MRMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
919 		    &umapdb);
920 		if (status == DDI_SUCCESS) {
921 			tavor_umap_db_free(umapdb);
922 			ddi_umem_unlock(mr->mr_umemcookie);
923 		} else {
924 			ddi_umem_unlock(mr->mr_umemcookie);
925 			mr->mr_umemcookie = NULL;
926 		}
927 	}
928 
929 	/*
930 	 * Decrement the MTT reference count.  Since the MTT resource
931 	 * may be shared between multiple memory regions (as a result
932 	 * of a "RegisterSharedMR" verb) it is important that we not
933 	 * free up or unbind resources prematurely.  If it's not shared (as
934 	 * indicated by the return status), then free the resource.
935 	 */
936 	shared_mtt = tavor_mtt_refcnt_dec(mtt_refcnt);
937 	if (!shared_mtt) {
938 		tavor_rsrc_free(state, &mtt_refcnt);
939 	}
940 
941 	/*
942 	 * Free up the MTT entries and unbind the memory.  Here, as above, we
943 	 * attempt to free these resources only if it is appropriate to do so.
944 	 */
945 	if (!shared_mtt) {
946 		if (level >= TAVOR_MR_DEREG_NO_HW2SW_MPT) {
947 			tavor_mr_mem_unbind(state, bind);
948 		}
949 		tavor_rsrc_free(state, &mtt);
950 	}
951 
952 	/*
953 	 * If the MR handle has been invalidated, then drop the
954 	 * lock and return success.  Note: This only happens because
955 	 * the umem_lockmemory() callback has been triggered.  The
956 	 * cleanup here is partial, and further cleanup (in a
957 	 * subsequent tavor_mr_deregister() call) will be necessary.
958 	 */
959 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
960 		mutex_exit(&mr->mr_lock);
961 		TAVOR_TNF_EXIT(tavor_mr_deregister);
962 		return (DDI_SUCCESS);
963 	}
964 
965 mrdereg_finish_cleanup:
966 	mutex_exit(&mr->mr_lock);
967 
968 	/* Free the Tavor Memory Region handle */
969 	tavor_rsrc_free(state, &rsrc);
970 
971 	/* Free up the MPT entry resource */
972 	tavor_rsrc_free(state, &mpt);
973 
974 	/* Decrement the reference count on the protection domain (PD) */
975 	tavor_pd_refcnt_dec(pd);
976 
977 	/* Set the mrhdl pointer to NULL and return success */
978 	*mrhdl = NULL;
979 
980 	TAVOR_TNF_EXIT(tavor_mr_deregister);
981 	return (DDI_SUCCESS);
982 }
983 
984 /*
985  * tavor_mr_dealloc_fmr()
986  *    Context: Can be called from interrupt or base context.
987  */
988 /* ARGSUSED */
989 int
990 tavor_mr_dealloc_fmr(tavor_state_t *state, tavor_mrhdl_t *mrhdl)
991 {
992 	tavor_rsrc_t		*mpt, *mtt, *rsrc;
993 	tavor_pdhdl_t		pd;
994 	tavor_mrhdl_t		mr;
995 
996 	TAVOR_TNF_ENTER(tavor_mr_dealloc_fmr);
997 
998 	/*
999 	 * Pull all the necessary information from the Tavor Memory Region
1000 	 * handle.  This is necessary here because the resource for the
1001 	 * MR handle is going to be freed up as part of the this
1002 	 * deregistration
1003 	 */
1004 	mr	= *mrhdl;
1005 	mutex_enter(&mr->mr_lock);
1006 	mpt	= mr->mr_mptrsrcp;
1007 	mtt	= mr->mr_mttrsrcp;
1008 	rsrc	= mr->mr_rsrcp;
1009 	pd	= mr->mr_pdhdl;
1010 	mutex_exit(&mr->mr_lock);
1011 
1012 	/* Free the MTT entries */
1013 	tavor_rsrc_free(state, &mtt);
1014 
1015 	/* Free the Tavor Memory Region handle */
1016 	tavor_rsrc_free(state, &rsrc);
1017 
1018 	/* Free up the MPT entry resource */
1019 	tavor_rsrc_free(state, &mpt);
1020 
1021 	/* Decrement the reference count on the protection domain (PD) */
1022 	tavor_pd_refcnt_dec(pd);
1023 
1024 	/* Set the mrhdl pointer to NULL and return success */
1025 	*mrhdl = NULL;
1026 
1027 	TAVOR_TNF_EXIT(tavor_mr_dealloc_fmr);
1028 	return (DDI_SUCCESS);
1029 }
1030 
1031 /*
1032  * tavor_mr_invalidate_fmr()
1033  *    Context: Can be called from interrupt or base context.
1034  */
1035 /* ARGSUSED */
1036 int
1037 tavor_mr_invalidate_fmr(tavor_state_t *state, tavor_mrhdl_t mr)
1038 {
1039 	tavor_rsrc_t		*mpt;
1040 	uint64_t		*mpt_table;
1041 
1042 	TAVOR_TNF_ENTER(tavor_mr_invalidate_fmr);
1043 
1044 	mutex_enter(&mr->mr_lock);
1045 	mpt = mr->mr_mptrsrcp;
1046 	mpt_table = (uint64_t *)mpt->tr_addr;
1047 
1048 	/* Write MPT status to SW bit */
1049 	ddi_put8(mpt->tr_acchdl, (uint8_t *)&mpt_table[0], 0xF);
1050 
1051 	/* invalidate mem key value */
1052 	ddi_put32(mpt->tr_acchdl, (uint32_t *)&mpt_table[1], 0);
1053 
1054 	/* invalidate lkey value */
1055 	ddi_put32(mpt->tr_acchdl, (uint32_t *)&mpt_table[4], 0);
1056 
1057 	/* Write MPT status to HW bit */
1058 	ddi_put8(mpt->tr_acchdl, (uint8_t *)&mpt_table[0], 0x0);
1059 
1060 	mutex_exit(&mr->mr_lock);
1061 
1062 	TAVOR_TNF_EXIT(tavor_mr_invalidate_fmr);
1063 	return (DDI_SUCCESS);
1064 }
1065 
1066 /*
1067  * tavor_mr_deregister_fmr()
1068  *    Context: Can be called from interrupt or base context.
1069  */
1070 /* ARGSUSED */
1071 int
1072 tavor_mr_deregister_fmr(tavor_state_t *state, tavor_mrhdl_t mr)
1073 {
1074 	tavor_rsrc_t		*mpt;
1075 	uint64_t		*mpt_table;
1076 
1077 	TAVOR_TNF_ENTER(tavor_mr_deregister_fmr);
1078 
1079 	mutex_enter(&mr->mr_lock);
1080 	mpt = mr->mr_mptrsrcp;
1081 	mpt_table = (uint64_t *)mpt->tr_addr;
1082 
1083 	/* Write MPT status to SW bit */
1084 	ddi_put8(mpt->tr_acchdl, (uint8_t *)&mpt_table[0], 0xF);
1085 	mutex_exit(&mr->mr_lock);
1086 
1087 	TAVOR_TNF_EXIT(tavor_mr_deregister_fmr);
1088 	return (DDI_SUCCESS);
1089 }
1090 
1091 
1092 /*
1093  * tavor_mr_query()
1094  *    Context: Can be called from interrupt or base context.
1095  */
1096 /* ARGSUSED */
1097 int
1098 tavor_mr_query(tavor_state_t *state, tavor_mrhdl_t mr,
1099     ibt_mr_query_attr_t *attr)
1100 {
1101 	TAVOR_TNF_ENTER(tavor_mr_query);
1102 
1103 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr))
1104 
1105 	mutex_enter(&mr->mr_lock);
1106 
1107 	/*
1108 	 * Check here to see if the memory region has already been partially
1109 	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
1110 	 * If so, this is an error, return failure.
1111 	 */
1112 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
1113 		mutex_exit(&mr->mr_lock);
1114 		TNF_PROBE_0(tavor_mr_query_inv_mrhdl_fail, TAVOR_TNF_ERROR, "");
1115 		TAVOR_TNF_EXIT(tavor_mr_query);
1116 		return (IBT_MR_HDL_INVALID);
1117 	}
1118 
1119 	/* Fill in the queried attributes */
1120 	attr->mr_attr_flags = mr->mr_accflag;
1121 	attr->mr_pd	= (ibt_pd_hdl_t)mr->mr_pdhdl;
1122 
1123 	/* Fill in the "local" attributes */
1124 	attr->mr_lkey = (ibt_lkey_t)mr->mr_lkey;
1125 	attr->mr_lbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
1126 	attr->mr_lbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
1127 
1128 	/*
1129 	 * Fill in the "remote" attributes (if necessary).  Note: the
1130 	 * remote attributes are only valid if the memory region has one
1131 	 * or more of the remote access flags set.
1132 	 */
1133 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1134 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1135 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1136 		attr->mr_rkey = (ibt_rkey_t)mr->mr_rkey;
1137 		attr->mr_rbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
1138 		attr->mr_rbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
1139 	}
1140 
1141 	/*
1142 	 * If region is mapped for streaming (i.e. noncoherent), then set sync
1143 	 * is required
1144 	 */
1145 	attr->mr_sync_required = (mr->mr_bindinfo.bi_flags &
1146 	    IBT_MR_NONCOHERENT) ? B_TRUE : B_FALSE;
1147 
1148 	mutex_exit(&mr->mr_lock);
1149 	TAVOR_TNF_EXIT(tavor_mr_query);
1150 	return (DDI_SUCCESS);
1151 }
1152 
1153 
1154 /*
1155  * tavor_mr_reregister()
1156  *    Context: Can be called from interrupt or base context.
1157  */
1158 int
1159 tavor_mr_reregister(tavor_state_t *state, tavor_mrhdl_t mr,
1160     tavor_pdhdl_t pd, ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new,
1161     tavor_mr_options_t *op)
1162 {
1163 	tavor_bind_info_t	bind;
1164 	int			status;
1165 
1166 	TAVOR_TNF_ENTER(tavor_mr_reregister);
1167 
1168 	/*
1169 	 * Fill in the "bind" struct.  This struct provides the majority
1170 	 * of the information that will be used to distinguish between an
1171 	 * "addr" binding (as is the case here) and a "buf" binding (see
1172 	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
1173 	 * which does most of the "heavy lifting" for the Tavor memory
1174 	 * registration (and reregistration) routines.
1175 	 */
1176 	bind.bi_type  = TAVOR_BINDHDL_VADDR;
1177 	bind.bi_addr  = mr_attr->mr_vaddr;
1178 	bind.bi_len   = mr_attr->mr_len;
1179 	bind.bi_as    = mr_attr->mr_as;
1180 	bind.bi_flags = mr_attr->mr_flags;
1181 	status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
1182 	if (status != DDI_SUCCESS) {
1183 		TNF_PROBE_0(tavor_mr_reregister_cmnreg_fail,
1184 		    TAVOR_TNF_ERROR, "");
1185 		TAVOR_TNF_EXIT(tavor_mr_reregister);
1186 		return (status);
1187 	}
1188 
1189 	TAVOR_TNF_EXIT(tavor_mr_reregister);
1190 	return (DDI_SUCCESS);
1191 }
1192 
1193 
1194 /*
1195  * tavor_mr_reregister_buf()
1196  *    Context: Can be called from interrupt or base context.
1197  */
1198 int
1199 tavor_mr_reregister_buf(tavor_state_t *state, tavor_mrhdl_t mr,
1200     tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, struct buf *buf,
1201     tavor_mrhdl_t *mrhdl_new, tavor_mr_options_t *op)
1202 {
1203 	tavor_bind_info_t	bind;
1204 	int			status;
1205 
1206 	TAVOR_TNF_ENTER(tavor_mr_reregister_buf);
1207 
1208 	/*
1209 	 * Fill in the "bind" struct.  This struct provides the majority
1210 	 * of the information that will be used to distinguish between an
1211 	 * "addr" binding (see above) and a "buf" binding (as is the case
1212 	 * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
1213 	 * which does most of the "heavy lifting" for the Tavor memory
1214 	 * registration routines.  Note: We have chosen to provide
1215 	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
1216 	 * not set).  It is not critical what value we choose here as it need
1217 	 * only be unique for the given RKey (which will happen by default),
1218 	 * so the choice here is somewhat arbitrary.
1219 	 */
1220 	bind.bi_type  = TAVOR_BINDHDL_BUF;
1221 	bind.bi_buf   = buf;
1222 	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
1223 		bind.bi_addr  = mr_attr->mr_vaddr;
1224 	} else {
1225 		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
1226 	}
1227 	bind.bi_len   = (uint64_t)buf->b_bcount;
1228 	bind.bi_flags = mr_attr->mr_flags;
1229 	bind.bi_as = NULL;
1230 	status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
1231 	if (status != DDI_SUCCESS) {
1232 		TNF_PROBE_0(tavor_mr_reregister_buf_cmnreg_fail,
1233 		    TAVOR_TNF_ERROR, "");
1234 		TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
1235 		return (status);
1236 	}
1237 
1238 	TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
1239 	return (DDI_SUCCESS);
1240 }
1241 
1242 
1243 /*
1244  * tavor_mr_sync()
1245  *    Context: Can be called from interrupt or base context.
1246  */
1247 /* ARGSUSED */
1248 int
1249 tavor_mr_sync(tavor_state_t *state, ibt_mr_sync_t *mr_segs, size_t num_segs)
1250 {
1251 	tavor_mrhdl_t		mrhdl;
1252 	uint64_t		seg_vaddr, seg_len, seg_end;
1253 	uint64_t		mr_start, mr_end;
1254 	uint_t			type;
1255 	int			status, i;
1256 	char			*errormsg;
1257 
1258 	TAVOR_TNF_ENTER(tavor_mr_sync);
1259 
1260 	/* Process each of the ibt_mr_sync_t's */
1261 	for (i = 0; i < num_segs; i++) {
1262 		mrhdl = (tavor_mrhdl_t)mr_segs[i].ms_handle;
1263 
1264 		/* Check for valid memory region handle */
1265 		if (mrhdl == NULL) {
1266 			/* Set "status" and "errormsg" and goto failure */
1267 			TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
1268 			goto mrsync_fail;
1269 		}
1270 
1271 		mutex_enter(&mrhdl->mr_lock);
1272 
1273 		/*
1274 		 * Check here to see if the memory region has already been
1275 		 * partially deregistered as a result of a
1276 		 * tavor_umap_umemlock_cb() callback.  If so, this is an
1277 		 * error, return failure.
1278 		 */
1279 		if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
1280 			mutex_exit(&mrhdl->mr_lock);
1281 			/* Set "status" and "errormsg" and goto failure */
1282 			TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl2");
1283 			goto mrsync_fail;
1284 		}
1285 
1286 		/* Check for valid bounds on sync request */
1287 		seg_vaddr = mr_segs[i].ms_vaddr;
1288 		seg_len	  = mr_segs[i].ms_len;
1289 		seg_end	  = seg_vaddr + seg_len - 1;
1290 		mr_start  = mrhdl->mr_bindinfo.bi_addr;
1291 		mr_end	  = mr_start + mrhdl->mr_bindinfo.bi_len - 1;
1292 		if ((seg_vaddr < mr_start) || (seg_vaddr > mr_end)) {
1293 			mutex_exit(&mrhdl->mr_lock);
1294 			/* Set "status" and "errormsg" and goto failure */
1295 			TAVOR_TNF_FAIL(IBT_MR_VA_INVALID, "invalid vaddr");
1296 			goto mrsync_fail;
1297 		}
1298 		if ((seg_end < mr_start) || (seg_end > mr_end)) {
1299 			mutex_exit(&mrhdl->mr_lock);
1300 			/* Set "status" and "errormsg" and goto failure */
1301 			TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
1302 			goto mrsync_fail;
1303 		}
1304 
1305 		/* Determine what type (i.e. direction) for sync */
1306 		if (mr_segs[i].ms_flags & IBT_SYNC_READ) {
1307 			type = DDI_DMA_SYNC_FORDEV;
1308 		} else if (mr_segs[i].ms_flags & IBT_SYNC_WRITE) {
1309 			type = DDI_DMA_SYNC_FORCPU;
1310 		} else {
1311 			mutex_exit(&mrhdl->mr_lock);
1312 			/* Set "status" and "errormsg" and goto failure */
1313 			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sync type");
1314 			goto mrsync_fail;
1315 		}
1316 
1317 		(void) ddi_dma_sync(mrhdl->mr_bindinfo.bi_dmahdl,
1318 		    (off_t)(seg_vaddr - mr_start), (size_t)seg_len, type);
1319 		mutex_exit(&mrhdl->mr_lock);
1320 	}
1321 
1322 	TAVOR_TNF_EXIT(tavor_mr_sync);
1323 	return (DDI_SUCCESS);
1324 
1325 mrsync_fail:
1326 	TNF_PROBE_1(tavor_mr_sync_fail, TAVOR_TNF_ERROR, "", tnf_string, msg,
1327 	    errormsg);
1328 	TAVOR_TNF_EXIT(tavor_mr_sync);
1329 	return (status);
1330 }
1331 
1332 
1333 /*
1334  * tavor_mw_alloc()
1335  *    Context: Can be called from interrupt or base context.
1336  */
1337 int
1338 tavor_mw_alloc(tavor_state_t *state, tavor_pdhdl_t pd, ibt_mw_flags_t flags,
1339     tavor_mwhdl_t *mwhdl)
1340 {
1341 	tavor_rsrc_t		*mpt, *rsrc;
1342 	tavor_hw_mpt_t		mpt_entry;
1343 	tavor_mwhdl_t		mw;
1344 	uint_t			sleep;
1345 	int			status;
1346 	char			*errormsg;
1347 
1348 	TAVOR_TNF_ENTER(tavor_mw_alloc);
1349 
1350 	/*
1351 	 * Check the sleep flag.  Ensure that it is consistent with the
1352 	 * current thread context (i.e. if we are currently in the interrupt
1353 	 * context, then we shouldn't be attempting to sleep).
1354 	 */
1355 	sleep = (flags & IBT_MW_NOSLEEP) ? TAVOR_NOSLEEP : TAVOR_SLEEP;
1356 	if ((sleep == TAVOR_SLEEP) &&
1357 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1358 		/* Set "status" and "errormsg" and goto failure */
1359 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1360 		goto mwalloc_fail;
1361 	}
1362 
1363 	/* Increment the reference count on the protection domain (PD) */
1364 	tavor_pd_refcnt_inc(pd);
1365 
1366 	/*
1367 	 * Allocate an MPT entry (for use as a memory window).  Since the
1368 	 * Tavor hardware uses the MPT entry for memory regions and for
1369 	 * memory windows, we will fill in this MPT with all the necessary
1370 	 * parameters for the memory window.  And then (just as we do for
1371 	 * memory regions) ownership will be passed to the hardware in the
1372 	 * final step below.  If we fail here, we must undo the protection
1373 	 * domain reference count.
1374 	 */
1375 	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
1376 	if (status != DDI_SUCCESS) {
1377 		/* Set "status" and "errormsg" and goto failure */
1378 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
1379 		goto mwalloc_fail1;
1380 	}
1381 
1382 	/*
1383 	 * Allocate the software structure for tracking the memory window (i.e.
1384 	 * the Tavor Memory Window handle).  Note: This is actually the same
1385 	 * software structure used for tracking memory regions, but since many
1386 	 * of the same properties are needed, only a single structure is
1387 	 * necessary.  If we fail here, we must undo the protection domain
1388 	 * reference count and the previous resource allocation.
1389 	 */
1390 	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
1391 	if (status != DDI_SUCCESS) {
1392 		/* Set "status" and "errormsg" and goto failure */
1393 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
1394 		goto mwalloc_fail2;
1395 	}
1396 	mw = (tavor_mwhdl_t)rsrc->tr_addr;
1397 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))
1398 
1399 	/*
1400 	 * Calculate an "unbound" RKey from MPT index.  In much the same way
1401 	 * as we do for memory regions (above), this key is constructed from
1402 	 * a "constrained" (which depends on the MPT index) and an
1403 	 * "unconstrained" portion (which may be arbitrarily chosen).
1404 	 */
1405 	tavor_mr_keycalc(state, mpt->tr_indx, &mw->mr_rkey);
1406 
1407 	/*
1408 	 * Fill in the MPT entry.  This is the final step before passing
1409 	 * ownership of the MPT entry to the Tavor hardware.  We use all of
1410 	 * the information collected/calculated above to fill in the
1411 	 * requisite portions of the MPT.  Note: fewer entries in the MPT
1412 	 * entry are necessary to allocate a memory window.
1413 	 */
1414 	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
1415 	mpt_entry.reg_win	= TAVOR_MPT_IS_WINDOW;
1416 	mpt_entry.mem_key	= mw->mr_rkey;
1417 	mpt_entry.pd		= pd->pd_pdnum;
1418 
1419 	/*
1420 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
1421 	 * the entry to the hardware.  Note: in general, this operation
1422 	 * shouldn't fail.  But if it does, we have to undo everything we've
1423 	 * done above before returning error.
1424 	 */
1425 	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1426 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
1427 	if (status != TAVOR_CMD_SUCCESS) {
1428 		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
1429 		    status);
1430 		TNF_PROBE_1(tavor_mw_alloc_sw2hw_mpt_cmd_fail,
1431 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1432 		/* Set "status" and "errormsg" and goto failure */
1433 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
1434 		    "tavor SW2HW_MPT command");
1435 		goto mwalloc_fail3;
1436 	}
1437 
1438 	/*
1439 	 * Fill in the rest of the Tavor Memory Window handle.  Having
1440 	 * successfully transferred ownership of the MPT, we can update the
1441 	 * following fields for use in further operations on the MW.
1442 	 */
1443 	mw->mr_mptrsrcp	= mpt;
1444 	mw->mr_pdhdl	= pd;
1445 	mw->mr_rsrcp	= rsrc;
1446 	*mwhdl = mw;
1447 
1448 	TAVOR_TNF_EXIT(tavor_mw_alloc);
1449 	return (DDI_SUCCESS);
1450 
1451 mwalloc_fail3:
1452 	tavor_rsrc_free(state, &rsrc);
1453 mwalloc_fail2:
1454 	tavor_rsrc_free(state, &mpt);
1455 mwalloc_fail1:
1456 	tavor_pd_refcnt_dec(pd);
1457 mwalloc_fail:
1458 	TNF_PROBE_1(tavor_mw_alloc_fail, TAVOR_TNF_ERROR, "",
1459 	    tnf_string, msg, errormsg);
1460 	TAVOR_TNF_EXIT(tavor_mw_alloc);
1461 	return (status);
1462 }
1463 
1464 
1465 /*
1466  * tavor_mw_free()
1467  *    Context: Can be called from interrupt or base context.
1468  */
1469 int
1470 tavor_mw_free(tavor_state_t *state, tavor_mwhdl_t *mwhdl, uint_t sleep)
1471 {
1472 	tavor_rsrc_t		*mpt, *rsrc;
1473 	tavor_mwhdl_t		mw;
1474 	int			status;
1475 	char			*errormsg;
1476 	tavor_pdhdl_t		pd;
1477 
1478 	TAVOR_TNF_ENTER(tavor_mw_free);
1479 
1480 	/*
1481 	 * Check the sleep flag.  Ensure that it is consistent with the
1482 	 * current thread context (i.e. if we are currently in the interrupt
1483 	 * context, then we shouldn't be attempting to sleep).
1484 	 */
1485 	if ((sleep == TAVOR_SLEEP) &&
1486 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1487 		/* Set "status" and "errormsg" and goto failure */
1488 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
1489 		TNF_PROBE_1(tavor_mw_free_fail, TAVOR_TNF_ERROR, "",
1490 		    tnf_string, msg, errormsg);
1491 		TAVOR_TNF_EXIT(tavor_mw_free);
1492 		return (status);
1493 	}
1494 
1495 	/*
1496 	 * Pull all the necessary information from the Tavor Memory Window
1497 	 * handle.  This is necessary here because the resource for the
1498 	 * MW handle is going to be freed up as part of the this operation.
1499 	 */
1500 	mw	= *mwhdl;
1501 	mutex_enter(&mw->mr_lock);
1502 	mpt	= mw->mr_mptrsrcp;
1503 	rsrc	= mw->mr_rsrcp;
1504 	pd	= mw->mr_pdhdl;
1505 	mutex_exit(&mw->mr_lock);
1506 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))
1507 
1508 	/*
1509 	 * Reclaim the MPT entry from hardware.  Note: in general, it is
1510 	 * unexpected for this operation to return an error.
1511 	 */
1512 	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, NULL,
1513 	    0, mpt->tr_indx, sleep);
1514 	if (status != TAVOR_CMD_SUCCESS) {
1515 		cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: %08x\n",
1516 		    status);
1517 		TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail, TAVOR_TNF_ERROR, "",
1518 		    tnf_uint, status, status);
1519 		TAVOR_TNF_EXIT(tavor_mw_free);
1520 		return (IBT_INVALID_PARAM);
1521 	}
1522 
1523 	/* Free the Tavor Memory Window handle */
1524 	tavor_rsrc_free(state, &rsrc);
1525 
1526 	/* Free up the MPT entry resource */
1527 	tavor_rsrc_free(state, &mpt);
1528 
1529 	/* Decrement the reference count on the protection domain (PD) */
1530 	tavor_pd_refcnt_dec(pd);
1531 
1532 	/* Set the mwhdl pointer to NULL and return success */
1533 	*mwhdl = NULL;
1534 
1535 	TAVOR_TNF_EXIT(tavor_mw_free);
1536 	return (DDI_SUCCESS);
1537 }
1538 
1539 
1540 /*
1541  * tavor_mr_keycalc()
1542  *    Context: Can be called from interrupt or base context.
1543  */
1544 void
1545 tavor_mr_keycalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
1546 {
1547 	uint32_t	tmp, log_num_mpt;
1548 
1549 	/*
1550 	 * Generate a simple key from counter.  Note:  We increment this
1551 	 * static variable _intentionally_ without any kind of mutex around
1552 	 * it.  First, single-threading all operations through a single lock
1553 	 * would be a bad idea (from a performance point-of-view).  Second,
1554 	 * the upper "unconstrained" bits don't really have to be unique
1555 	 * because the lower bits are guaranteed to be (although we do make a
1556 	 * best effort to ensure that they are).  Third, the window for the
1557 	 * race (where both threads read and update the counter at the same
1558 	 * time) is incredibly small.
1559 	 * And, lastly, we'd like to make this into a "random" key XXX
1560 	 */
1561 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_memkey_cnt))
1562 	log_num_mpt = state->ts_cfg_profile->cp_log_num_mpt;
1563 	tmp = (tavor_debug_memkey_cnt++) << log_num_mpt;
1564 	*key = tmp | indx;
1565 }
1566 
1567 
1568 /*
1569  * tavor_mr_common_reg()
1570  *    Context: Can be called from interrupt or base context.
1571  */
1572 static int
1573 tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
1574     tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
1575 {
1576 	tavor_rsrc_pool_info_t	*rsrc_pool;
1577 	tavor_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
1578 	tavor_umap_db_entry_t	*umapdb;
1579 	tavor_sw_refcnt_t	*swrc_tmp;
1580 	tavor_hw_mpt_t		mpt_entry;
1581 	tavor_mrhdl_t		mr;
1582 	ibt_mr_flags_t		flags;
1583 	tavor_bind_info_t	*bh;
1584 	ddi_dma_handle_t	bind_dmahdl;
1585 	ddi_umem_cookie_t	umem_cookie;
1586 	size_t			umem_len;
1587 	caddr_t			umem_addr;
1588 	uint64_t		mtt_addr, mtt_ddrbaseaddr, max_sz;
1589 	uint_t			sleep, mtt_pgsize_bits, bind_type, mr_is_umem;
1590 	int			status, umem_flags, bind_override_addr;
1591 	char			*errormsg;
1592 
1593 	TAVOR_TNF_ENTER(tavor_mr_common_reg);
1594 
1595 	/*
1596 	 * Check the "options" flag.  Currently this flag tells the driver
1597 	 * whether or not the region should be bound normally (i.e. with
1598 	 * entries written into the PCI IOMMU), whether it should be
1599 	 * registered to bypass the IOMMU, and whether or not the resulting
1600 	 * address should be "zero-based" (to aid the alignment restrictions
1601 	 * for QPs).
1602 	 */
1603 	if (op == NULL) {
1604 		bind_type   = TAVOR_BINDMEM_NORMAL;
1605 		bind_dmahdl = NULL;
1606 		bind_override_addr = 0;
1607 	} else {
1608 		bind_type	   = op->mro_bind_type;
1609 		bind_dmahdl	   = op->mro_bind_dmahdl;
1610 		bind_override_addr = op->mro_bind_override_addr;
1611 	}
1612 
1613 	/* Extract the flags field from the tavor_bind_info_t */
1614 	flags = bind->bi_flags;
1615 
1616 	/*
1617 	 * Check for invalid length.  Check is the length is zero or if the
1618 	 * length is larger than the maximum configured value.  Return error
1619 	 * if it is.
1620 	 */
1621 	max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
1622 	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
1623 		/* Set "status" and "errormsg" and goto failure */
1624 		TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
1625 		goto mrcommon_fail;
1626 	}
1627 
1628 	/*
1629 	 * Check the sleep flag.  Ensure that it is consistent with the
1630 	 * current thread context (i.e. if we are currently in the interrupt
1631 	 * context, then we shouldn't be attempting to sleep).
1632 	 */
1633 	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1634 	if ((sleep == TAVOR_SLEEP) &&
1635 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1636 		/* Set "status" and "errormsg" and goto failure */
1637 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1638 		goto mrcommon_fail;
1639 	}
1640 
1641 	/*
1642 	 * Get the base address for the MTT table.  This will be necessary
1643 	 * below when we are setting up the MPT entry.
1644 	 */
1645 	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
1646 	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
1647 
1648 	/* Increment the reference count on the protection domain (PD) */
1649 	tavor_pd_refcnt_inc(pd);
1650 
1651 	/*
1652 	 * Allocate an MPT entry.  This will be filled in with all the
1653 	 * necessary parameters to define the memory region.  And then
1654 	 * ownership will be passed to the hardware in the final step
1655 	 * below.  If we fail here, we must undo the protection domain
1656 	 * reference count.
1657 	 */
1658 	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
1659 	if (status != DDI_SUCCESS) {
1660 		/* Set "status" and "errormsg" and goto failure */
1661 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
1662 		goto mrcommon_fail1;
1663 	}
1664 
1665 	/*
1666 	 * Allocate the software structure for tracking the memory region (i.e.
1667 	 * the Tavor Memory Region handle).  If we fail here, we must undo
1668 	 * the protection domain reference count and the previous resource
1669 	 * allocation.
1670 	 */
1671 	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
1672 	if (status != DDI_SUCCESS) {
1673 		/* Set "status" and "errormsg" and goto failure */
1674 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
1675 		goto mrcommon_fail2;
1676 	}
1677 	mr = (tavor_mrhdl_t)rsrc->tr_addr;
1678 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
1679 
1680 	/*
1681 	 * Setup and validate the memory region access flags.  This means
1682 	 * translating the IBTF's enable flags into the access flags that
1683 	 * will be used in later operations.
1684 	 */
1685 	mr->mr_accflag = 0;
1686 	if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1687 		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
1688 	if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1689 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
1690 	if (flags & IBT_MR_ENABLE_REMOTE_READ)
1691 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
1692 	if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1693 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
1694 	if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1695 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
1696 
1697 	/*
1698 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
1699 	 * from a certain number of "constrained" bits (the least significant
1700 	 * bits) and some number of "unconstrained" bits.  The constrained
1701 	 * bits must be set to the index of the entry in the MPT table, but
1702 	 * the unconstrained bits can be set to any value we wish.  Note:
1703 	 * if no remote access is required, then the RKey value is not filled
1704 	 * in.  Otherwise both Rkey and LKey are given the same value.
1705 	 */
1706 	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
1707 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1708 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1709 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1710 		mr->mr_rkey = mr->mr_lkey;
1711 	}
1712 
1713 	/*
1714 	 * Determine if the memory is from userland and pin the pages
1715 	 * with umem_lockmemory() if necessary.
1716 	 * Then, if this is userland memory, allocate an entry in the
1717 	 * "userland resources database".  This will later be added to
1718 	 * the database (after all further memory registration operations are
1719 	 * successful).  If we fail here, we must undo the reference counts
1720 	 * and the previous resource allocations.
1721 	 */
1722 	mr_is_umem = (((bind->bi_as != NULL) && (bind->bi_as != &kas)) ? 1 : 0);
1723 	if (mr_is_umem) {
1724 		umem_len   = ptob(btopr(bind->bi_len +
1725 		    ((uintptr_t)bind->bi_addr & PAGEOFFSET)));
1726 		umem_addr  = (caddr_t)((uintptr_t)bind->bi_addr & ~PAGEOFFSET);
1727 		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
1728 		    DDI_UMEMLOCK_LONGTERM);
1729 		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
1730 		    &umem_cookie, &tavor_umem_cbops, curproc);
1731 		if (status != 0) {
1732 			/* Set "status" and "errormsg" and goto failure */
1733 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
1734 			goto mrcommon_fail3;
1735 		}
1736 
1737 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1738 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1739 
1740 		bind->bi_buf = ddi_umem_iosetup(umem_cookie, 0, umem_len,
1741 		    B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
1742 		if (bind->bi_buf == NULL) {
1743 			/* Set "status" and "errormsg" and goto failure */
1744 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed iosetup");
1745 			goto mrcommon_fail3;
1746 		}
1747 		bind->bi_type = TAVOR_BINDHDL_UBUF;
1748 		bind->bi_buf->b_flags |= B_READ;
1749 
1750 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1751 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1752 
1753 		umapdb = tavor_umap_db_alloc(state->ts_instance,
1754 		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
1755 		    (uint64_t)(uintptr_t)rsrc);
1756 		if (umapdb == NULL) {
1757 			/* Set "status" and "errormsg" and goto failure */
1758 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
1759 			goto mrcommon_fail4;
1760 		}
1761 	}
1762 
1763 	/*
1764 	 * Setup the bindinfo for the mtt bind call
1765 	 */
1766 	bh = &mr->mr_bindinfo;
1767 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bh))
1768 	bcopy(bind, bh, sizeof (tavor_bind_info_t));
1769 	bh->bi_bypass = bind_type;
1770 	status = tavor_mr_mtt_bind(state, bh, bind_dmahdl, &mtt,
1771 	    &mtt_pgsize_bits);
1772 	if (status != DDI_SUCCESS) {
1773 		/* Set "status" and "errormsg" and goto failure */
1774 		TAVOR_TNF_FAIL(status, "failed mtt bind");
1775 		goto mrcommon_fail5;
1776 	}
1777 	mr->mr_logmttpgsz = mtt_pgsize_bits;
1778 
1779 	/*
1780 	 * Allocate MTT reference count (to track shared memory regions).
1781 	 * This reference count resource may never be used on the given
1782 	 * memory region, but if it is ever later registered as "shared"
1783 	 * memory region then this resource will be necessary.  If we fail
1784 	 * here, we do pretty much the same as above to clean up.
1785 	 */
1786 	status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1, sleep,
1787 	    &mtt_refcnt);
1788 	if (status != DDI_SUCCESS) {
1789 		/* Set "status" and "errormsg" and goto failure */
1790 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed refence count");
1791 		goto mrcommon_fail6;
1792 	}
1793 	mr->mr_mttrefcntp = mtt_refcnt;
1794 	swrc_tmp = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
1795 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_tmp))
1796 	TAVOR_MTT_REFCNT_INIT(swrc_tmp);
1797 
1798 	/*
1799 	 * Fill in the MPT entry.  This is the final step before passing
1800 	 * ownership of the MPT entry to the Tavor hardware.  We use all of
1801 	 * the information collected/calculated above to fill in the
1802 	 * requisite portions of the MPT.
1803 	 */
1804 	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
1805 	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
1806 	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1807 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1808 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1809 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
1810 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1811 	mpt_entry.lr	  = 1;
1812 	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
1813 	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
1814 	mpt_entry.mem_key	= mr->mr_lkey;
1815 	mpt_entry.pd		= pd->pd_pdnum;
1816 	if (bind_override_addr == 0) {
1817 		mpt_entry.start_addr = bh->bi_addr;
1818 	} else {
1819 		bh->bi_addr = bh->bi_addr & ((1 << mr->mr_logmttpgsz) - 1);
1820 		mpt_entry.start_addr = bh->bi_addr;
1821 	}
1822 	mpt_entry.reg_win_len	= bh->bi_len;
1823 	mpt_entry.win_cnt_limit	= TAVOR_UNLIMITED_WIN_BIND;
1824 	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
1825 	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
1826 	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
1827 
1828 	/*
1829 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
1830 	 * the entry to the hardware.  Note: in general, this operation
1831 	 * shouldn't fail.  But if it does, we have to undo everything we've
1832 	 * done above before returning error.
1833 	 */
1834 	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1835 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
1836 	if (status != TAVOR_CMD_SUCCESS) {
1837 		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
1838 		    status);
1839 		TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
1840 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1841 		/* Set "status" and "errormsg" and goto failure */
1842 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
1843 		    "tavor SW2HW_MPT command");
1844 		goto mrcommon_fail7;
1845 	}
1846 
1847 	/*
1848 	 * Fill in the rest of the Tavor Memory Region handle.  Having
1849 	 * successfully transferred ownership of the MPT, we can update the
1850 	 * following fields for use in further operations on the MR.
1851 	 */
1852 	mr->mr_mptrsrcp	  = mpt;
1853 	mr->mr_mttrsrcp	  = mtt;
1854 	mr->mr_pdhdl	  = pd;
1855 	mr->mr_rsrcp	  = rsrc;
1856 	mr->mr_is_umem	  = mr_is_umem;
1857 	mr->mr_is_fmr	  = 0;
1858 	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
1859 	mr->mr_umem_cbfunc = NULL;
1860 	mr->mr_umem_cbarg1 = NULL;
1861 	mr->mr_umem_cbarg2 = NULL;
1862 
1863 	/*
1864 	 * If this is userland memory, then we need to insert the previously
1865 	 * allocated entry into the "userland resources database".  This will
1866 	 * allow for later coordination between the tavor_umap_umemlock_cb()
1867 	 * callback and tavor_mr_deregister().
1868 	 */
1869 	if (mr_is_umem) {
1870 		tavor_umap_db_add(umapdb);
1871 	}
1872 
1873 	*mrhdl = mr;
1874 
1875 	TAVOR_TNF_EXIT(tavor_mr_common_reg);
1876 	return (DDI_SUCCESS);
1877 
1878 /*
1879  * The following is cleanup for all possible failure cases in this routine
1880  */
1881 mrcommon_fail7:
1882 	tavor_rsrc_free(state, &mtt_refcnt);
1883 mrcommon_fail6:
1884 	tavor_rsrc_free(state, &mtt);
1885 	tavor_mr_mem_unbind(state, bh);
1886 mrcommon_fail5:
1887 	if (mr_is_umem) {
1888 		tavor_umap_db_free(umapdb);
1889 	}
1890 mrcommon_fail4:
1891 	if (mr_is_umem) {
1892 		/*
1893 		 * Free up the memory ddi_umem_iosetup() allocates
1894 		 * internally.
1895 		 */
1896 		if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
1897 			freerbuf(bind->bi_buf);
1898 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1899 			bind->bi_type = TAVOR_BINDHDL_NONE;
1900 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1901 		}
1902 		ddi_umem_unlock(umem_cookie);
1903 	}
1904 mrcommon_fail3:
1905 	tavor_rsrc_free(state, &rsrc);
1906 mrcommon_fail2:
1907 	tavor_rsrc_free(state, &mpt);
1908 mrcommon_fail1:
1909 	tavor_pd_refcnt_dec(pd);
1910 mrcommon_fail:
1911 	TNF_PROBE_1(tavor_mr_common_reg_fail, TAVOR_TNF_ERROR, "",
1912 	    tnf_string, msg, errormsg);
1913 	TAVOR_TNF_EXIT(tavor_mr_common_reg);
1914 	return (status);
1915 }
1916 
1917 /*
1918  * tavor_mr_mtt_bind()
1919  *    Context: Can be called from interrupt or base context.
1920  *
1921  *    Binds the memory described by "bind" for DMA, allocates enough MTT
1922  *    entries to map it, and writes the mapped addresses into those entries.
1923  *
1924  *    state           - driver state, passed through to the helper routines
1925  *    bind            - bind info; IBT_MR_NOSLEEP in bi_flags selects no-sleep
1926  *    bind_dmahdl     - DMA handle passed on to tavor_mr_mem_bind()
1927  *    mtt             - out: the MTT resource allocated for the mapping
1928  *    mtt_pgsize_bits - out: suggested page size (power-of-2) per MTT entry
1929  *
1930  *    Returns DDI_SUCCESS.  On failure, returns the status set by
1931  *    TAVOR_TNF_FAIL() after releasing whatever was bound/allocated here.
1932  */
1933 int
1934 tavor_mr_mtt_bind(tavor_state_t *state, tavor_bind_info_t *bind,
1935     ddi_dma_handle_t bind_dmahdl, tavor_rsrc_t **mtt, uint_t *mtt_pgsize_bits)
1936 {
1937 	uint64_t		nummtt;
1938 	uint_t			sleep;
1939 	int			status;
1940 	char			*errormsg;
1941 
1942 	TAVOR_TNF_ENTER(tavor_mr_mtt_bind);
1943 
1944 	/*
1945 	 * Check the sleep flag.  Ensure that it is consistent with the
1946 	 * current thread context (i.e. if we are currently in the interrupt
1947 	 * context, then we shouldn't be attempting to sleep).
1948 	 */
1949 	sleep = (bind->bi_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1950 	if ((sleep == TAVOR_SLEEP) &&
1951 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1952 		/* Set "status" and "errormsg" and goto failure */
1953 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1954 		goto mrmttbind_fail;
1955 	}
1956 
1957 	/*
1958 	 * Bind the memory and determine the mapped addresses (the DMA
1959 	 * cookies consumed by tavor_mr_fast_mtt_write() below).  Nothing has
1960 	 * been acquired before this point, so failure needs no cleanup.
1961 	 */
1962 	status = tavor_mr_mem_bind(state, bind, bind_dmahdl, sleep);
1963 	if (status != DDI_SUCCESS) {
1964 		/* Set "status" and "errormsg" and goto failure */
1965 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
1966 		goto mrmttbind_fail;
1967 	}
1968 
1969 	/*
1970 	 * Determine the number of MTT entries needed for the bound memory,
1971 	 * and the suggested page size - as a "power-of-2" - for each entry.
1972 	 */
1973 	nummtt = tavor_mr_nummtt_needed(state, bind, mtt_pgsize_bits);
1974 
1975 	/*
1976 	 * Allocate the MTT entries.  Note: MTT entries are allocated in
1977 	 * "MTT segments" which consist of complete cachelines (i.e. 8
1978 	 * entries, 16 entries, etc.), so TAVOR_NUMMTT_TO_MTTSEG() is used to
1979 	 * do the conversion.  If we fail here, we must unbind the memory.
1980 	 */
1981 	status = tavor_rsrc_alloc(state, TAVOR_MTT,
1982 	    TAVOR_NUMMTT_TO_MTTSEG(nummtt), sleep, mtt);
1983 	if (status != DDI_SUCCESS) {
1984 		/* Set "status" and "errormsg" and goto failure */
1985 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
1986 		goto mrmttbind_fail2;
1987 	}
1988 
1989 	/*
1990 	 * Write the mapped addresses into the MTT entries, using the page
1991 	 * size suggested above.  If we fail here, we must free the MTT
1992 	 * entries as well as unbind the memory.
1993 	 */
1994 	status = tavor_mr_fast_mtt_write(*mtt, bind, *mtt_pgsize_bits);
1995 	if (status != DDI_SUCCESS) {
1996 		/* Set "status" and "errormsg" and goto failure */
1997 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed write mtt");
1998 		goto mrmttbind_fail3;
1999 	}
2000 	TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
2001 	return (DDI_SUCCESS);
2002 
2003 /*
2004  * The following is cleanup for all possible failure cases in this routine
2005  */
2006 mrmttbind_fail3:
2007 	tavor_rsrc_free(state, mtt);
2008 mrmttbind_fail2:
2009 	tavor_mr_mem_unbind(state, bind);
2010 mrmttbind_fail:
2011 	TNF_PROBE_1(tavor_mr_mtt_bind_fail, TAVOR_TNF_ERROR, "",
2012 	    tnf_string, msg, errormsg);
2013 	TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
2014 	return (status);
2015 }
2016 
2017 
2018 /*
2019  * tavor_mr_mtt_unbind()
2020  *    Context: Can be called from interrupt or base context.
2021  *    Undoes tavor_mr_mtt_bind(): unbinds the memory described by "bind",
2022  *    then frees the MTT resource "mtt".  Always returns DDI_SUCCESS.
2023  */
2024 int
2025 tavor_mr_mtt_unbind(tavor_state_t *state, tavor_bind_info_t *bind,
2026     tavor_rsrc_t *mtt)
2027 {
2028 	tavor_rsrc_t	*mtt_rsrc = mtt;
2029 
2030 	TAVOR_TNF_ENTER(tavor_mr_mtt_unbind);
2031 
2032 	tavor_mr_mem_unbind(state, bind);
2033 	tavor_rsrc_free(state, &mtt_rsrc);
2034 
2035 	TAVOR_TNF_EXIT(tavor_mr_mtt_unbind);
2036 	return (DDI_SUCCESS);
2037 }
2038 
2039 
2040 /*
2041  * tavor_mr_common_rereg() - reregister "mr" in place, per bind->bi_flags
2042  *    Context: Can be called from interrupt or base context.
2043  */
2044 static int
2045 tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
2046     tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
2047     tavor_mr_options_t *op)
2048 {
2049 	tavor_rsrc_t		*mpt;
2050 	ibt_mr_attr_flags_t	acc_flags_to_use;
2051 	ibt_mr_flags_t		flags;
2052 	tavor_pdhdl_t		pd_to_use;
2053 	tavor_hw_mpt_t		mpt_entry;
2054 	uint64_t		mtt_addr_to_use, vaddr_to_use, len_to_use;
2055 	uint_t			sleep, dereg_level;
2056 	int			status;
2057 	char			*errormsg;
2058 
2059 	TAVOR_TNF_ENTER(tavor_mr_common_rereg);
2060 
2061 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2062 
2063 	/*
2064 	 * Reregistration of userland memory regions is not currently
2065 	 * supported, so fail such requests up front. XXX
2066 	 * NOTE(review): mr_is_umem is read here before mr_lock is taken.
2067 	 */
2068 	if (mr->mr_is_umem) {
2069 		/* Set "status" and "errormsg" and goto failure */
2070 		TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
2071 		goto mrrereg_fail;
2072 	}
2073 
2074 	mutex_enter(&mr->mr_lock);
2075 
2076 	/* Pull MPT resource pointer from the Tavor Memory Region handle */
2077 	mpt = mr->mr_mptrsrcp;
2078 
2079 	/* Extract the flags field from the tavor_bind_info_t */
2080 	flags = bind->bi_flags;
2081 
2082 	/*
2083 	 * Check the sleep flag.  Ensure that it is consistent with the
2084 	 * current thread context (i.e. if we are currently in the interrupt
2085 	 * context, then we shouldn't be attempting to sleep).
2086 	 */
2087 	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
2088 	if ((sleep == TAVOR_SLEEP) &&
2089 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
2090 		mutex_exit(&mr->mr_lock);
2091 		/* Set "status" and "errormsg" and goto failure */
2092 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
2093 		goto mrrereg_fail;
2094 	}
2095 
2096 	/*
2097 	 * First step is to temporarily invalidate the MPT entry.  This
2098 	 * regains ownership from the hardware, and gives us the opportunity
2099 	 * to modify the entry.  Note: The HW2SW_MPT command returns the
2100 	 * current MPT entry contents in "mpt_entry".  These are saved here
2101 	 * because they are reused when the entry is rewritten below.  If the
2102 	 * region has bound memory windows, then we fail with an "in use"
2103 	 * error code (without deregistering it).  Any other failure is
2104 	 * unexpected: we deregister the memory region and return error.
2105 	 *
2106 	 * We always use TAVOR_CMD_NOSLEEP_SPIN here because "mr_lock" is
2107 	 * held across this command regardless of the calling context.
2108 	 */
2109 	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, &mpt_entry,
2110 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
2111 	if (status != TAVOR_CMD_SUCCESS) {
2112 		mutex_exit(&mr->mr_lock);
2113 		if (status == TAVOR_CMD_REG_BOUND) {
2114 			TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2115 			return (IBT_MR_IN_USE);
2116 		} else {
2117 			cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: "
2118 			    "%08x\n", status);
2119 
2120 			/*
2121 			 * Call deregister and ensure that all current
2122 			 * resources get freed up
2123 			 */
2124 			if (tavor_mr_deregister(state, &mr,
2125 			    TAVOR_MR_DEREG_ALL, sleep) != DDI_SUCCESS) {
2126 				TAVOR_WARNING(state, "failed to deregister "
2127 				    "memory region");
2128 			}
2129 			TNF_PROBE_1(tavor_mr_common_rereg_hw2sw_mpt_cmd_fail,
2130 			    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
2131 			TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2132 			return (ibc_get_ci_failure(0));
2133 		}
2134 	}
2135 
2136 	/*
2137 	 * If we're changing the protection domain, then validate the new one
2138 	 */
2139 	if (flags & IBT_MR_CHANGE_PD) {
2140 
2141 		/* Check for valid PD handle pointer ("pd" is unused otherwise) */
2142 		if (pd == NULL) {
2143 			mutex_exit(&mr->mr_lock);
2144 			/*
2145 			 * Call deregister and ensure that all current
2146 			 * resources get properly freed up. Unnecessary
2147 			 * here to attempt to regain software ownership
2148 			 * of the MPT entry as that has already been
2149 			 * done above.
2150 			 */
2151 			if (tavor_mr_deregister(state, &mr,
2152 			    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
2153 			    DDI_SUCCESS) {
2154 				TAVOR_WARNING(state, "failed to deregister "
2155 				    "memory region");
2156 			}
2157 			/* Set "status" and "errormsg" and goto failure */
2158 			TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
2159 			goto mrrereg_fail;
2160 		}
2161 
2162 		/* Use the new PD handle in all operations below */
2163 		pd_to_use = pd;
2164 
2165 	} else {
2166 		/* Use the current PD handle in all operations below */
2167 		pd_to_use = mr->mr_pdhdl;
2168 	}
2169 
2170 	/*
2171 	 * If we're changing access permissions, then validate the new ones
2172 	 */
2173 	if (flags & IBT_MR_CHANGE_ACCESS) {
2174 		/*
2175 		 * Validate the access flags.  Both remote write and remote
2176 		 * atomic require the local write flag to be set
2177 		 */
2178 		if (((flags & IBT_MR_ENABLE_REMOTE_WRITE) ||
2179 		    (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)) &&
2180 		    !(flags & IBT_MR_ENABLE_LOCAL_WRITE)) {
2181 			mutex_exit(&mr->mr_lock);
2182 			/*
2183 			 * Call deregister and ensure that all current
2184 			 * resources get properly freed up. Unnecessary
2185 			 * here to attempt to regain software ownership
2186 			 * of the MPT entry as that has already been
2187 			 * done above.
2188 			 */
2189 			if (tavor_mr_deregister(state, &mr,
2190 			    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
2191 			    DDI_SUCCESS) {
2192 				TAVOR_WARNING(state, "failed to deregister "
2193 				    "memory region");
2194 			}
2195 			/* Set "status" and "errormsg" and goto failure */
2196 			TAVOR_TNF_FAIL(IBT_MR_ACCESS_REQ_INVALID,
2197 			    "invalid access flags");
2198 			goto mrrereg_fail;
2199 		}
2200 
2201 		/*
2202 		 * Setup and validate the memory region access flags.  This
2203 		 * means translating the IBTF's enable flags into the access
2204 		 * flags that will be used in later operations.
2205 		 */
2206 		acc_flags_to_use = 0;
2207 		if (flags & IBT_MR_ENABLE_WINDOW_BIND)
2208 			acc_flags_to_use |= IBT_MR_WINDOW_BIND;
2209 		if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
2210 			acc_flags_to_use |= IBT_MR_LOCAL_WRITE;
2211 		if (flags & IBT_MR_ENABLE_REMOTE_READ)
2212 			acc_flags_to_use |= IBT_MR_REMOTE_READ;
2213 		if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
2214 			acc_flags_to_use |= IBT_MR_REMOTE_WRITE;
2215 		if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
2216 			acc_flags_to_use |= IBT_MR_REMOTE_ATOMIC;
2217 
2218 	} else {
2219 		acc_flags_to_use = mr->mr_accflag;
2220 	}
2221 
2222 	/*
2223 	 * If we're modifying the translation, then figure out whether
2224 	 * we can reuse the current MTT resources.  This means calling
2225 	 * tavor_mr_rereg_xlat_helper() which does most of the heavy lifting
2226 	 * for the reregistration.  If the current memory region contains
2227 	 * sufficient MTT entries for the new regions, then it will be
2228 	 * reused and filled in.  Otherwise, new entries will be allocated,
2229 	 * the old ones will be freed, and the new entries will be filled
2230 	 * in.  Note:  If we're not modifying the translation, then we
2231 	 * should already have all the information we need to update the MPT.
2232 	 * Also note: If tavor_mr_rereg_xlat_helper() fails, it will return
2233 	 * a "dereg_level" which is the level of cleanup that needs to be
2234 	 * passed to tavor_mr_deregister() to finish the cleanup.
2235 	 */
2236 	if (flags & IBT_MR_CHANGE_TRANSLATION) {
2237 		status = tavor_mr_rereg_xlat_helper(state, mr, bind, op,
2238 		    &mtt_addr_to_use, sleep, &dereg_level);
2239 		if (status != DDI_SUCCESS) {
2240 			mutex_exit(&mr->mr_lock);
2241 			/*
2242 			 * Call deregister and ensure that all resources get
2243 			 * properly freed up.
2244 			 */
2245 			if (tavor_mr_deregister(state, &mr, dereg_level,
2246 			    sleep) != DDI_SUCCESS) {
2247 				TAVOR_WARNING(state, "failed to deregister "
2248 				    "memory region");
2249 			}
2250 
2251 			/* Set "status" and "errormsg" and goto failure */
2252 			TAVOR_TNF_FAIL(status, "failed rereg helper");
2253 			goto mrrereg_fail;
2254 		}
2255 		vaddr_to_use = mr->mr_bindinfo.bi_addr;
2256 		len_to_use   = mr->mr_bindinfo.bi_len;
2257 	} else {
2258 		mtt_addr_to_use = (((uint64_t)mpt_entry.mttseg_addr_h << 32) |
2259 		    ((uint64_t)mpt_entry.mttseg_addr_l << 6));
2260 		vaddr_to_use = mr->mr_bindinfo.bi_addr;
2261 		len_to_use   = mr->mr_bindinfo.bi_len;
2262 	}
2263 
2264 	/*
2265 	 * Calculate new keys (Lkey, Rkey) from MPT index.  Just like they were
2266 	 * when the region was first registered, each key is formed from
2267 	 * "constrained" bits and "unconstrained" bits.  Note:  If no remote
2268 	 * access is required, then mr_rkey is not updated (it keeps whatever
2269 	 * value it had).  Otherwise both Rkey and LKey get the same value.
2270 	 */
2271 	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
2272 	if ((acc_flags_to_use & IBT_MR_REMOTE_READ) ||
2273 	    (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ||
2274 	    (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC)) {
2275 		mr->mr_rkey = mr->mr_lkey;
2276 	}
2277 
2278 	/*
2279 	 * Update the MPT entry with the new information.  Fields not assigned
2280 	 * here keep the contents returned by the HW2SW_MPT command above; the
2281 	 * rest are new based on the request.
2282 	 */
2283 	mpt_entry.en_bind = (acc_flags_to_use & IBT_MR_WINDOW_BIND)   ? 1 : 0;
2284 	mpt_entry.atomic  = (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
2285 	mpt_entry.rw	  = (acc_flags_to_use & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
2286 	mpt_entry.rr	  = (acc_flags_to_use & IBT_MR_REMOTE_READ)   ? 1 : 0;
2287 	mpt_entry.lw	  = (acc_flags_to_use & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
2288 	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
2289 	mpt_entry.mem_key	= mr->mr_lkey;
2290 	mpt_entry.pd		= pd_to_use->pd_pdnum;
2291 	mpt_entry.start_addr	= vaddr_to_use;
2292 	mpt_entry.reg_win_len	= len_to_use;
2293 	mpt_entry.mttseg_addr_h = mtt_addr_to_use >> 32;
2294 	mpt_entry.mttseg_addr_l = mtt_addr_to_use >> 6;
2295 
2296 	/*
2297 	 * Write the updated MPT entry to hardware
2298 	 *
2299 	 * We always use TAVOR_CMD_NOSLEEP_SPIN here because "mr_lock" is
2300 	 * held across this command regardless of the calling context.
2301 	 */
2302 	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
2303 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
2304 	if (status != TAVOR_CMD_SUCCESS) {
2305 		mutex_exit(&mr->mr_lock);
2306 		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
2307 		    status);
2308 		/*
2309 		 * Call deregister and ensure that all current resources get
2310 		 * properly freed up. Unnecessary here to attempt to regain
2311 		 * software ownership of the MPT entry as that has already
2312 		 * been done above.
2313 		 */
2314 		if (tavor_mr_deregister(state, &mr,
2315 		    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) != DDI_SUCCESS) {
2316 			TAVOR_WARNING(state, "failed to deregister memory "
2317 			    "region");
2318 		}
2319 		TNF_PROBE_1(tavor_mr_common_rereg_sw2hw_mpt_cmd_fail,
2320 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
2321 		TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2322 		return (ibc_get_ci_failure(0));
2323 	}
2324 
2325 	/*
2326 	 * If we're changing PD, then update their reference counts now.
2327 	 * This means decrementing the reference count on the old PD and
2328 	 * incrementing the reference count on the new PD.
2329 	 */
2330 	if (flags & IBT_MR_CHANGE_PD) {
2331 		tavor_pd_refcnt_dec(mr->mr_pdhdl);
2332 		tavor_pd_refcnt_inc(pd);
2333 	}
2334 
2335 	/*
2336 	 * Update the contents of the Tavor Memory Region handle to reflect
2337 	 * what has been changed.
2338 	 */
2339 	mr->mr_pdhdl	  = pd_to_use;
2340 	mr->mr_accflag	  = acc_flags_to_use;
2341 	mr->mr_is_umem	  = 0;
2342 	mr->mr_is_fmr	  = 0;
2343 	mr->mr_umemcookie = NULL;
2344 
2345 	/* New MR handle is same as the old */
2346 	*mrhdl_new = mr;
2347 	mutex_exit(&mr->mr_lock);
2348 
2349 	TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2350 	return (DDI_SUCCESS);
2351 
2352 mrrereg_fail:
2353 	TNF_PROBE_1(tavor_mr_common_rereg_fail, TAVOR_TNF_ERROR, "",
2354 	    tnf_string, msg, errormsg);
2355 	TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2356 	return (status);
2357 }
2358 
2359 
2360 /*
2361  * tavor_mr_rereg_xlat_helper
2362  *    Context: Can be called from interrupt or base context.
2363  *    Note: This routine expects the "mr_lock" to be held when it
2364  *    is called.  Upon returning failure, this routine passes information
2365  *    about what "dereg_level" should be passed to tavor_mr_deregister().
2366  */
2367 static int
2368 tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
2369     tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
2370     uint_t sleep, uint_t *dereg_level)
2371 {
2372 	tavor_rsrc_pool_info_t	*rsrc_pool;
2373 	tavor_rsrc_t		*mtt, *mtt_refcnt;
2374 	tavor_sw_refcnt_t	*swrc_old, *swrc_new;
2375 	ddi_dma_handle_t	dmahdl;
2376 	uint64_t		nummtt_needed, nummtt_in_currrsrc, max_sz;
2377 	uint64_t		mtt_ddrbaseaddr;
2378 	uint_t			mtt_pgsize_bits, bind_type, reuse_dmahdl;
2379 	int			status;
2380 	char			*errormsg;
2381 
2382 	TAVOR_TNF_ENTER(tavor_mr_rereg_xlat_helper);
2383 
2384 	ASSERT(MUTEX_HELD(&mr->mr_lock));
2385 
2386 	/*
2387 	 * Check the "options" flag.  Currently this flag tells the driver
2388 	 * whether or not the region should be bound normally (i.e. with
2389 	 * entries written into the PCI IOMMU) or whether it should be
2390 	 * registered to bypass the IOMMU.
2391 	 */
2392 	if (op == NULL) {
2393 		bind_type = TAVOR_BINDMEM_NORMAL;
2394 	} else {
2395 		bind_type = op->mro_bind_type;
2396 	}
2397 
2398 	/*
2399 	 * Check for invalid length.  Check if the length is zero or if the
2400 	 * length is larger than the maximum configured value.  Return error
2401 	 * if it is.
2402 	 */
2403 	max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
2404 	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
2405 		/*
2406 		 * Deregister will be called upon returning failure from this
2407 		 * routine. This will ensure that all current resources get
2408 		 * properly freed up. Unnecessary to attempt to regain
2409 		 * software ownership of the MPT entry as that has already
2410 		 * been done above (in tavor_mr_reregister())
2411 		 */
2412 		*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT;
2413 
2414 		/* Set "status" and "errormsg" and goto failure */
2415 		TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
2416 		goto mrrereghelp_fail;
2417 	}
2418 
2419 	/*
2420 	 * Determine the number of pages necessary for new region and the
2421 	 * number of pages supported by the current MTT resources
2422 	 */
2423 	nummtt_needed = tavor_mr_nummtt_needed(state, bind, &mtt_pgsize_bits);
2424 	nummtt_in_currrsrc = mr->mr_mttrsrcp->tr_len >> TAVOR_MTT_SIZE_SHIFT;
2425 
2426 	/*
2427 	 * Depending on whether we have enough pages or not, the next step is
2428 	 * to fill in a set of MTT entries that reflect the new mapping.  In
2429 	 * the first case below, we already have enough entries.  This means
2430 	 * we need to unbind the memory from the previous mapping, bind the
2431 	 * memory for the new mapping, write the new MTT entries, and update
2432 	 * the mr to reflect the changes.
2433 	 * In the second case below, we do not have enough entries in the
2434 	 * current mapping.  So, in this case, we need not only to unbind the
2435 	 * current mapping, but we need to free up the MTT resources associated
2436 	 * with that mapping.  After we've successfully done that, we continue
2437 	 * by binding the new memory, allocating new MTT entries, writing the
2438 	 * new MTT entries, and updating the mr to reflect the changes.
2439 	 */
2440 
2441 	/*
2442 	 * If this region is being shared (i.e. MTT refcount != 1), then we
2443 	 * can't reuse the current MTT resources regardless of their size.
2444 	 * Instead we'll need to alloc new ones (below) just as if there
2445 	 * hadn't been enough room in the current entries.
2446 	 */
2447 	swrc_old = (tavor_sw_refcnt_t *)mr->mr_mttrefcntp->tr_addr;
2448 	if (TAVOR_MTT_IS_NOT_SHARED(swrc_old) &&
2449 	    (nummtt_needed <= nummtt_in_currrsrc)) {
2450 
2451 		/*
2452 		 * Unbind the old mapping for this memory region, but retain
2453 		 * the ddi_dma_handle_t (if possible) for reuse in the bind
2454 		 * operation below.  Note:  If original memory region was
2455 		 * bound for IOMMU bypass and the new region can not use
2456 		 * bypass, then a new DMA handle will be necessary.
2457 		 */
2458 		if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2459 			mr->mr_bindinfo.bi_free_dmahdl = 0;
2460 			tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2461 			dmahdl = mr->mr_bindinfo.bi_dmahdl;
2462 			reuse_dmahdl = 1;
2463 		} else {
2464 			tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2465 			dmahdl = NULL;
2466 			reuse_dmahdl = 0;
2467 		}
2468 
2469 		/*
2470 		 * Bind the new memory and determine the mapped addresses.
2471 		 * As described, this routine and tavor_mr_fast_mtt_write()
2472 		 * do the majority of the work for the memory registration
2473 		 * operations.  Note:  When we successfully finish the binding,
2474 		 * we will set the "bi_free_dmahdl" flag to indicate that
2475 		 * even though we may have reused the ddi_dma_handle_t we do
2476 		 * wish it to be freed up at some later time.  Note also that
2477 		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2478 		 */
2479 		bind->bi_bypass	= bind_type;
2480 		status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
2481 		if (status != DDI_SUCCESS) {
2482 			if (reuse_dmahdl) {
2483 				ddi_dma_free_handle(&dmahdl);
2484 			}
2485 
2486 			/*
2487 			 * Deregister will be called upon returning failure
2488 			 * from this routine. This will ensure that all
2489 			 * current resources get properly freed up.
2490 			 * Unnecessary to attempt to regain software ownership
2491 			 * of the MPT entry as that has already been done
2492 			 * above (in tavor_mr_reregister()).  Also unnecessary
2493 			 * to attempt to unbind the memory.
2494 			 */
2495 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2496 
2497 			/* Set "status" and "errormsg" and goto failure */
2498 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
2499 			goto mrrereghelp_fail;
2500 		}
2501 		if (reuse_dmahdl) {
2502 			bind->bi_free_dmahdl = 1;
2503 		}
2504 
2505 		/*
2506 		 * Using the new mapping, but reusing the current MTT
2507 		 * resources, write the updated entries to MTT
2508 		 */
2509 		mtt    = mr->mr_mttrsrcp;
2510 		status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
2511 		if (status != DDI_SUCCESS) {
2512 			/*
2513 			 * Deregister will be called upon returning failure
2514 			 * from this routine. This will ensure that all
2515 			 * current resources get properly freed up.
2516 			 * Unnecessary to attempt to regain software ownership
2517 			 * of the MPT entry as that has already been done
2518 			 * above (in tavor_mr_reregister()).  Also unnecessary
2519 			 * to attempt to unbind the memory.
2520 			 *
2521 			 * But we do need to unbind the newly bound memory
2522 			 * before returning.
2523 			 */
2524 			tavor_mr_mem_unbind(state, bind);
2525 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2526 
2527 			/* Set "status" and "errormsg" and goto failure */
2528 			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
2529 			    "failed write mtt");
2530 			goto mrrereghelp_fail;
2531 		}
2532 
2533 		/* Put the updated information into the Mem Region handle */
2534 		mr->mr_bindinfo	  = *bind;
2535 		mr->mr_logmttpgsz = mtt_pgsize_bits;
2536 
2537 	} else {
2538 		/*
2539 		 * Check if the memory region MTT is shared by any other MRs.
2540 		 * Since the resource may be shared between multiple memory
2541 		 * regions (as a result of a "RegisterSharedMR()" verb) it is
2542 		 * important that we not unbind any resources prematurely.
2543 		 */
2544 		if (!TAVOR_MTT_IS_SHARED(swrc_old)) {
2545 			/*
2546 			 * Unbind the old mapping for this memory region, but
2547 			 * retain the ddi_dma_handle_t for reuse in the bind
2548 			 * operation below. Note: This can only be done here
2549 			 * because the region being reregistered is not
2550 			 * currently shared.  Also if original memory region
2551 			 * was bound for IOMMU bypass and the new region can
2552 			 * not use bypass, then a new DMA handle will be
2553 			 * necessary.
2554 			 */
2555 			if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2556 				mr->mr_bindinfo.bi_free_dmahdl = 0;
2557 				tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2558 				dmahdl = mr->mr_bindinfo.bi_dmahdl;
2559 				reuse_dmahdl = 1;
2560 			} else {
2561 				tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2562 				dmahdl = NULL;
2563 				reuse_dmahdl = 0;
2564 			}
2565 		} else {
2566 			dmahdl = NULL;
2567 			reuse_dmahdl = 0;
2568 		}
2569 
2570 		/*
2571 		 * Bind the new memory and determine the mapped addresses.
2572 		 * As described, this routine and tavor_mr_fast_mtt_write()
2573 		 * do the majority of the work for the memory registration
2574 		 * operations.  Note:  When we successfully finish the binding,
2575 		 * we will set the "bi_free_dmahdl" flag to indicate that
2576 		 * even though we may have reused the ddi_dma_handle_t we do
2577 		 * wish it to be freed up at some later time.  Note also that
2578 		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2579 		 */
2580 		bind->bi_bypass	= bind_type;
2581 		status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
2582 		if (status != DDI_SUCCESS) {
2583 			if (reuse_dmahdl) {
2584 				ddi_dma_free_handle(&dmahdl);
2585 			}
2586 
2587 			/*
2588 			 * Deregister will be called upon returning failure
2589 			 * from this routine. This will ensure that all
2590 			 * current resources get properly freed up.
2591 			 * Unnecessary to attempt to regain software ownership
2592 			 * of the MPT entry as that has already been done
2593 			 * above (in tavor_mr_reregister()).  Also unnecessary
2594 			 * to attempt to unbind the memory.
2595 			 */
2596 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2597 
2598 			/* Set "status" and "errormsg" and goto failure */
2599 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
2600 			goto mrrereghelp_fail;
2601 		}
2602 		if (reuse_dmahdl) {
2603 			bind->bi_free_dmahdl = 1;
2604 		}
2605 
2606 		/*
2607 		 * Allocate the new MTT entries resource
2608 		 */
2609 		status = tavor_rsrc_alloc(state, TAVOR_MTT,
2610 		    TAVOR_NUMMTT_TO_MTTSEG(nummtt_needed), sleep, &mtt);
2611 		if (status != DDI_SUCCESS) {
2612 			/*
2613 			 * Deregister will be called upon returning failure
2614 			 * from this routine. This will ensure that all
2615 			 * current resources get properly freed up.
2616 			 * Unnecessary to attempt to regain software ownership
2617 			 * of the MPT entry as that has already been done
2618 			 * above (in tavor_mr_reregister()).  Also unnecessary
2619 			 * to attempt to unbind the memory.
2620 			 *
2621 			 * But we do need to unbind the newly bound memory
2622 			 * before returning.
2623 			 */
2624 			tavor_mr_mem_unbind(state, bind);
2625 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2626 
2627 			/* Set "status" and "errormsg" and goto failure */
2628 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
2629 			goto mrrereghelp_fail;
2630 		}
2631 
2632 		/*
2633 		 * Allocate MTT reference count (to track shared memory
2634 		 * regions).  As mentioned elsewhere above, this reference
2635 		 * count resource may never be used on the given memory region,
2636 		 * but if it is ever later registered as a "shared" memory
2637 		 * region then this resource will be necessary.  Note:  This
2638 		 * is only necessary here if the existing memory region is
2639 		 * already being shared (because otherwise we already have
2640 		 * a useable reference count resource).
2641 		 */
2642 		if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2643 			status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1,
2644 			    sleep, &mtt_refcnt);
2645 			if (status != DDI_SUCCESS) {
2646 				/*
2647 				 * Deregister will be called upon returning
2648 				 * failure from this routine. This will ensure
2649 				 * that all current resources get properly
2650 				 * freed up.  Unnecessary to attempt to regain
2651 				 * software ownership of the MPT entry as that
2652 				 * has already been done above (in
2653 				 * tavor_mr_reregister()).  Also unnecessary
2654 				 * to attempt to unbind the memory.
2655 				 *
2656 				 * But we need to unbind the newly bound
2657 				 * memory and free up the newly allocated MTT
2658 				 * entries before returning.
2659 				 */
2660 				tavor_mr_mem_unbind(state, bind);
2661 				tavor_rsrc_free(state, &mtt);
2662 				*dereg_level =
2663 				    TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2664 
2665 				/* Set "status"/"errormsg", goto failure */
2666 				TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
2667 				    "failed reference count");
2668 				goto mrrereghelp_fail;
2669 			}
2670 			swrc_new = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
2671 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_new))
2672 			TAVOR_MTT_REFCNT_INIT(swrc_new);
2673 		} else {
2674 			mtt_refcnt = mr->mr_mttrefcntp;
2675 		}
2676 
2677 		/*
2678 		 * Using the new mapping and the new MTT resources, write the
2679 		 * updated entries to MTT
2680 		 */
2681 		status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
2682 		if (status != DDI_SUCCESS) {
2683 			/*
2684 			 * Deregister will be called upon returning failure
2685 			 * from this routine. This will ensure that all
2686 			 * current resources get properly freed up.
2687 			 * Unnecessary to attempt to regain software ownership
2688 			 * of the MPT entry as that has already been done
2689 			 * above (in tavor_mr_reregister()).  Also unnecessary
2690 			 * to attempt to unbind the memory.
2691 			 *
2692 			 * But we need to unbind the newly bound memory,
2693 			 * free up the newly allocated MTT entries, and
2694 			 * (possibly) free the new MTT reference count
2695 			 * resource before returning.
2696 			 */
2697 			if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2698 				tavor_rsrc_free(state, &mtt_refcnt);
2699 			}
2700 			tavor_mr_mem_unbind(state, bind);
2701 			tavor_rsrc_free(state, &mtt);
2702 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2703 
2704 			/* Set "status" and "errormsg" and goto failure */
2705 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed write mtt");
2706 			goto mrrereghelp_fail;
2707 		}
2708 
2709 		/*
2710 		 * Check if the memory region MTT is shared by any other MRs.
2711 		 * Since the resource may be shared between multiple memory
2712 		 * regions (as a result of a "RegisterSharedMR()" verb) it is
2713 		 * important that we not free up any resources prematurely.
2714 		 */
2715 		if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2716 			/* Decrement MTT reference count for "old" region */
2717 			(void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
2718 		} else {
2719 			/* Free up the old MTT entries resource */
2720 			tavor_rsrc_free(state, &mr->mr_mttrsrcp);
2721 		}
2722 
2723 		/* Put the updated information into the mrhdl */
2724 		mr->mr_bindinfo	  = *bind;
2725 		mr->mr_logmttpgsz = mtt_pgsize_bits;
2726 		mr->mr_mttrsrcp   = mtt;
2727 		mr->mr_mttrefcntp = mtt_refcnt;
2728 	}
2729 
2730 	/*
2731 	 * Calculate and return the updated MTT address (in the DDR address
2732 	 * space).  This will be used by the caller (tavor_mr_reregister) in
2733 	 * the updated MPT entry
2734 	 */
2735 	rsrc_pool	= &state->ts_rsrc_hdl[TAVOR_MTT];
2736 	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
2737 	*mtt_addr	= mtt_ddrbaseaddr + (mtt->tr_indx <<
2738 	    TAVOR_MTT_SIZE_SHIFT);
2739 
2740 	TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
2741 	return (DDI_SUCCESS);
2742 
2743 mrrereghelp_fail:
2744 	TNF_PROBE_1(tavor_mr_rereg_xlat_helper_fail, TAVOR_TNF_ERROR, "",
2745 	    tnf_string, msg, errormsg);
2746 	TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
2747 	return (status);
2748 }
2749 
2750 
2751 /*
2752  * tavor_mr_nummtt_needed()
2753  *    Context: Can be called from interrupt or base context.
2754  */
2755 /* ARGSUSED */
2756 static uint64_t
2757 tavor_mr_nummtt_needed(tavor_state_t *state, tavor_bind_info_t *bind,
2758     uint_t *mtt_pgsize_bits)
2759 {
2760 	uint64_t	pg_offset_mask;
2761 	uint64_t	pg_offset, tmp_length;
2762 
2763 	/*
2764 	 * For now we specify the page size as 8Kb (the default page size for
2765 	 * the sun4u architecture), or 4Kb for x86.  Figure out optimal page
2766 	 * size by examining the dmacookies XXX
2767 	 */
2768 	*mtt_pgsize_bits = PAGESHIFT;
2769 
2770 	pg_offset_mask = ((uint64_t)1 << *mtt_pgsize_bits) - 1;
2771 	pg_offset = bind->bi_addr & pg_offset_mask;
2772 	tmp_length = pg_offset + (bind->bi_len - 1);
2773 	return ((tmp_length >> *mtt_pgsize_bits) + 1);
2774 }
2775 
2776 
2777 /*
2778  * tavor_mr_mem_bind()
2779  *    Context: Can be called from interrupt or base context.
2780  */
2781 static int
2782 tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
2783     ddi_dma_handle_t dmahdl, uint_t sleep)
2784 {
2785 	ddi_dma_attr_t	dma_attr;
2786 	int		(*callback)(caddr_t);
2787 	uint_t		dma_xfer_mode;
2788 	int		status;
2789 
2790 	/* bi_type must be set to a meaningful value to get a bind handle */
2791 	ASSERT(bind->bi_type == TAVOR_BINDHDL_VADDR ||
2792 	    bind->bi_type == TAVOR_BINDHDL_BUF ||
2793 	    bind->bi_type == TAVOR_BINDHDL_UBUF);
2794 
2795 	TAVOR_TNF_ENTER(tavor_mr_mem_bind);
2796 
2797 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2798 
2799 	/* Set the callback flag appropriately */
2800 	callback = (sleep == TAVOR_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT;
2801 
2802 	/* Determine whether to map STREAMING or CONSISTENT */
2803 	dma_xfer_mode = (bind->bi_flags & IBT_MR_NONCOHERENT) ?
2804 	    DDI_DMA_STREAMING : DDI_DMA_CONSISTENT;
2805 
2806 	/*
2807 	 * Initialize many of the default DMA attributes.  Then, if we're
2808 	 * bypassing the IOMMU, set the DDI_DMA_FORCE_PHYSICAL flag.
2809 	 */
2810 	if (dmahdl == NULL) {
2811 		tavor_dma_attr_init(&dma_attr);
2812 #ifdef	__sparc
2813 		/*
2814 		 * First, disable streaming and switch to consistent if
2815 		 * configured to do so and IOMMU BYPASS is enabled.
2816 		 */
2817 		if (state->ts_cfg_profile->cp_disable_streaming_on_bypass &&
2818 		    dma_xfer_mode == DDI_DMA_STREAMING &&
2819 		    bind->bi_bypass == TAVOR_BINDMEM_BYPASS) {
2820 			dma_xfer_mode = DDI_DMA_CONSISTENT;
2821 		}
2822 
2823 		/*
2824 		 * Then, if streaming is still specified, then "bypass" is not
2825 		 * allowed.
2826 		 */
2827 		if ((dma_xfer_mode == DDI_DMA_CONSISTENT) &&
2828 		    (bind->bi_bypass == TAVOR_BINDMEM_BYPASS)) {
2829 			dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2830 		}
2831 #endif
2832 		/* Allocate a DMA handle for the binding */
2833 		status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr,
2834 		    callback, NULL, &bind->bi_dmahdl);
2835 		if (status != DDI_SUCCESS) {
2836 			TNF_PROBE_0(tavor_mr_mem_bind_dmahdl_fail,
2837 			    TAVOR_TNF_ERROR, "");
2838 			TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2839 			return (status);
2840 		}
2841 		bind->bi_free_dmahdl = 1;
2842 
2843 	} else  {
2844 		bind->bi_dmahdl = dmahdl;
2845 		bind->bi_free_dmahdl = 0;
2846 	}
2847 
2848 	/*
2849 	 * Bind the memory to get the PCI mapped addresses.  The decision
2850 	 * to call ddi_dma_addr_bind_handle() or ddi_dma_buf_bind_handle()
2851 	 * is determined by the "bi_type" flag.  Note: if the bind operation
2852 	 * fails then we have to free up the DMA handle and return error.
2853 	 */
2854 	if (bind->bi_type == TAVOR_BINDHDL_VADDR) {
2855 		status = ddi_dma_addr_bind_handle(bind->bi_dmahdl, NULL,
2856 		    (caddr_t)(uintptr_t)bind->bi_addr, bind->bi_len,
2857 		    (DDI_DMA_RDWR | dma_xfer_mode), callback, NULL,
2858 		    &bind->bi_dmacookie, &bind->bi_cookiecnt);
2859 	} else { /* TAVOR_BINDHDL_BUF || TAVOR_BINDHDL_UBUF */
2860 		status = ddi_dma_buf_bind_handle(bind->bi_dmahdl,
2861 		    bind->bi_buf, (DDI_DMA_RDWR | dma_xfer_mode), callback,
2862 		    NULL, &bind->bi_dmacookie, &bind->bi_cookiecnt);
2863 	}
2864 
2865 	if (status != DDI_DMA_MAPPED) {
2866 		if (bind->bi_free_dmahdl != 0) {
2867 			ddi_dma_free_handle(&bind->bi_dmahdl);
2868 		}
2869 		TNF_PROBE_0(tavor_mr_mem_bind_dmabind_fail, TAVOR_TNF_ERROR,
2870 		    "");
2871 		TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2872 		return (status);
2873 	}
2874 
2875 	TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2876 	return (DDI_SUCCESS);
2877 }
2878 
2879 
2880 /*
2881  * tavor_mr_mem_unbind()
2882  *    Context: Can be called from interrupt or base context.
2883  */
2884 static void
2885 tavor_mr_mem_unbind(tavor_state_t *state, tavor_bind_info_t *bind)
2886 {
2887 	int	status;
2888 
2889 	TAVOR_TNF_ENTER(tavor_mr_mem_unbind);
2890 
2891 	/*
2892 	 * In case of TAVOR_BINDHDL_UBUF, the memory bi_buf points to
2893 	 * is actually allocated by ddi_umem_iosetup() internally, then
2894 	 * it's required to free it here. Reset bi_type to TAVOR_BINDHDL_NONE
2895 	 * not to free it again later.
2896 	 */
2897 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2898 	if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
2899 		freerbuf(bind->bi_buf);
2900 		bind->bi_type = TAVOR_BINDHDL_NONE;
2901 	}
2902 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
2903 
2904 	/*
2905 	 * Unbind the DMA memory for the region
2906 	 *
2907 	 * Note: The only way ddi_dma_unbind_handle() currently
2908 	 * can return an error is if the handle passed in is invalid.
2909 	 * Since this should never happen, we choose to return void
2910 	 * from this function!  If this does return an error, however,
2911 	 * then we print a warning message to the console.
2912 	 */
2913 	status = ddi_dma_unbind_handle(bind->bi_dmahdl);
2914 	if (status != DDI_SUCCESS) {
2915 		TAVOR_WARNING(state, "failed to unbind DMA mapping");
2916 		TNF_PROBE_0(tavor_mr_mem_unbind_dmaunbind_fail,
2917 		    TAVOR_TNF_ERROR, "");
2918 		TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2919 		return;
2920 	}
2921 
2922 	/* Free up the DMA handle */
2923 	if (bind->bi_free_dmahdl != 0) {
2924 		ddi_dma_free_handle(&bind->bi_dmahdl);
2925 	}
2926 
2927 	TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2928 }
2929 
2930 
2931 /*
2932  * tavor_mr_fast_mtt_write()
2933  *    Context: Can be called from interrupt or base context.
2934  */
2935 static int
2936 tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
2937     uint32_t mtt_pgsize_bits)
2938 {
2939 	ddi_dma_cookie_t	dmacookie;
2940 	uint_t			cookie_cnt;
2941 	uint64_t		*mtt_table;
2942 	uint64_t		mtt_entry;
2943 	uint64_t		addr, endaddr;
2944 	uint64_t		pagesize;
2945 	int			i;
2946 
2947 	TAVOR_TNF_ENTER(tavor_mr_fast_mtt_write);
2948 
2949 	/* Calculate page size from the suggested value passed in */
2950 	pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2951 
2952 	/*
2953 	 * Walk the "cookie list" and fill in the MTT table entries
2954 	 */
2955 	i = 0;
2956 	mtt_table  = (uint64_t *)mtt->tr_addr;
2957 	dmacookie  = bind->bi_dmacookie;
2958 	cookie_cnt = bind->bi_cookiecnt;
2959 	while (cookie_cnt-- > 0) {
2960 		addr	= dmacookie.dmac_laddress;
2961 		endaddr = addr + (dmacookie.dmac_size - 1);
2962 		addr	= addr & ~((uint64_t)pagesize - 1);
2963 		while (addr <= endaddr) {
2964 			/*
2965 			 * Fill in the mapped addresses (calculated above) and
2966 			 * set TAVOR_MTT_ENTRY_PRESET flag for each MTT entry.
2967 			 */
2968 			mtt_entry = addr | TAVOR_MTT_ENTRY_PRESET;
2969 			ddi_put64(mtt->tr_acchdl, &mtt_table[i], mtt_entry);
2970 			addr += pagesize;
2971 			i++;
2972 
2973 			if (addr == 0) {
2974 				static int do_once = 1;
2975 				_NOTE(SCHEME_PROTECTS_DATA("safe sharing",
2976 				    do_once))
2977 				if (do_once) {
2978 					do_once = 0;
2979 					cmn_err(CE_NOTE, "probable error in "
2980 					    "dma_cookie address from caller\n");
2981 				}
2982 				break;
2983 			}
2984 		}
2985 
2986 		/*
2987 		 * When we've reached the end of the current DMA cookie,
2988 		 * jump to the next cookie (if there are more)
2989 		 */
2990 		if (cookie_cnt != 0) {
2991 			ddi_dma_nextcookie(bind->bi_dmahdl, &dmacookie);
2992 		}
2993 	}
2994 
2995 	TAVOR_TNF_EXIT(tavor_mr_fast_mtt_write);
2996 	return (DDI_SUCCESS);
2997 }
2998 
2999 /*
3000  * tavor_mr_fast_mtt_write_fmr()
3001  *    Context: Can be called from interrupt or base context.
3002  */
3003 static int
3004 tavor_mr_fast_mtt_write_fmr(tavor_rsrc_t *mtt, ibt_pmr_attr_t *mem_pattr,
3005     uint32_t mtt_pgsize_bits)
3006 {
3007 	uint64_t		*mtt_table;
3008 	ibt_phys_addr_t		*buf;
3009 	uint64_t		mtt_entry;
3010 	uint64_t		addr, first_addr, endaddr;
3011 	uint64_t		pagesize;
3012 	int			i;
3013 
3014 	TAVOR_TNF_ENTER(tavor_mr_fast_mtt_write_fmr);
3015 
3016 	/* Calculate page size from the suggested value passed in */
3017 	pagesize = ((uint64_t)1 << mtt_pgsize_bits);
3018 
3019 	/*
3020 	 * Walk the "buf list" and fill in the MTT table entries
3021 	 */
3022 	mtt_table  = (uint64_t *)mtt->tr_addr;
3023 	for (i = 0; i < mem_pattr->pmr_num_buf; i++) {
3024 		buf = &mem_pattr->pmr_addr_list[i];
3025 
3026 		/*
3027 		 * For first cookie, use the offset field to determine where
3028 		 * the buffer starts.  The end addr is then calculated with the
3029 		 * offset in mind.
3030 		 */
3031 		if (i == 0) {
3032 			first_addr = addr = buf->p_laddr +
3033 			    mem_pattr->pmr_offset;
3034 			endaddr = addr + (mem_pattr->pmr_buf_sz - 1) -
3035 			    mem_pattr->pmr_offset;
3036 		/*
3037 		 * For last cookie, determine end addr based on starting
3038 		 * address and size of the total buffer
3039 		 */
3040 		} else if (i == mem_pattr->pmr_num_buf - 1) {
3041 			addr = buf->p_laddr;
3042 			endaddr = addr + (first_addr + mem_pattr->pmr_len &
3043 			    (mem_pattr->pmr_buf_sz - 1));
3044 		/*
3045 		 * For the middle cookies case, start and end addr are
3046 		 * straightforward.  Just use the laddr, and the size, as all
3047 		 * middle cookies are a set size.
3048 		 */
3049 		} else {
3050 			addr = buf->p_laddr;
3051 			endaddr = addr + (mem_pattr->pmr_buf_sz - 1);
3052 		}
3053 
3054 		addr	= addr & ~((uint64_t)pagesize - 1);
3055 		while (addr <= endaddr) {
3056 			/*
3057 			 * Fill in the mapped addresses (calculated above) and
3058 			 * set TAVOR_MTT_ENTRY_PRESET flag for each MTT entry.
3059 			 */
3060 			mtt_entry = addr | TAVOR_MTT_ENTRY_PRESET;
3061 			ddi_put64(mtt->tr_acchdl, &mtt_table[i], mtt_entry);
3062 			addr += pagesize;
3063 		}
3064 	}
3065 
3066 	TAVOR_TNF_EXIT(tavor_mr_fast_mtt_write_fmr);
3067 	return (DDI_SUCCESS);
3068 }
3069 
3070 
3071 /*
3072  * tavor_mtt_refcnt_inc()
3073  *    Context: Can be called from interrupt or base context.
3074  */
3075 static int
3076 tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc)
3077 {
3078 	tavor_sw_refcnt_t *rc;
3079 	uint32_t	  cnt;
3080 
3081 	rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
3082 
3083 	/* Increment the MTT's reference count */
3084 	mutex_enter(&rc->swrc_lock);
3085 	TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_inc, TAVOR_TNF_TRACE, "",
3086 	    tnf_uint, refcnt, rc->swrc_refcnt);
3087 	cnt = rc->swrc_refcnt++;
3088 	mutex_exit(&rc->swrc_lock);
3089 
3090 	return (cnt);
3091 }
3092 
3093 
3094 /*
3095  * tavor_mtt_refcnt_dec()
3096  *    Context: Can be called from interrupt or base context.
3097  */
3098 static int
3099 tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc)
3100 {
3101 	tavor_sw_refcnt_t *rc;
3102 	uint32_t	  cnt;
3103 
3104 	rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
3105 
3106 	/* Decrement the MTT's reference count */
3107 	mutex_enter(&rc->swrc_lock);
3108 	cnt = --rc->swrc_refcnt;
3109 	TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_dec, TAVOR_TNF_TRACE, "",
3110 	    tnf_uint, refcnt, rc->swrc_refcnt);
3111 	mutex_exit(&rc->swrc_lock);
3112 
3113 	return (cnt);
3114 }
3115