/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_mr.c
 *    Tavor Memory Region/Window Routines
 *
 *    Implements all the routines necessary to provide the requisite memory
 *    registration verbs.  These include operations like RegisterMemRegion(),
 *    DeregisterMemRegion(), ReregisterMemRegion(), RegisterSharedMemRegion(),
 *    etc., that affect Memory Regions.  It also includes the verbs that
 *    affect Memory Windows, including AllocMemWindow(), FreeMemWindow(),
 *    and QueryMemWindow().
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/esunddi.h>

#include <sys/ib/adapters/tavor/tavor.h>


/*
 * Used by tavor_mr_keycalc() below to fill in the "unconstrained" portion
 * of Tavor memory keys (LKeys and RKeys)
 */
static uint_t tavor_debug_memkey_cnt = 0x00000000;

static int tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
    tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op);
static int tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
    tavor_mr_options_t *op);
static int tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
    uint_t sleep, uint_t *dereg_level);
static uint64_t tavor_mr_nummtt_needed(tavor_state_t *state,
    tavor_bind_info_t *bind, uint_t *mtt_pgsize);
static int tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
    ddi_dma_handle_t dmahdl, uint_t sleep);
static void tavor_mr_mem_unbind(tavor_state_t *state,
    tavor_bind_info_t *bind);
static int tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
    uint32_t mtt_pgsize_bits);
static int tavor_mr_fast_mtt_write_fmr(tavor_rsrc_t *mtt,
    ibt_pmr_attr_t *mem_pattr, uint32_t mtt_pgsize_bits);
static int tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc);
static int tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc);

/*
 * The Tavor umem_lockmemory() callback ops.  When userland memory is
 * registered, these callback ops are specified.  The tavor_umap_umemlock_cb()
 * callback will be called whenever the memory for the corresponding
 * ddi_umem_cookie_t is being freed.
 */
static struct umem_callback_ops tavor_umem_cbops = {
	UMEM_CALLBACK_VERSION,
	tavor_umap_umemlock_cb,
};


/*
 * tavor_mr_register()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_register(tavor_state_t *state, tavor_pdhdl_t pd,
    ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	TAVOR_TNF_ENTER(tavor_mr_register);

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.
	 */
	bind.bi_type  = TAVOR_BINDHDL_VADDR;
	bind.bi_addr  = mr_attr->mr_vaddr;
	bind.bi_len   = mr_attr->mr_len;
	bind.bi_as    = mr_attr->mr_as;
	bind.bi_flags = mr_attr->mr_flags;
	status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_mr_register_cmnreg_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_register);
		return (status);
	}

	TAVOR_TNF_EXIT(tavor_mr_register);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_register_buf()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_register_buf(tavor_state_t *state, tavor_pdhdl_t pd,
    ibt_smr_attr_t *mr_attr, struct buf *buf, tavor_mrhdl_t *mrhdl,
    tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	TAVOR_TNF_ENTER(tavor_mr_register_buf);

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (see above) and a "buf" binding (as is the case
	 * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.  Note: We have chosen to provide
	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
	 * not set).  It is not critical what value we choose here as it need
	 * only be unique for the given RKey (which will happen by default),
	 * so the choice here is somewhat arbitrary.
	 */
	bind.bi_type  = TAVOR_BINDHDL_BUF;
	bind.bi_buf   = buf;
	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
		bind.bi_addr  = mr_attr->mr_vaddr;
	} else {
		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
	}
	bind.bi_as    = NULL;
	bind.bi_len   = (uint64_t)buf->b_bcount;
	bind.bi_flags = mr_attr->mr_flags;
	status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_mr_register_buf_cmnreg_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_register_buf);
		return (status);
	}

	TAVOR_TNF_EXIT(tavor_mr_register_buf);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_register_shared()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_register_shared(tavor_state_t *state, tavor_mrhdl_t mrhdl,
    tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*mpt, *mtt, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mrhdl_t		mr;
	tavor_bind_info_t	*bind;
	ddi_umem_cookie_t	umem_cookie;
	size_t			umem_len;
	caddr_t			umem_addr;
	uint64_t		mtt_addr, mtt_ddrbaseaddr, pgsize_msk;
	uint_t			sleep, mr_is_umem;
	int			status, umem_flags;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_register_shared);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (mr_attr->mr_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP :
	    TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mrshared_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry.  This will be filled in with all the
	 * necessary parameters to define the shared memory region.
	 * Specifically, it will be made to reference the currently existing
	 * MTT entries and ownership of the MPT will be passed to the hardware
	 * in the last step below.  If we fail here, we must undo the
	 * protection domain reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
		goto mrshared_fail1;
	}

	/*
	 * Allocate the software structure for tracking the shared memory
	 * region (i.e. the Tavor Memory Region handle).  If we fail here, we
	 * must undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
		goto mrshared_fail2;
	}
	mr = (tavor_mrhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/*
	 * Setup and validate the memory region access flags.  This means
	 * translating the IBTF's enable flags into the access flags that
	 * will be used in later operations.
	 */
	mr->mr_accflag = 0;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_WINDOW_BIND)
		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_READ)
		mr->mr_accflag |= IBT_MR_REMOTE_READ;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/* Grab the MR lock for the current memory region */
	mutex_enter(&mrhdl->mr_lock);

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
	 * If so, this is an error, return failure.
	 */
	if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
		mutex_exit(&mrhdl->mr_lock);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
		goto mrshared_fail3;
	}

	/*
	 * Determine if the original memory was from userland and, if so, pin
	 * the pages (again) with umem_lockmemory().  This will guarantee a
	 * separate callback for each of this shared region's MR handles.
	 * If this is userland memory, then allocate an entry in the
	 * "userland resources database".  This will later be added to
	 * the database (after all further memory registration operations are
	 * successful).  If we fail here, we must undo all the above setup.
	 */
	mr_is_umem = mrhdl->mr_is_umem;
	if (mr_is_umem) {
		umem_len   = ptob(btopr(mrhdl->mr_bindinfo.bi_len +
		    ((uintptr_t)mrhdl->mr_bindinfo.bi_addr & PAGEOFFSET)));
		umem_addr  = (caddr_t)((uintptr_t)mrhdl->mr_bindinfo.bi_addr &
		    ~PAGEOFFSET);
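		/*
		 * Example of the rounding above (hypothetical values, for
		 * illustration only): with 4KB pages, a bi_addr of 0x10234
		 * and a bi_len of 0x2000 yield umem_addr = 0x10000 and
		 * umem_len = ptob(btopr(0x2234)) = 0x3000, i.e. the lock
		 * covers every page the region touches.
		 */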
		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
		    DDI_UMEMLOCK_LONGTERM);
		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
		    &umem_cookie, &tavor_umem_cbops, NULL);
		if (status != 0) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
			goto mrshared_fail3;
		}

		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto mrshared_fail4;
		}
	}

	/*
	 * Copy the MTT resource pointer (and additional parameters) from
	 * the original Tavor Memory Region handle.  Note: this is normally
	 * where the tavor_mr_mem_bind() routine would be called, but because
	 * we already have bound and filled-in MTT entries it is simply a
	 * matter here of managing the MTT reference count and grabbing the
	 * address of the MTT table entries (for filling in the shared region's
	 * MPT entry).
	 */
	mr->mr_mttrsrcp	  = mrhdl->mr_mttrsrcp;
	mr->mr_logmttpgsz = mrhdl->mr_logmttpgsz;
	mr->mr_bindinfo	  = mrhdl->mr_bindinfo;
	mr->mr_mttrefcntp = mrhdl->mr_mttrefcntp;
	mutex_exit(&mrhdl->mr_lock);
	bind = &mr->mr_bindinfo;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
	mtt = mr->mr_mttrsrcp;

	/*
	 * Increment the MTT reference count (to reflect the fact that
	 * the MTT is now shared)
	 */
	(void) tavor_mtt_refcnt_inc(mr->mr_mttrefcntp);

	/*
	 * Update the new "bind" virtual address.  Do some extra work here
	 * to ensure proper alignment.  That is, make sure that the page
	 * offset for the beginning of the old range is the same as the
	 * offset for this new mapping.
	 */
	pgsize_msk = (((uint64_t)1 << mr->mr_logmttpgsz) - 1);
	bind->bi_addr = ((mr_attr->mr_vaddr & ~pgsize_msk) |
	    (mr->mr_bindinfo.bi_addr & pgsize_msk));
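	/*
	 * Illustrative example (hypothetical values): with a 4KB MTT page
	 * size, pgsize_msk is 0xFFF.  If the original binding began at
	 * 0x10234 and the new mr_vaddr is 0x2B000, the new bi_addr becomes
	 * (0x2B000 & ~0xFFF) | (0x10234 & 0xFFF) = 0x2B234, preserving the
	 * original page offset.
	 */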

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * in the next step when we are setting up the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.lr	  = 1;
	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
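	/*
	 * The page_sz field below is encoded relative to the minimum 4KB
	 * page (0xC == log2(4096)): a 4KB MTT page size is written as 0,
	 * 8KB as 1, and so on.
	 */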
	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
	mpt_entry.mem_key	= mr->mr_lkey;
	mpt_entry.pd		= pd->pd_pdnum;
	mpt_entry.start_addr	= bind->bi_addr;
	mpt_entry.reg_win_len	= bind->bi_len;
	mpt_entry.win_cnt_limit	= TAVOR_UNLIMITED_WIN_BIND;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
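	/*
	 * A note on the split above: MTT segments are allocated in whole
	 * cachelines (see the MTT allocation comments in tavor_mr_alloc_fmr()
	 * below), so mtt_addr is 64-byte aligned and its low six bits carry
	 * no information; the hardware takes the upper 32 bits plus the
	 * address expressed in 64-byte units.
	 */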

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mr_register_shared_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "tavor SW2HW_MPT command");
		goto mrshared_fail5;
	}

	/*
	 * Fill in the rest of the Tavor Memory Region handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MR.
	 */
	mr->mr_mptrsrcp	  = mpt;
	mr->mr_mttrsrcp	  = mtt;
	mr->mr_pdhdl	  = pd;
	mr->mr_rsrcp	  = rsrc;
	mr->mr_is_umem	  = mr_is_umem;
	mr->mr_is_fmr	  = 0;
	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
	mr->mr_umem_cbfunc = NULL;
	mr->mr_umem_cbarg1 = NULL;
	mr->mr_umem_cbarg2 = NULL;

	/*
	 * If this is userland memory, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later coordination between the tavor_umap_umemlock_cb()
	 * callback and tavor_mr_deregister().
	 */
	if (mr_is_umem) {
		tavor_umap_db_add(umapdb);
	}

	*mrhdl_new = mr;

	TAVOR_TNF_EXIT(tavor_mr_register_shared);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
mrshared_fail5:
	(void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
	if (mr_is_umem) {
		tavor_umap_db_free(umapdb);
	}
mrshared_fail4:
	if (mr_is_umem) {
		ddi_umem_unlock(umem_cookie);
	}
mrshared_fail3:
	tavor_rsrc_free(state, &rsrc);
mrshared_fail2:
	tavor_rsrc_free(state, &mpt);
mrshared_fail1:
	tavor_pd_refcnt_dec(pd);
mrshared_fail:
	TNF_PROBE_1(tavor_mr_register_shared_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mr_register_shared);
	return (status);
}

/*
 * tavor_mr_alloc_fmr()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_alloc_fmr(tavor_state_t *state, tavor_pdhdl_t pd,
    tavor_fmrhdl_t fmr_pool, tavor_mrhdl_t *mrhdl)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*mpt, *mtt, *rsrc;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mrhdl_t		mr;
	tavor_bind_info_t	bind;
	uint64_t		mtt_addr, mtt_ddrbaseaddr;
	uint64_t		nummtt;
	uint_t			sleep, mtt_pgsize_bits;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_alloc_fmr);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (fmr_pool->fmr_flags & IBT_MR_SLEEP) ? TAVOR_SLEEP :
	    TAVOR_NOSLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		TNF_PROBE_0(tavor_mr_alloc_fmr, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_alloc_fmr);
		return (IBT_INVALID_PARAM);
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry.  This will be filled in with all the
	 * necessary parameters to define the FMR.  Specifically, it will be
	 * made to reference the currently existing MTT entries and ownership
	 * of the MPT will be passed to the hardware in the last step below.
	 * If we fail here, we must undo the protection domain reference count.
	 */

	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
		goto fmralloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the fmr memory
	 * region (i.e. the Tavor Memory Region handle).  If we fail here, we
	 * must undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
		goto fmralloc_fail2;
	}
	mr = (tavor_mrhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/*
	 * Setup and validate the memory region access flags.  This means
	 * translating the IBTF's enable flags into the access flags that
	 * will be used in later operations.
	 */
	mr->mr_accflag = 0;
	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_READ)
		mr->mr_accflag |= IBT_MR_REMOTE_READ;
	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/*
	 * Determine number of pages spanned.  This routine uses the
	 * information in the "bind" struct to determine the required
	 * number of MTT entries needed (and returns the suggested page size -
	 * as a "power-of-2" - for each MTT entry).
	 */
	/* Assume address will be page aligned later */
	bind.bi_addr = 0;
	/* Calculate size based on given max pages */
	bind.bi_len = fmr_pool->fmr_max_pages << PAGESHIFT;
	nummtt = tavor_mr_nummtt_needed(state, &bind, &mtt_pgsize_bits);
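	/*
	 * For a sense of scale (hypothetical numbers): an FMR pool created
	 * with fmr_max_pages = 256 and 4KB pages gives bi_len = 1MB, which
	 * would call for 256 MTT entries at a 4KB MTT page size (before
	 * rounding up to whole MTT segments below).
	 */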

	/*
	 * Allocate the MTT entries.  Use the calculations performed above to
	 * allocate the required number of MTT entries.  Note: MTT entries are
	 * allocated in "MTT segments" which consist of complete cachelines
	 * (i.e. 8 entries, 16 entries, etc.)  So the TAVOR_NUMMTT_TO_MTTSEG()
	 * macro is used to do the proper conversion.  If we fail here, we
	 * must not only undo all the previous resource allocation (and PD
	 * reference count), but we must also unbind the memory.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MTT,
	    TAVOR_NUMMTT_TO_MTTSEG(nummtt), sleep, &mtt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
		goto fmralloc_fail3;
	}
	mr->mr_logmttpgsz = mtt_pgsize_bits;

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * in the next step when we are setting up the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
	mpt_entry.en_bind = 0;
	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.lr	  = 1;
	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
	mpt_entry.pd		= pd->pd_pdnum;
	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
	mpt_entry.win_cnt_limit	= TAVOR_UNLIMITED_WIN_BIND;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
	mpt_entry.mem_key	= mr->mr_lkey;

	/*
	 * FMR sets these to 0 for now.  Later during actual fmr registration
	 * these values are filled in.
	 */
	mpt_entry.start_addr	= 0;
	mpt_entry.reg_win_len	= 0;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mr_alloc_fmr_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "tavor SW2HW_MPT command");
		goto fmralloc_fail4;
	}

	/*
	 * Fill in the rest of the Tavor Memory Region handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MR.  Also, set
	 * that this is an FMR region.
	 */
	mr->mr_mptrsrcp	  = mpt;
	mr->mr_mttrsrcp	  = mtt;
	mr->mr_pdhdl	  = pd;
	mr->mr_rsrcp	  = rsrc;
	mr->mr_is_fmr	  = 1;
	(void) memcpy(&mr->mr_bindinfo, &bind, sizeof (tavor_bind_info_t));

	*mrhdl = mr;

	TAVOR_TNF_EXIT(tavor_mr_alloc_fmr);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
fmralloc_fail4:
	tavor_rsrc_free(state, &mtt);
fmralloc_fail3:
	tavor_rsrc_free(state, &rsrc);
fmralloc_fail2:
	tavor_rsrc_free(state, &mpt);
fmralloc_fail1:
	tavor_pd_refcnt_dec(pd);
fmralloc_fail:
	TNF_PROBE_1(tavor_mr_alloc_fmr, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mr_alloc_fmr);
	return (status);
}

/*
 * tavor_mr_register_physical_fmr()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_register_physical_fmr(tavor_state_t *state,
    ibt_pmr_attr_t *mem_pattr_p, tavor_mrhdl_t mr, ibt_pmr_desc_t *mem_desc_p)
{
	tavor_rsrc_t		*mpt;
	uint64_t		*mpt_table;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_register_physical_fmr);

	mutex_enter(&mr->mr_lock);
	mpt = mr->mr_mptrsrcp;
	mpt_table = (uint64_t *)mpt->tr_addr;

	/* Write MPT status to SW bit */
	ddi_put8(mpt->tr_acchdl, (uint8_t *)&mpt_table[0], 0xF);
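	/*
	 * A note on the status writes in this routine (an inference from
	 * the values used here and in tavor_mr_invalidate_fmr()): writing
	 * 0xF parks the MPT entry in software ownership so its fields can
	 * be updated in place, and the 0x0 write near the end returns the
	 * entry to hardware ownership without a full SW2HW_MPT command.
	 */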

	/*
	 * Write the mapped addresses into the MTT entries.  FMR needs to do
	 * this a little differently, so we call the fmr specific fast mtt
	 * write here.
	 */
	status = tavor_mr_fast_mtt_write_fmr(mr->mr_mttrsrcp, mem_pattr_p,
	    mr->mr_logmttpgsz);
	if (status != DDI_SUCCESS) {
		mutex_exit(&mr->mr_lock);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed write mtt");
		goto fmr_reg_fail1;
	}

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/* write mem key value */
	ddi_put32(mpt->tr_acchdl, (uint32_t *)&mpt_table[1], mr->mr_lkey);

	/* write length value */
	ddi_put64(mpt->tr_acchdl, &mpt_table[3], mem_pattr_p->pmr_len);

	/* write start addr value */
	ddi_put64(mpt->tr_acchdl, &mpt_table[2], mem_pattr_p->pmr_iova);

	/* write lkey value */
	ddi_put32(mpt->tr_acchdl, (uint32_t *)&mpt_table[4], mr->mr_lkey);

	/* Write MPT status to HW bit */
	ddi_put8(mpt->tr_acchdl, (uint8_t *)&mpt_table[0], 0x0);

	/* Fill in return parameters */
	mem_desc_p->pmd_lkey = mr->mr_lkey;
	mem_desc_p->pmd_rkey = mr->mr_rkey;
	mem_desc_p->pmd_iova = mem_pattr_p->pmr_iova;
	mem_desc_p->pmd_phys_buf_list_sz = mem_pattr_p->pmr_len;

	/* Fill in MR bindinfo struct for later sync or query operations */
	mr->mr_bindinfo.bi_addr = mem_pattr_p->pmr_iova;
	mr->mr_bindinfo.bi_flags = mem_pattr_p->pmr_flags & IBT_MR_NONCOHERENT;

	mutex_exit(&mr->mr_lock);

	TAVOR_TNF_EXIT(tavor_mr_register_physical_fmr);
	return (DDI_SUCCESS);

fmr_reg_fail1:
	/*
	 * Note, we fail here, and purposely leave the memory ownership in
	 * software.  The memory tables may be corrupt, so we leave the region
	 * unregistered.
	 */
	TNF_PROBE_1(tavor_mr_register_physical_fmr_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mr_register_physical_fmr);
	return (DDI_FAILURE);
}


/*
 * tavor_mr_deregister()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_deregister(tavor_state_t *state, tavor_mrhdl_t *mrhdl, uint_t level,
    uint_t sleep)
{
	tavor_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
	tavor_umap_db_entry_t	*umapdb;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_bind_info_t	*bind;
	uint64_t		value;
	int			status, shared_mtt;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_deregister);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
		TNF_PROBE_1(tavor_mr_deregister_fail, TAVOR_TNF_ERROR, "",
		    tnf_string, msg, errormsg);
		TAVOR_TNF_EXIT(tavor_mr_deregister);
		return (status);
	}

	/*
	 * Pull all the necessary information from the Tavor Memory Region
	 * handle.  This is necessary here because the resource for the
	 * MR handle is going to be freed up as part of this
	 * deregistration.
	 */
	mr	= *mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt	= mr->mr_mptrsrcp;
	mtt	= mr->mr_mttrsrcp;
	mtt_refcnt = mr->mr_mttrefcntp;
	rsrc	= mr->mr_rsrcp;
	pd	= mr->mr_pdhdl;
	bind	= &mr->mr_bindinfo;

	/*
	 * Check here if the memory region is really an FMR.  If so, this is a
	 * bad thing and we shouldn't be here.  Return failure.
	 */
	if (mr->mr_is_fmr) {
		mutex_exit(&mr->mr_lock);
		TNF_PROBE_0(tavor_mr_deregister_is_fmr, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_deregister);
		return (IBT_INVALID_PARAM);
	}

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of the tavor_umap_umemlock_cb() callback.
	 * If so, then jump to the end and free the remaining resources.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		goto mrdereg_finish_cleanup;
	}

	/*
	 * We must drop the "mr_lock" here to ensure that both SLEEP and
	 * NOSLEEP calls into the firmware work as expected.  Also, if two
	 * threads are attempting to access this MR (via de-register,
	 * re-register, or otherwise), then we allow the firmware to enforce
	 * the check that only one deregister is valid.
	 */
	mutex_exit(&mr->mr_lock);

	/*
	 * Reclaim MPT entry from hardware (if necessary).  Since the
	 * tavor_mr_deregister() routine is used in the memory region
	 * reregistration process as well, it is possible that we will
	 * not always wish to reclaim ownership of the MPT.  Check the
	 * "level" arg and, if necessary, attempt to reclaim it.  If
	 * the ownership transfer fails for any reason, we check to see
	 * what command status was returned from the hardware.  The only
	 * "expected" error status is the one that indicates an attempt to
	 * deregister a memory region that has memory windows bound to it.
	 */
	if (level >= TAVOR_MR_DEREG_ALL) {
		status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT,
		    NULL, 0, mpt->tr_indx, sleep);
		if (status != TAVOR_CMD_SUCCESS) {
			if (status == TAVOR_CMD_REG_BOUND) {
				TAVOR_TNF_EXIT(tavor_mr_deregister);
				return (IBT_MR_IN_USE);
			} else {
				cmn_err(CE_CONT, "Tavor: HW2SW_MPT command "
				    "failed: %08x\n", status);
				TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_mr_deregister);
				return (IBT_INVALID_PARAM);
			}
		}
	}

	/*
	 * Re-grab the mr_lock here.  Since further access to the protected
	 * 'mr' structure is needed, and we would have returned previously for
	 * the multiple deregistration case, we can safely grab the lock here.
	 */
	mutex_enter(&mr->mr_lock);

	/*
	 * If the memory had come from userland, then we do a lookup in the
	 * "userland resources database".  On success, we free the entry, call
	 * ddi_umem_unlock(), and continue the cleanup.  On failure (which is
	 * an indication that the umem_lockmemory() callback has called
	 * tavor_mr_deregister()), we call ddi_umem_unlock() and invalidate
	 * the "mr_umemcookie" field in the MR handle (this will be used
	 * later to detect that only partial cleanup still remains to be done
	 * on the MR handle).
	 */
	if (mr->mr_is_umem) {
		status = tavor_umap_db_find(state->ts_instance,
		    (uint64_t)(uintptr_t)mr->mr_umemcookie,
		    MLNX_UMAP_MRMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status == DDI_SUCCESS) {
			tavor_umap_db_free(umapdb);
			ddi_umem_unlock(mr->mr_umemcookie);
		} else {
			ddi_umem_unlock(mr->mr_umemcookie);
			mr->mr_umemcookie = NULL;
		}
	}

	/*
	 * Decrement the MTT reference count.  Since the MTT resource
	 * may be shared between multiple memory regions (as a result
	 * of a "RegisterSharedMR" verb) it is important that we not
	 * free up or unbind resources prematurely.  If it's not shared (as
	 * indicated by the return status), then free the resource.
	 */
	shared_mtt = tavor_mtt_refcnt_dec(mtt_refcnt);
	if (!shared_mtt) {
		tavor_rsrc_free(state, &mtt_refcnt);
	}

	/*
	 * Free up the MTT entries and unbind the memory.  Here, as above, we
	 * attempt to free these resources only if it is appropriate to do so.
	 */
	if (!shared_mtt) {
		if (level >= TAVOR_MR_DEREG_NO_HW2SW_MPT) {
			tavor_mr_mem_unbind(state, bind);
		}
		tavor_rsrc_free(state, &mtt);
	}

	/*
	 * If the MR handle has been invalidated, then drop the
	 * lock and return success.  Note: This only happens because
	 * the umem_lockmemory() callback has been triggered.  The
	 * cleanup here is partial, and further cleanup (in a
	 * subsequent tavor_mr_deregister() call) will be necessary.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		mutex_exit(&mr->mr_lock);
		TAVOR_TNF_EXIT(tavor_mr_deregister);
		return (DDI_SUCCESS);
	}

mrdereg_finish_cleanup:
	mutex_exit(&mr->mr_lock);

	/* Free the Tavor Memory Region handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the MPT entry resource */
	tavor_rsrc_free(state, &mpt);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the mrhdl pointer to NULL and return success */
	*mrhdl = NULL;

	TAVOR_TNF_EXIT(tavor_mr_deregister);
	return (DDI_SUCCESS);
}

/*
 * tavor_mr_dealloc_fmr()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_dealloc_fmr(tavor_state_t *state, tavor_mrhdl_t *mrhdl)
{
	tavor_rsrc_t		*mpt, *mtt, *rsrc;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;

	TAVOR_TNF_ENTER(tavor_mr_dealloc_fmr);

	/*
	 * Pull all the necessary information from the Tavor Memory Region
	 * handle.  This is necessary here because the resource for the
	 * MR handle is going to be freed up as part of this
	 * deregistration.
	 */
	mr	= *mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt	= mr->mr_mptrsrcp;
	mtt	= mr->mr_mttrsrcp;
	rsrc	= mr->mr_rsrcp;
	pd	= mr->mr_pdhdl;
	mutex_exit(&mr->mr_lock);

	/* Free the MTT entries */
	tavor_rsrc_free(state, &mtt);

	/* Free the Tavor Memory Region handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the MPT entry resource */
	tavor_rsrc_free(state, &mpt);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the mrhdl pointer to NULL and return success */
	*mrhdl = NULL;

	TAVOR_TNF_EXIT(tavor_mr_dealloc_fmr);
	return (DDI_SUCCESS);
}

/*
 * tavor_mr_invalidate_fmr()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_invalidate_fmr(tavor_state_t *state, tavor_mrhdl_t mr)
{
	tavor_rsrc_t		*mpt;
	uint64_t		*mpt_table;

	TAVOR_TNF_ENTER(tavor_mr_invalidate_fmr);

	mutex_enter(&mr->mr_lock);
	mpt = mr->mr_mptrsrcp;
	mpt_table = (uint64_t *)mpt->tr_addr;

	/* Write MPT status to SW bit */
	ddi_put8(mpt->tr_acchdl, (uint8_t *)&mpt_table[0], 0xF);

	/* invalidate mem key value */
	ddi_put32(mpt->tr_acchdl, (uint32_t *)&mpt_table[1], 0);

	/* invalidate lkey value */
	ddi_put32(mpt->tr_acchdl, (uint32_t *)&mpt_table[4], 0);

	/* Write MPT status to HW bit */
	ddi_put8(mpt->tr_acchdl, (uint8_t *)&mpt_table[0], 0x0);

	mutex_exit(&mr->mr_lock);

	TAVOR_TNF_EXIT(tavor_mr_invalidate_fmr);
	return (DDI_SUCCESS);
}

/*
 * tavor_mr_deregister_fmr()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_deregister_fmr(tavor_state_t *state, tavor_mrhdl_t mr)
{
	tavor_rsrc_t		*mpt;
	uint64_t		*mpt_table;

	TAVOR_TNF_ENTER(tavor_mr_deregister_fmr);

	mutex_enter(&mr->mr_lock);
	mpt = mr->mr_mptrsrcp;
	mpt_table = (uint64_t *)mpt->tr_addr;

	/* Write MPT status to SW bit */
	ddi_put8(mpt->tr_acchdl, (uint8_t *)&mpt_table[0], 0xF);
	mutex_exit(&mr->mr_lock);

	TAVOR_TNF_EXIT(tavor_mr_deregister_fmr);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_query()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_query(tavor_state_t *state, tavor_mrhdl_t mr,
    ibt_mr_query_attr_t *attr)
{
	TAVOR_TNF_ENTER(tavor_mr_query);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr))

	mutex_enter(&mr->mr_lock);

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
	 * If so, this is an error, return failure.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		mutex_exit(&mr->mr_lock);
		TNF_PROBE_0(tavor_mr_query_inv_mrhdl_fail, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_query);
		return (IBT_MR_HDL_INVALID);
	}

	/* Fill in the queried attributes */
	attr->mr_attr_flags = mr->mr_accflag;
	attr->mr_pd	= (ibt_pd_hdl_t)mr->mr_pdhdl;

	/* Fill in the "local" attributes */
	attr->mr_lkey = (ibt_lkey_t)mr->mr_lkey;
	attr->mr_lbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
	attr->mr_lbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;

	/*
	 * Fill in the "remote" attributes (if necessary).  Note: the
	 * remote attributes are only valid if the memory region has one
	 * or more of the remote access flags set.
	 */
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		attr->mr_rkey = (ibt_rkey_t)mr->mr_rkey;
		attr->mr_rbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
		attr->mr_rbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
	}

	/*
	 * If the region is mapped for streaming (i.e. noncoherent), then
	 * indicate that a sync is required.
	 */
	attr->mr_sync_required = (mr->mr_bindinfo.bi_flags &
	    IBT_MR_NONCOHERENT) ? B_TRUE : B_FALSE;

	mutex_exit(&mr->mr_lock);
	TAVOR_TNF_EXIT(tavor_mr_query);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_reregister()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_reregister(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new,
    tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	TAVOR_TNF_ENTER(tavor_mr_reregister);

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration (and reregistration) routines.
	 */
	bind.bi_type  = TAVOR_BINDHDL_VADDR;
	bind.bi_addr  = mr_attr->mr_vaddr;
	bind.bi_len   = mr_attr->mr_len;
	bind.bi_as    = mr_attr->mr_as;
	bind.bi_flags = mr_attr->mr_flags;
	status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_mr_reregister_cmnreg_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_reregister);
		return (status);
	}

	TAVOR_TNF_EXIT(tavor_mr_reregister);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_reregister_buf()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_reregister_buf(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, struct buf *buf,
    tavor_mrhdl_t *mrhdl_new, tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	TAVOR_TNF_ENTER(tavor_mr_reregister_buf);

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (see above) and a "buf" binding (as is the case
	 * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.  Note: We have chosen to provide
	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
	 * not set).  It is not critical what value we choose here as it need
	 * only be unique for the given RKey (which will happen by default),
	 * so the choice here is somewhat arbitrary.
	 */
	bind.bi_type  = TAVOR_BINDHDL_BUF;
	bind.bi_buf   = buf;
	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
		bind.bi_addr  = mr_attr->mr_vaddr;
	} else {
		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
	}
	bind.bi_len   = (uint64_t)buf->b_bcount;
	bind.bi_flags = mr_attr->mr_flags;
	bind.bi_as = NULL;
	status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_mr_reregister_buf_cmnreg_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
		return (status);
	}

	TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_sync()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_sync(tavor_state_t *state, ibt_mr_sync_t *mr_segs, size_t num_segs)
{
	tavor_mrhdl_t		mrhdl;
	uint64_t		seg_vaddr, seg_len, seg_end;
	uint64_t		mr_start, mr_end;
	uint_t			type;
	int			status, i;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_sync);

	/* Process each of the ibt_mr_sync_t's */
	for (i = 0; i < num_segs; i++) {
		mrhdl = (tavor_mrhdl_t)mr_segs[i].ms_handle;

		/* Check for valid memory region handle */
		if (mrhdl == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
			goto mrsync_fail;
		}

		mutex_enter(&mrhdl->mr_lock);

		/*
		 * Check here to see if the memory region has already been
		 * partially deregistered as a result of a
		 * tavor_umap_umemlock_cb() callback.  If so, this is an
		 * error, return failure.
		 */
		if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl2");
			goto mrsync_fail;
		}

		/* Check for valid bounds on sync request */
		seg_vaddr = mr_segs[i].ms_vaddr;
		seg_len	  = mr_segs[i].ms_len;
		seg_end	  = seg_vaddr + seg_len - 1;
		mr_start  = mrhdl->mr_bindinfo.bi_addr;
		mr_end	  = mr_start + mrhdl->mr_bindinfo.bi_len - 1;
		if ((seg_vaddr < mr_start) || (seg_vaddr > mr_end)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_VA_INVALID, "invalid vaddr");
			goto mrsync_fail;
		}
		if ((seg_end < mr_start) || (seg_end > mr_end)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
			goto mrsync_fail;
		}

		/* Determine what type (i.e. direction) for sync */
		if (mr_segs[i].ms_flags & IBT_SYNC_READ) {
			type = DDI_DMA_SYNC_FORDEV;
		} else if (mr_segs[i].ms_flags & IBT_SYNC_WRITE) {
			type = DDI_DMA_SYNC_FORCPU;
		} else {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sync type");
			goto mrsync_fail;
		}
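		/*
		 * Rough intuition for the mapping above: IBT_SYNC_READ
		 * (the HCA is about to read the memory) maps to
		 * DDI_DMA_SYNC_FORDEV so the device sees any CPU-side
		 * writes, while IBT_SYNC_WRITE (the CPU will read what the
		 * HCA wrote) maps to DDI_DMA_SYNC_FORCPU so stale CPU-side
		 * copies are discarded.
		 */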

		(void) ddi_dma_sync(mrhdl->mr_bindinfo.bi_dmahdl,
		    (off_t)(seg_vaddr - mr_start), (size_t)seg_len, type);
		mutex_exit(&mrhdl->mr_lock);
	}

	TAVOR_TNF_EXIT(tavor_mr_sync);
	return (DDI_SUCCESS);

mrsync_fail:
	TNF_PROBE_1(tavor_mr_sync_fail, TAVOR_TNF_ERROR, "", tnf_string, msg,
	    errormsg);
	TAVOR_TNF_EXIT(tavor_mr_sync);
	return (status);
}


/*
 * tavor_mw_alloc()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mw_alloc(tavor_state_t *state, tavor_pdhdl_t pd, ibt_mw_flags_t flags,
    tavor_mwhdl_t *mwhdl)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mwhdl_t		mw;
	uint_t			sleep;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mw_alloc);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (flags & IBT_MW_NOSLEEP) ? TAVOR_NOSLEEP : TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mwalloc_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry (for use as a memory window).  Since the
	 * Tavor hardware uses the MPT entry for memory regions and for
	 * memory windows, we will fill in this MPT with all the necessary
	 * parameters for the memory window.  And then (just as we do for
	 * memory regions) ownership will be passed to the hardware in the
	 * final step below.  If we fail here, we must undo the protection
	 * domain reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
		goto mwalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the memory window (i.e.
	 * the Tavor Memory Window handle).  Note: This is actually the same
	 * software structure used for tracking memory regions, but since many
	 * of the same properties are needed, only a single structure is
	 * necessary.  If we fail here, we must undo the protection domain
	 * reference count and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
		goto mwalloc_fail2;
	}
	mw = (tavor_mwhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))

	/*
	 * Calculate an "unbound" RKey from MPT index.  In much the same way
	 * as we do for memory regions (above), this key is constructed from
	 * a "constrained" (which depends on the MPT index) and an
	 * "unconstrained" portion (which may be arbitrarily chosen).
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mw->mr_rkey);

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.  Note: fewer entries in the MPT
	 * entry are necessary to allocate a memory window.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win	= TAVOR_MPT_IS_WINDOW;
	mpt_entry.mem_key	= mw->mr_rkey;
	mpt_entry.pd		= pd->pd_pdnum;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mw_alloc_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "tavor SW2HW_MPT command");
		goto mwalloc_fail3;
	}

	/*
	 * Fill in the rest of the Tavor Memory Window handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MW.
	 */
	mw->mr_mptrsrcp	= mpt;
	mw->mr_pdhdl	= pd;
	mw->mr_rsrcp	= rsrc;
	*mwhdl = mw;

	TAVOR_TNF_EXIT(tavor_mw_alloc);
	return (DDI_SUCCESS);

mwalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
mwalloc_fail2:
	tavor_rsrc_free(state, &mpt);
mwalloc_fail1:
	tavor_pd_refcnt_dec(pd);
mwalloc_fail:
	TNF_PROBE_1(tavor_mw_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mw_alloc);
	return (status);
}


/*
 * tavor_mw_free()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mw_free(tavor_state_t *state, tavor_mwhdl_t *mwhdl, uint_t sleep)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_mwhdl_t		mw;
	int			status;
	char			*errormsg;
	tavor_pdhdl_t		pd;

	TAVOR_TNF_ENTER(tavor_mw_free);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
		TNF_PROBE_1(tavor_mw_free_fail, TAVOR_TNF_ERROR, "",
		    tnf_string, msg, errormsg);
		TAVOR_TNF_EXIT(tavor_mw_free);
		return (status);
	}

	/*
	 * Pull all the necessary information from the Tavor Memory Window
	 * handle.  This is necessary here because the resource for the
	 * MW handle is going to be freed up as part of this operation.
1499 	 */
1500 	mw	= *mwhdl;
1501 	mutex_enter(&mw->mr_lock);
1502 	mpt	= mw->mr_mptrsrcp;
1503 	rsrc	= mw->mr_rsrcp;
1504 	pd	= mw->mr_pdhdl;
1505 	mutex_exit(&mw->mr_lock);
1506 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))
1507 
1508 	/*
1509 	 * Reclaim the MPT entry from hardware.  Note: in general, it is
1510 	 * unexpected for this operation to return an error.
1511 	 */
1512 	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, NULL,
1513 	    0, mpt->tr_indx, sleep);
1514 	if (status != TAVOR_CMD_SUCCESS) {
1515 		cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: %08x\n",
1516 		    status);
1517 		TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail, TAVOR_TNF_ERROR, "",
1518 		    tnf_uint, status, status);
1519 		TAVOR_TNF_EXIT(tavor_mw_free);
1520 		return (IBT_INVALID_PARAM);
1521 	}
1522 
1523 	/* Free the Tavor Memory Window handle */
1524 	tavor_rsrc_free(state, &rsrc);
1525 
1526 	/* Free up the MPT entry resource */
1527 	tavor_rsrc_free(state, &mpt);
1528 
1529 	/* Decrement the reference count on the protection domain (PD) */
1530 	tavor_pd_refcnt_dec(pd);
1531 
1532 	/* Set the mwhdl pointer to NULL and return success */
1533 	*mwhdl = NULL;
1534 
1535 	TAVOR_TNF_EXIT(tavor_mw_free);
1536 	return (DDI_SUCCESS);
1537 }
1538 
1539 
1540 /*
1541  * tavor_mr_keycalc()
1542  *    Context: Can be called from interrupt or base context.
1543  */
1544 void
1545 tavor_mr_keycalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
1546 {
1547 	uint32_t	tmp, log_num_mpt;
1548 
1549 	/*
1550 	 * Generate a simple key from counter.  Note:  We increment this
1551 	 * static variable _intentionally_ without any kind of mutex around
1552 	 * it.  First, single-threading all operations through a single lock
1553 	 * would be a bad idea (from a performance point-of-view).  Second,
1554 	 * the upper "unconstrained" bits don't really have to be unique
1555 	 * because the lower bits are guaranteed to be (although we do make a
1556 	 * best effort to ensure that they are).  Third, the window for the
1557 	 * race (where both threads read and update the counter at the same
1558 	 * time) is incredibly small.
1559 	 * And, lastly, we'd like to make this into a "random" key XXX
1560 	 */
1561 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_memkey_cnt))
1562 	log_num_mpt = state->ts_cfg_profile->cp_log_num_mpt;
1563 	tmp = (tavor_debug_memkey_cnt++) << log_num_mpt;
1564 	*key = tmp | indx;
1565 }
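/*
 * Illustrative sketch (hypothetical values, not from the original
 * source): with cp_log_num_mpt == 17, an MPT index of 0x1A2B and a
 * counter value of 0x3 would combine as
 *
 *	*key = (0x3 << 17) | 0x1A2B;	yielding 0x00061A2B
 *
 * The low 17 "constrained" bits select the MPT entry; the remaining
 * upper bits serve only to vary successive keys handed out for the
 * same MPT slot.
 */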
1566 
1567 
1568 /*
1569  * tavor_mr_common_reg()
1570  *    Context: Can be called from interrupt or base context.
1571  */
1572 static int
1573 tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
1574     tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
1575 {
1576 	tavor_rsrc_pool_info_t	*rsrc_pool;
1577 	tavor_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
1578 	tavor_umap_db_entry_t	*umapdb;
1579 	tavor_sw_refcnt_t	*swrc_tmp;
1580 	tavor_hw_mpt_t		mpt_entry;
1581 	tavor_mrhdl_t		mr;
1582 	ibt_mr_flags_t		flags;
1583 	tavor_bind_info_t	*bh;
1584 	ddi_dma_handle_t	bind_dmahdl;
1585 	ddi_umem_cookie_t	umem_cookie;
1586 	size_t			umem_len;
1587 	caddr_t			umem_addr;
1588 	uint64_t		mtt_addr, mtt_ddrbaseaddr, max_sz;
1589 	uint_t			sleep, mtt_pgsize_bits, bind_type, mr_is_umem;
1590 	int			status, umem_flags, bind_override_addr;
1591 	char			*errormsg;
1592 
1593 	TAVOR_TNF_ENTER(tavor_mr_common_reg);
1594 
1595 	/*
1596 	 * Check the "options" flag.  Currently this flag tells the driver
1597 	 * whether or not the region should be bound normally (i.e. with
1598 	 * entries written into the PCI IOMMU), whether it should be
1599 	 * registered to bypass the IOMMU, and whether or not the resulting
1600 	 * address should be "zero-based" (to aid the alignment restrictions
1601 	 * for QPs).
1602 	 */
1603 	if (op == NULL) {
1604 		bind_type   = TAVOR_BINDMEM_NORMAL;
1605 		bind_dmahdl = NULL;
1606 		bind_override_addr = 0;
1607 	} else {
1608 		bind_type	   = op->mro_bind_type;
1609 		bind_dmahdl	   = op->mro_bind_dmahdl;
1610 		bind_override_addr = op->mro_bind_override_addr;
1611 	}
1612 
1613 	/* Extract the flags field from the tavor_bind_info_t */
1614 	flags = bind->bi_flags;
1615 
1616 	/*
1617 	 * Check for invalid length.  Check if the length is zero or if the
1618 	 * length is larger than the maximum configured value.  Return error
1619 	 * if it is.
1620 	 */
1621 	max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
1622 	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
1623 		/* Set "status" and "errormsg" and goto failure */
1624 		TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
1625 		goto mrcommon_fail;
1626 	}
1627 
1628 	/*
1629 	 * Check the sleep flag.  Ensure that it is consistent with the
1630 	 * current thread context (i.e. if we are currently in the interrupt
1631 	 * context, then we shouldn't be attempting to sleep).
1632 	 */
1633 	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1634 	if ((sleep == TAVOR_SLEEP) &&
1635 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1636 		/* Set "status" and "errormsg" and goto failure */
1637 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1638 		goto mrcommon_fail;
1639 	}
1640 
1641 	/*
1642 	 * Get the base address for the MTT table.  This will be necessary
1643 	 * below when we are setting up the MPT entry.
1644 	 */
1645 	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
1646 	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
1647 
1648 	/* Increment the reference count on the protection domain (PD) */
1649 	tavor_pd_refcnt_inc(pd);
1650 
1651 	/*
1652 	 * Allocate an MPT entry.  This will be filled in with all the
1653 	 * necessary parameters to define the memory region.  And then
1654 	 * ownership will be passed to the hardware in the final step
1655 	 * below.  If we fail here, we must undo the protection domain
1656 	 * reference count.
1657 	 */
1658 	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
1659 	if (status != DDI_SUCCESS) {
1660 		/* Set "status" and "errormsg" and goto failure */
1661 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
1662 		goto mrcommon_fail1;
1663 	}
1664 
1665 	/*
1666 	 * Allocate the software structure for tracking the memory region (i.e.
1667 	 * the Tavor Memory Region handle).  If we fail here, we must undo
1668 	 * the protection domain reference count and the previous resource
1669 	 * allocation.
1670 	 */
1671 	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
1672 	if (status != DDI_SUCCESS) {
1673 		/* Set "status" and "errormsg" and goto failure */
1674 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
1675 		goto mrcommon_fail2;
1676 	}
1677 	mr = (tavor_mrhdl_t)rsrc->tr_addr;
1678 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
1679 
1680 	/*
1681 	 * Setup and validate the memory region access flags.  This means
1682 	 * translating the IBTF's enable flags into the access flags that
1683 	 * will be used in later operations.
1684 	 */
1685 	mr->mr_accflag = 0;
1686 	if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1687 		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
1688 	if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1689 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
1690 	if (flags & IBT_MR_ENABLE_REMOTE_READ)
1691 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
1692 	if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1693 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
1694 	if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1695 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
1696 
1697 	/*
1698 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
1699 	 * from a certain number of "constrained" bits (the least significant
1700 	 * bits) and some number of "unconstrained" bits.  The constrained
1701 	 * bits must be set to the index of the entry in the MPT table, but
1702 	 * the unconstrained bits can be set to any value we wish.  Note:
1703 	 * if no remote access is required, then the RKey value is not filled
1704 	 * in.  Otherwise both Rkey and LKey are given the same value.
1705 	 */
1706 	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
1707 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1708 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1709 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1710 		mr->mr_rkey = mr->mr_lkey;
1711 	}
1712 
1713 	/*
1714 	 * Determine if the memory is from userland and pin the pages
1715 	 * with umem_lockmemory() if necessary.
1716 	 * Then, if this is userland memory, allocate an entry in the
1717 	 * "userland resources database".  This will later be added to
1718 	 * the database (after all further memory registration operations are
1719 	 * successful).  If we fail here, we must undo the reference counts
1720 	 * and the previous resource allocations.
1721 	 */
1722 	mr_is_umem = (((bind->bi_as != NULL) && (bind->bi_as != &kas)) ? 1 : 0);
1723 	if (mr_is_umem) {
1724 		umem_len   = ptob(btopr(bind->bi_len +
1725 		    ((uintptr_t)bind->bi_addr & PAGEOFFSET)));
1726 		umem_addr  = (caddr_t)((uintptr_t)bind->bi_addr & ~PAGEOFFSET);
1727 		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
1728 		    DDI_UMEMLOCK_LONGTERM);
1729 		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
1730 		    &umem_cookie, &tavor_umem_cbops, NULL);
1731 		if (status != 0) {
1732 			/* Set "status" and "errormsg" and goto failure */
1733 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
1734 			goto mrcommon_fail3;
1735 		}
1736 
1737 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1738 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1739 
1740 		bind->bi_buf = ddi_umem_iosetup(umem_cookie, 0, umem_len,
1741 		    B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
1742 		if (bind->bi_buf == NULL) {
1743 			/* Set "status" and "errormsg" and goto failure */
1744 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed iosetup");
1745 			goto mrcommon_fail3;
1746 		}
1747 		bind->bi_type = TAVOR_BINDHDL_UBUF;
1748 		bind->bi_buf->b_flags |= B_READ;
1749 
1750 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1751 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1752 
1753 		umapdb = tavor_umap_db_alloc(state->ts_instance,
1754 		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
1755 		    (uint64_t)(uintptr_t)rsrc);
1756 		if (umapdb == NULL) {
1757 			/* Set "status" and "errormsg" and goto failure */
1758 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
1759 			goto mrcommon_fail4;
1760 		}
1761 	}
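	/*
	 * Worked example (hypothetical values, assuming 4KB pages): a
	 * request with bi_addr == 0x12345678 and bi_len == 0x200 is
	 * widened above to the enclosing page range before pinning:
	 *
	 *	umem_addr = 0x12345000	(bi_addr & ~PAGEOFFSET)
	 *	umem_len  = 0x1000	(ptob(btopr(0x200 + 0x678)))
	 *
	 * so the whole page is locked even though only part of it is
	 * being registered.
	 */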
1762 
1763 	/*
1764 	 * Setup the bindinfo for the mtt bind call
1765 	 */
1766 	bh = &mr->mr_bindinfo;
1767 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bh))
1768 	bcopy(bind, bh, sizeof (tavor_bind_info_t));
1769 	bh->bi_bypass = bind_type;
1770 	status = tavor_mr_mtt_bind(state, bh, bind_dmahdl, &mtt,
1771 	    &mtt_pgsize_bits);
1772 	if (status != DDI_SUCCESS) {
1773 		/* Set "status" and "errormsg" and goto failure */
1774 		TAVOR_TNF_FAIL(status, "failed mtt bind");
1775 		/*
1776 		 * When mtt_bind fails, freerbuf has already been done,
1777 		 * so make sure not to call it again.
1778 		 */
1779 		bind->bi_type = bh->bi_type;
1780 		goto mrcommon_fail5;
1781 	}
1782 	mr->mr_logmttpgsz = mtt_pgsize_bits;
1783 
1784 	/*
1785 	 * Allocate MTT reference count (to track shared memory regions).
1786 	 * This reference count resource may never be used on the given
1787 	 * memory region, but if it is ever later registered as "shared"
1788 	 * memory region then this resource will be necessary.  If we fail
1789 	 * here, we do pretty much the same as above to clean up.
1790 	 */
1791 	status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1, sleep,
1792 	    &mtt_refcnt);
1793 	if (status != DDI_SUCCESS) {
1794 		/* Set "status" and "errormsg" and goto failure */
1795 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed reference count");
1796 		goto mrcommon_fail6;
1797 	}
1798 	mr->mr_mttrefcntp = mtt_refcnt;
1799 	swrc_tmp = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
1800 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_tmp))
1801 	TAVOR_MTT_REFCNT_INIT(swrc_tmp);
1802 
1803 	/*
1804 	 * Fill in the MPT entry.  This is the final step before passing
1805 	 * ownership of the MPT entry to the Tavor hardware.  We use all of
1806 	 * the information collected/calculated above to fill in the
1807 	 * requisite portions of the MPT.
1808 	 */
1809 	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
1810 	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
1811 	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1812 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1813 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1814 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
1815 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1816 	mpt_entry.lr	  = 1;
1817 	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
1818 	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
1819 	mpt_entry.mem_key	= mr->mr_lkey;
1820 	mpt_entry.pd		= pd->pd_pdnum;
1821 	if (bind_override_addr == 0) {
1822 		mpt_entry.start_addr = bh->bi_addr;
1823 	} else {
1824 		bh->bi_addr = bh->bi_addr & ((1 << mr->mr_logmttpgsz) - 1);
1825 		mpt_entry.start_addr = bh->bi_addr;
1826 	}
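	/*
	 * Example (hypothetical values, assuming mr_logmttpgsz == 12,
	 * i.e. 4KB MTT pages): the "zero-based" override above keeps
	 * only the offset within the first page, e.g.
	 *
	 *	0x12345678 & 0xFFF == 0x678
	 *
	 * so the region's start address becomes page-relative, which
	 * eases the alignment restrictions mentioned earlier.
	 */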
1827 	mpt_entry.reg_win_len	= bh->bi_len;
1828 	mpt_entry.win_cnt_limit	= TAVOR_UNLIMITED_WIN_BIND;
1829 	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
1830 	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
1831 	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
1832 
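	/*
	 * Layout sketch (hypothetical address): the MTT segment address
	 * is split across two MPT fields, with the low word shifted by
	 * six bits because MTT segments are 64-byte aligned.  For
	 * example, an mtt_addr of 0x0000000180003FC0 would be stored as
	 *
	 *	mttseg_addr_h = mtt_addr >> 32 = 0x00000001
	 *	mttseg_addr_l = mtt_addr >> 6  = 0x060000FF
	 *
	 * (before any field-width truncation).
	 */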
1833 	/*
1834 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
1835 	 * the entry to the hardware.  Note: in general, this operation
1836 	 * shouldn't fail.  But if it does, we have to undo everything we've
1837 	 * done above before returning error.
1838 	 */
1839 	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1840 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
1841 	if (status != TAVOR_CMD_SUCCESS) {
1842 		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
1843 		    status);
1844 		TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
1845 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1846 		/* Set "status" and "errormsg" and goto failure */
1847 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
1848 		    "tavor SW2HW_MPT command");
1849 		goto mrcommon_fail7;
1850 	}
1851 
1852 	/*
1853 	 * Fill in the rest of the Tavor Memory Region handle.  Having
1854 	 * successfully transferred ownership of the MPT, we can update the
1855 	 * following fields for use in further operations on the MR.
1856 	 */
1857 	mr->mr_mptrsrcp	  = mpt;
1858 	mr->mr_mttrsrcp	  = mtt;
1859 	mr->mr_pdhdl	  = pd;
1860 	mr->mr_rsrcp	  = rsrc;
1861 	mr->mr_is_umem	  = mr_is_umem;
1862 	mr->mr_is_fmr	  = 0;
1863 	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
1864 	mr->mr_umem_cbfunc = NULL;
1865 	mr->mr_umem_cbarg1 = NULL;
1866 	mr->mr_umem_cbarg2 = NULL;
1867 
1868 	/*
1869 	 * If this is userland memory, then we need to insert the previously
1870 	 * allocated entry into the "userland resources database".  This will
1871 	 * allow for later coordination between the tavor_umap_umemlock_cb()
1872 	 * callback and tavor_mr_deregister().
1873 	 */
1874 	if (mr_is_umem) {
1875 		tavor_umap_db_add(umapdb);
1876 	}
1877 
1878 	*mrhdl = mr;
1879 
1880 	TAVOR_TNF_EXIT(tavor_mr_common_reg);
1881 	return (DDI_SUCCESS);
1882 
1883 /*
1884  * The following is cleanup for all possible failure cases in this routine
1885  */
1886 mrcommon_fail7:
1887 	tavor_rsrc_free(state, &mtt_refcnt);
1888 mrcommon_fail6:
1889 	tavor_rsrc_free(state, &mtt);
1890 	tavor_mr_mem_unbind(state, bh);
1891 	bind->bi_type = bh->bi_type;
1892 mrcommon_fail5:
1893 	if (mr_is_umem) {
1894 		tavor_umap_db_free(umapdb);
1895 	}
1896 mrcommon_fail4:
1897 	if (mr_is_umem) {
1898 		/*
1899 		 * Free up the memory ddi_umem_iosetup() allocates
1900 		 * internally.
1901 		 */
1902 		if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
1903 			freerbuf(bind->bi_buf);
1904 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1905 			bind->bi_type = TAVOR_BINDHDL_NONE;
1906 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1907 		}
1908 		ddi_umem_unlock(umem_cookie);
1909 	}
1910 mrcommon_fail3:
1911 	tavor_rsrc_free(state, &rsrc);
1912 mrcommon_fail2:
1913 	tavor_rsrc_free(state, &mpt);
1914 mrcommon_fail1:
1915 	tavor_pd_refcnt_dec(pd);
1916 mrcommon_fail:
1917 	TNF_PROBE_1(tavor_mr_common_reg_fail, TAVOR_TNF_ERROR, "",
1918 	    tnf_string, msg, errormsg);
1919 	TAVOR_TNF_EXIT(tavor_mr_common_reg);
1920 	return (status);
1921 }
1922 
1923 /*
1924  * tavor_mr_mtt_bind()
1925  *    Context: Can be called from interrupt or base context.
1926  */
1927 int
1928 tavor_mr_mtt_bind(tavor_state_t *state, tavor_bind_info_t *bind,
1929     ddi_dma_handle_t bind_dmahdl, tavor_rsrc_t **mtt, uint_t *mtt_pgsize_bits)
1930 {
1931 	uint64_t		nummtt;
1932 	uint_t			sleep;
1933 	int			status;
1934 	char			*errormsg;
1935 
1936 	TAVOR_TNF_ENTER(tavor_mr_mtt_bind);
1937 
1938 	/*
1939 	 * Check the sleep flag.  Ensure that it is consistent with the
1940 	 * current thread context (i.e. if we are currently in the interrupt
1941 	 * context, then we shouldn't be attempting to sleep).
1942 	 */
1943 	sleep = (bind->bi_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1944 	if ((sleep == TAVOR_SLEEP) &&
1945 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1946 		/* Set "status" and "errormsg" and goto failure */
1947 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1948 		goto mrmttbind_fail;
1949 	}
1950 
1951 	/*
1952 	 * Bind the memory and determine the mapped addresses.  This is
1953 	 * the first of two routines that do all the "heavy lifting" for
1954 	 * the Tavor memory registration routines.  The tavor_mr_mem_bind()
1955 	 * routine takes the "bind" struct with all its fields filled
1956 	 * in and returns a list of DMA cookies (for the PCI mapped addresses
1957 	 * corresponding to the specified address region) which are used by
1958 	 * the tavor_mr_fast_mtt_write() routine below.  If we fail here, we
1959 	 * must undo all the previous resource allocation (and PD reference
1960 	 * count).
1961 	 */
1962 	status = tavor_mr_mem_bind(state, bind, bind_dmahdl, sleep);
1963 	if (status != DDI_SUCCESS) {
1964 		/* Set "status" and "errormsg" and goto failure */
1965 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
1966 		goto mrmttbind_fail;
1967 	}
1968 
1969 	/*
1970 	 * Determine number of pages spanned.  This routine uses the
1971 	 * information in the "bind" struct to determine the required
1972 	 * number of MTT entries needed (and returns the suggested page size -
1973 	 * as a "power-of-2" - for each MTT entry).
1974 	 */
1975 	nummtt = tavor_mr_nummtt_needed(state, bind, mtt_pgsize_bits);
1976 
1977 	/*
1978 	 * Allocate the MTT entries.  Use the calculations performed above to
1979 	 * allocate the required number of MTT entries.  Note: MTT entries are
1980 	 * allocated in "MTT segments" which consist of complete cachelines
1981 	 * (i.e. 8 entries, 16 entries, etc.)  So the TAVOR_NUMMTT_TO_MTTSEG()
1982 	 * macro is used to do the proper conversion.  If we fail here, we
1983 	 * must not only undo all the previous resource allocation (and PD
1984 	 * reference count), but we must also unbind the memory.
1985 	 */
1986 	status = tavor_rsrc_alloc(state, TAVOR_MTT,
1987 	    TAVOR_NUMMTT_TO_MTTSEG(nummtt), sleep, mtt);
1988 	if (status != DDI_SUCCESS) {
1989 		/* Set "status" and "errormsg" and goto failure */
1990 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
1991 		goto mrmttbind_fail2;
1992 	}
1993 
1994 	/*
1995 	 * Write the mapped addresses into the MTT entries.  This is part two
1996 	 * of the "heavy lifting" routines that we talked about above.  Note:
1997 	 * we pass the suggested page size from the earlier operation here.
1998 	 * And if we fail here, we again do pretty much the same huge clean up.
1999 	 */
2000 	status = tavor_mr_fast_mtt_write(*mtt, bind, *mtt_pgsize_bits);
2001 	if (status != DDI_SUCCESS) {
2002 		/* Set "status" and "errormsg" and goto failure */
2003 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed write mtt");
2004 		goto mrmttbind_fail3;
2005 	}
2006 	TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
2007 	return (DDI_SUCCESS);
2008 
2009 /*
2010  * The following is cleanup for all possible failure cases in this routine
2011  */
2012 mrmttbind_fail3:
2013 	tavor_rsrc_free(state, mtt);
2014 mrmttbind_fail2:
2015 	tavor_mr_mem_unbind(state, bind);
2016 mrmttbind_fail:
2017 	TNF_PROBE_1(tavor_mr_mtt_bind_fail, TAVOR_TNF_ERROR, "",
2018 	    tnf_string, msg, errormsg);
2019 	TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
2020 	return (status);
2021 }
2022 
2023 
2024 /*
2025  * tavor_mr_mtt_unbind()
2026  *    Context: Can be called from interrupt or base context.
2027  */
2028 int
2029 tavor_mr_mtt_unbind(tavor_state_t *state, tavor_bind_info_t *bind,
2030     tavor_rsrc_t *mtt)
2031 {
2032 	TAVOR_TNF_ENTER(tavor_mr_mtt_unbind);
2033 
2034 	/*
2035 	 * Free up the MTT entries and unbind the memory.  Here, as above, we
2036 	 * attempt to free these resources only if it is appropriate to do so.
2037 	 */
2038 	tavor_mr_mem_unbind(state, bind);
2039 	tavor_rsrc_free(state, &mtt);
2040 
2041 	TAVOR_TNF_EXIT(tavor_mr_mtt_unbind);
2042 	return (DDI_SUCCESS);
2043 }
2044 
2045 
2046 /*
2047  * tavor_mr_common_rereg()
2048  *    Context: Can be called from interrupt or base context.
2049  */
2050 static int
2051 tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
2052     tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
2053     tavor_mr_options_t *op)
2054 {
2055 	tavor_rsrc_t		*mpt;
2056 	ibt_mr_attr_flags_t	acc_flags_to_use;
2057 	ibt_mr_flags_t		flags;
2058 	tavor_pdhdl_t		pd_to_use;
2059 	tavor_hw_mpt_t		mpt_entry;
2060 	uint64_t		mtt_addr_to_use, vaddr_to_use, len_to_use;
2061 	uint_t			sleep, dereg_level;
2062 	int			status;
2063 	char			*errormsg;
2064 
2065 	TAVOR_TNF_ENTER(tavor_mr_common_rereg);
2066 
2067 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2068 
2069 	/*
2070 	 * Check here to see if the memory region corresponds to a userland
2071 	 * mapping.  Reregistration of userland memory regions is not
2072 	 * currently supported.  Return failure. XXX
2073 	 */
2074 	if (mr->mr_is_umem) {
2075 		/* Set "status" and "errormsg" and goto failure */
2076 		TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
2077 		goto mrrereg_fail;
2078 	}
2079 
2080 	mutex_enter(&mr->mr_lock);
2081 
2082 	/* Pull MPT resource pointer from the Tavor Memory Region handle */
2083 	mpt = mr->mr_mptrsrcp;
2084 
2085 	/* Extract the flags field from the tavor_bind_info_t */
2086 	flags = bind->bi_flags;
2087 
2088 	/*
2089 	 * Check the sleep flag.  Ensure that it is consistent with the
2090 	 * current thread context (i.e. if we are currently in the interrupt
2091 	 * context, then we shouldn't be attempting to sleep).
2092 	 */
2093 	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
2094 	if ((sleep == TAVOR_SLEEP) &&
2095 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
2096 		mutex_exit(&mr->mr_lock);
2097 		/* Set "status" and "errormsg" and goto failure */
2098 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
2099 		goto mrrereg_fail;
2100 	}
2101 
2102 	/*
2103 	 * First step is to temporarily invalidate the MPT entry.  This
2104 	 * regains ownership from the hardware, and gives us the opportunity
2105 	 * to modify the entry.  Note: The HW2SW_MPT command returns the
2106 	 * current MPT entry contents.  These are saved away here because
2107 	 * they will be reused in a later step below.  If the region has
2108 	 * bound memory windows that we fail returning an "in use" error code.
2109 	 * Otherwise, this is an unexpected error and we deregister the
2110 	 * memory region and return error.
2111 	 *
2112 	 * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
2113 	 * against holding the lock around this rereg call in all contexts.
2114 	 */
2115 	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, &mpt_entry,
2116 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
2117 	if (status != TAVOR_CMD_SUCCESS) {
2118 		mutex_exit(&mr->mr_lock);
2119 		if (status == TAVOR_CMD_REG_BOUND) {
2120 			TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2121 			return (IBT_MR_IN_USE);
2122 		} else {
2123 			cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: "
2124 			    "%08x\n", status);
2125 
2126 			/*
2127 			 * Call deregister and ensure that all current
2128 			 * resources get freed up
2129 			 */
2130 			if (tavor_mr_deregister(state, &mr,
2131 			    TAVOR_MR_DEREG_ALL, sleep) != DDI_SUCCESS) {
2132 				TAVOR_WARNING(state, "failed to deregister "
2133 				    "memory region");
2134 			}
2135 			TNF_PROBE_1(tavor_mr_common_rereg_hw2sw_mpt_cmd_fail,
2136 			    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
2137 			TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2138 			return (ibc_get_ci_failure(0));
2139 		}
2140 	}
2141 
2142 	/*
2143 	 * If we're changing the protection domain, then validate the new one
2144 	 */
2145 	if (flags & IBT_MR_CHANGE_PD) {
2146 
2147 		/* Check for valid PD handle pointer */
2148 		if (pd == NULL) {
2149 			mutex_exit(&mr->mr_lock);
2150 			/*
2151 			 * Call deregister and ensure that all current
2152 			 * resources get properly freed up. Unnecessary
2153 			 * here to attempt to regain software ownership
2154 			 * of the MPT entry as that has already been
2155 			 * done above.
2156 			 */
2157 			if (tavor_mr_deregister(state, &mr,
2158 			    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
2159 			    DDI_SUCCESS) {
2160 				TAVOR_WARNING(state, "failed to deregister "
2161 				    "memory region");
2162 			}
2163 			/* Set "status" and "errormsg" and goto failure */
2164 			TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
2165 			goto mrrereg_fail;
2166 		}
2167 
2168 		/* Use the new PD handle in all operations below */
2169 		pd_to_use = pd;
2170 
2171 	} else {
2172 		/* Use the current PD handle in all operations below */
2173 		pd_to_use = mr->mr_pdhdl;
2174 	}
2175 
2176 	/*
2177 	 * If we're changing access permissions, then validate the new ones
2178 	 */
2179 	if (flags & IBT_MR_CHANGE_ACCESS) {
2180 		/*
2181 		 * Validate the access flags.  Both remote write and remote
2182 		 * atomic require the local write flag to be set
2183 		 */
2184 		if (((flags & IBT_MR_ENABLE_REMOTE_WRITE) ||
2185 		    (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)) &&
2186 		    !(flags & IBT_MR_ENABLE_LOCAL_WRITE)) {
2187 			mutex_exit(&mr->mr_lock);
2188 			/*
2189 			 * Call deregister and ensure that all current
2190 			 * resources get properly freed up. Unnecessary
2191 			 * here to attempt to regain software ownership
2192 			 * of the MPT entry as that has already been
2193 			 * done above.
2194 			 */
2195 			if (tavor_mr_deregister(state, &mr,
2196 			    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
2197 			    DDI_SUCCESS) {
2198 				TAVOR_WARNING(state, "failed to deregister "
2199 				    "memory region");
2200 			}
2201 			/* Set "status" and "errormsg" and goto failure */
2202 			TAVOR_TNF_FAIL(IBT_MR_ACCESS_REQ_INVALID,
2203 			    "invalid access flags");
2204 			goto mrrereg_fail;
2205 		}
2206 
2207 		/*
2208 		 * Setup and validate the memory region access flags.  This
2209 		 * means translating the IBTF's enable flags into the access
2210 		 * flags that will be used in later operations.
2211 		 */
2212 		acc_flags_to_use = 0;
2213 		if (flags & IBT_MR_ENABLE_WINDOW_BIND)
2214 			acc_flags_to_use |= IBT_MR_WINDOW_BIND;
2215 		if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
2216 			acc_flags_to_use |= IBT_MR_LOCAL_WRITE;
2217 		if (flags & IBT_MR_ENABLE_REMOTE_READ)
2218 			acc_flags_to_use |= IBT_MR_REMOTE_READ;
2219 		if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
2220 			acc_flags_to_use |= IBT_MR_REMOTE_WRITE;
2221 		if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
2222 			acc_flags_to_use |= IBT_MR_REMOTE_ATOMIC;
2223 
2224 	} else {
2225 		acc_flags_to_use = mr->mr_accflag;
2226 	}
2227 
2228 	/*
2229 	 * If we're modifying the translation, then figure out whether
2230 	 * we can reuse the current MTT resources.  This means calling
2231 	 * tavor_mr_rereg_xlat_helper() which does most of the heavy lifting
2232 	 * for the reregistration.  If the current memory region contains
2233 	 * sufficient MTT entries for the new regions, then it will be
2234 	 * reused and filled in.  Otherwise, new entries will be allocated,
2235 	 * the old ones will be freed, and the new entries will be filled
2236 	 * in.  Note:  If we're not modifying the translation, then we
2237 	 * should already have all the information we need to update the MPT.
2238 	 * Also note: If tavor_mr_rereg_xlat_helper() fails, it will return
2239 	 * a "dereg_level" which is the level of cleanup that needs to be
2240 	 * passed to tavor_mr_deregister() to finish the cleanup.
2241 	 */
2242 	if (flags & IBT_MR_CHANGE_TRANSLATION) {
2243 		status = tavor_mr_rereg_xlat_helper(state, mr, bind, op,
2244 		    &mtt_addr_to_use, sleep, &dereg_level);
2245 		if (status != DDI_SUCCESS) {
2246 			mutex_exit(&mr->mr_lock);
2247 			/*
2248 			 * Call deregister and ensure that all resources get
2249 			 * properly freed up.
2250 			 */
2251 			if (tavor_mr_deregister(state, &mr, dereg_level,
2252 			    sleep) != DDI_SUCCESS) {
2253 				TAVOR_WARNING(state, "failed to deregister "
2254 				    "memory region");
2255 			}
2256 
2257 			/* Set "status" and "errormsg" and goto failure */
2258 			TAVOR_TNF_FAIL(status, "failed rereg helper");
2259 			goto mrrereg_fail;
2260 		}
2261 		vaddr_to_use = mr->mr_bindinfo.bi_addr;
2262 		len_to_use   = mr->mr_bindinfo.bi_len;
2263 	} else {
2264 		mtt_addr_to_use = (((uint64_t)mpt_entry.mttseg_addr_h << 32) |
2265 		    ((uint64_t)mpt_entry.mttseg_addr_l << 6));
2266 		vaddr_to_use = mr->mr_bindinfo.bi_addr;
2267 		len_to_use   = mr->mr_bindinfo.bi_len;
2268 	}
2269 
2270 	/*
2271 	 * Calculate new keys (Lkey, Rkey) from MPT index.  Just like they were
2272 	 * when the region was first registered, each key is formed from
2273 	 * "constrained" bits and "unconstrained" bits.  Note:  If no remote
2274 	 * access is required, then the RKey value is not filled in.  Otherwise
2275 	 * both Rkey and LKey are given the same value.
2276 	 */
2277 	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
2278 	if ((acc_flags_to_use & IBT_MR_REMOTE_READ) ||
2279 	    (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ||
2280 	    (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC)) {
2281 		mr->mr_rkey = mr->mr_lkey;
2282 	}
2283 
2284 	/*
2285 	 * Update the MPT entry with the new information.  Some of this
2286 	 * information is retained from the previous operation, some of
2287 	 * it is new based on request.
2288 	 */
2289 	mpt_entry.en_bind = (acc_flags_to_use & IBT_MR_WINDOW_BIND)   ? 1 : 0;
2290 	mpt_entry.atomic  = (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
2291 	mpt_entry.rw	  = (acc_flags_to_use & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
2292 	mpt_entry.rr	  = (acc_flags_to_use & IBT_MR_REMOTE_READ)   ? 1 : 0;
2293 	mpt_entry.lw	  = (acc_flags_to_use & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
2294 	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
2295 	mpt_entry.mem_key	= mr->mr_lkey;
2296 	mpt_entry.pd		= pd_to_use->pd_pdnum;
2297 	mpt_entry.start_addr	= vaddr_to_use;
2298 	mpt_entry.reg_win_len	= len_to_use;
2299 	mpt_entry.mttseg_addr_h = mtt_addr_to_use >> 32;
2300 	mpt_entry.mttseg_addr_l = mtt_addr_to_use >> 6;
2301 
2302 	/*
2303 	 * Write the updated MPT entry to hardware
2304 	 *
2305 	 * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
2306 	 * against holding the lock around this rereg call in all contexts.
2307 	 */
2308 	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
2309 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
2310 	if (status != TAVOR_CMD_SUCCESS) {
2311 		mutex_exit(&mr->mr_lock);
2312 		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
2313 		    status);
2314 		/*
2315 		 * Call deregister and ensure that all current resources get
2316 		 * properly freed up. Unnecessary here to attempt to regain
2317 		 * software ownership of the MPT entry as that has already
2318 		 * been done above.
2319 		 */
2320 		if (tavor_mr_deregister(state, &mr,
2321 		    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) != DDI_SUCCESS) {
2322 			TAVOR_WARNING(state, "failed to deregister memory "
2323 			    "region");
2324 		}
2325 		TNF_PROBE_1(tavor_mr_common_rereg_sw2hw_mpt_cmd_fail,
2326 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
2327 		TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2328 		return (ibc_get_ci_failure(0));
2329 	}
2330 
2331 	/*
2332 	 * If we're changing PD, then update their reference counts now.
2333 	 * This means decrementing the reference count on the old PD and
2334 	 * incrementing the reference count on the new PD.
2335 	 */
2336 	if (flags & IBT_MR_CHANGE_PD) {
2337 		tavor_pd_refcnt_dec(mr->mr_pdhdl);
2338 		tavor_pd_refcnt_inc(pd);
2339 	}
2340 
2341 	/*
2342 	 * Update the contents of the Tavor Memory Region handle to reflect
2343 	 * what has been changed.
2344 	 */
2345 	mr->mr_pdhdl	  = pd_to_use;
2346 	mr->mr_accflag	  = acc_flags_to_use;
2347 	mr->mr_is_umem	  = 0;
2348 	mr->mr_is_fmr	  = 0;
2349 	mr->mr_umemcookie = NULL;
2350 
2351 	/* New MR handle is same as the old */
2352 	*mrhdl_new = mr;
2353 	mutex_exit(&mr->mr_lock);
2354 
2355 	TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2356 	return (DDI_SUCCESS);
2357 
2358 mrrereg_fail:
2359 	TNF_PROBE_1(tavor_mr_common_rereg_fail, TAVOR_TNF_ERROR, "",
2360 	    tnf_string, msg, errormsg);
2361 	TAVOR_TNF_EXIT(tavor_mr_common_rereg);
2362 	return (status);
2363 }
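/*
 * Flow summary (descriptive, not from the original source): a
 * reregistration is effectively HW2SW_MPT (reclaim and snapshot the
 * entry), modify (PD, access flags, and/or translation), then SW2HW_MPT
 * (hand the updated entry back).  On any failure after the reclaim, the
 * region can no longer be left intact, so the code deregisters it
 * outright, choosing a "dereg_level" that skips whichever steps
 * (HW2SW_MPT, memory unbind) have already been performed.
 */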
2364 
2365 
2366 /*
2367  * tavor_mr_rereg_xlat_helper
2368  *    Context: Can be called from interrupt or base context.
2369  *    Note: This routine expects the "mr_lock" to be held when it
2370  *    is called.  Upon returning failure, this routine passes information
2371  *    about what "dereg_level" should be passed to tavor_mr_deregister().
2372  */
2373 static int
2374 tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
2375     tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
2376     uint_t sleep, uint_t *dereg_level)
2377 {
2378 	tavor_rsrc_pool_info_t	*rsrc_pool;
2379 	tavor_rsrc_t		*mtt, *mtt_refcnt;
2380 	tavor_sw_refcnt_t	*swrc_old, *swrc_new;
2381 	ddi_dma_handle_t	dmahdl;
2382 	uint64_t		nummtt_needed, nummtt_in_currrsrc, max_sz;
2383 	uint64_t		mtt_ddrbaseaddr;
2384 	uint_t			mtt_pgsize_bits, bind_type, reuse_dmahdl;
2385 	int			status;
2386 	char			*errormsg;
2387 
2388 	TAVOR_TNF_ENTER(tavor_mr_rereg_xlat_helper);
2389 
2390 	ASSERT(MUTEX_HELD(&mr->mr_lock));
2391 
2392 	/*
2393 	 * Check the "options" flag.  Currently this flag tells the driver
2394 	 * whether or not the region should be bound normally (i.e. with
2395 	 * entries written into the PCI IOMMU) or whether it should be
2396 	 * registered to bypass the IOMMU.
2397 	 */
2398 	if (op == NULL) {
2399 		bind_type = TAVOR_BINDMEM_NORMAL;
2400 	} else {
2401 		bind_type = op->mro_bind_type;
2402 	}
2403 
2404 	/*
2405 	 * Check for invalid length.  Check if the length is zero or if the
2406 	 * length is larger than the maximum configured value.  Return error
2407 	 * if it is.
2408 	 */
2409 	max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
2410 	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
2411 		/*
2412 		 * Deregister will be called upon returning failure from this
2413 		 * routine. This will ensure that all current resources get
2414 		 * properly freed up. Unnecessary to attempt to regain
2415 		 * software ownership of the MPT entry as that has already
2416 		 * been done above (in tavor_mr_reregister())
2417 		 */
2418 		*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT;
2419 
2420 		/* Set "status" and "errormsg" and goto failure */
2421 		TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
2422 		goto mrrereghelp_fail;
2423 	}
2424 
2425 	/*
2426 	 * Determine the number of pages necessary for new region and the
2427 	 * number of pages supported by the current MTT resources
2428 	 */
2429 	nummtt_needed = tavor_mr_nummtt_needed(state, bind, &mtt_pgsize_bits);
2430 	nummtt_in_currrsrc = mr->mr_mttrsrcp->tr_len >> TAVOR_MTT_SIZE_SHIFT;
2431 
2432 	/*
2433 	 * Depending on whether we have enough pages or not, the next step is
2434 	 * to fill in a set of MTT entries that reflect the new mapping.  In
2435 	 * the first case below, we already have enough entries.  This means
2436 	 * we need to unbind the memory from the previous mapping, bind the
2437 	 * memory for the new mapping, write the new MTT entries, and update
2438 	 * the mr to reflect the changes.
2439 	 * In the second case below, we do not have enough entries in the
2440 	 * current mapping.  So, in this case, we need not only to unbind the
2441 	 * current mapping, but we need to free up the MTT resources associated
2442 	 * with that mapping.  After we've successfully done that, we continue
2443 	 * by binding the new memory, allocating new MTT entries, writing the
2444 	 * new MTT entries, and updating the mr to reflect the changes.
2445 	 */
2446 
2447 	/*
2448 	 * If this region is being shared (i.e. MTT refcount != 1), then we
2449 	 * can't reuse the current MTT resources regardless of their size.
2450 	 * Instead we'll need to alloc new ones (below) just as if there
2451 	 * hadn't been enough room in the current entries.
2452 	 */
2453 	swrc_old = (tavor_sw_refcnt_t *)mr->mr_mttrefcntp->tr_addr;
2454 	if (TAVOR_MTT_IS_NOT_SHARED(swrc_old) &&
2455 	    (nummtt_needed <= nummtt_in_currrsrc)) {
2456 
2457 		/*
2458 		 * Unbind the old mapping for this memory region, but retain
2459 		 * the ddi_dma_handle_t (if possible) for reuse in the bind
2460 		 * operation below.  Note:  If original memory region was
2461 		 * bound for IOMMU bypass and the new region can not use
2462 		 * bypass, then a new DMA handle will be necessary.
2463 		 */
2464 		if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2465 			mr->mr_bindinfo.bi_free_dmahdl = 0;
2466 			tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2467 			dmahdl = mr->mr_bindinfo.bi_dmahdl;
2468 			reuse_dmahdl = 1;
2469 		} else {
2470 			tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2471 			dmahdl = NULL;
2472 			reuse_dmahdl = 0;
2473 		}
2474 
2475 		/*
2476 		 * Bind the new memory and determine the mapped addresses.
2477 		 * As described, this routine and tavor_mr_fast_mtt_write()
2478 		 * do the majority of the work for the memory registration
2479 		 * operations.  Note:  When we successfully finish the binding,
2480 		 * we will set the "bi_free_dmahdl" flag to indicate that
2481 		 * even though we may have reused the ddi_dma_handle_t we do
2482 		 * wish it to be freed up at some later time.  Note also that
2483 		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2484 		 */
2485 		bind->bi_bypass	= bind_type;
2486 		status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
2487 		if (status != DDI_SUCCESS) {
2488 			if (reuse_dmahdl) {
2489 				ddi_dma_free_handle(&dmahdl);
2490 			}
2491 
2492 			/*
2493 			 * Deregister will be called upon returning failure
2494 			 * from this routine. This will ensure that all
2495 			 * current resources get properly freed up.
2496 			 * Unnecessary to attempt to regain software ownership
2497 			 * of the MPT entry as that has already been done
2498 			 * above (in tavor_mr_reregister()).  Also unnecessary
2499 			 * to attempt to unbind the memory.
2500 			 */
2501 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2502 
2503 			/* Set "status" and "errormsg" and goto failure */
2504 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
2505 			goto mrrereghelp_fail;
2506 		}
2507 		if (reuse_dmahdl) {
2508 			bind->bi_free_dmahdl = 1;
2509 		}
2510 
2511 		/*
2512 		 * Using the new mapping, but reusing the current MTT
2513 		 * resources, write the updated entries to MTT
2514 		 */
2515 		mtt    = mr->mr_mttrsrcp;
2516 		status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
2517 		if (status != DDI_SUCCESS) {
2518 			/*
2519 			 * Deregister will be called upon returning failure
2520 			 * from this routine. This will ensure that all
2521 			 * current resources get properly freed up.
2522 			 * Unnecessary to attempt to regain software ownership
2523 			 * of the MPT entry as that has already been done
2524 			 * above (in tavor_mr_reregister()).  Also unnecessary
2525 			 * to attempt to unbind the memory.
2526 			 *
2527 			 * But we do need to unbind the newly bound memory
2528 			 * before returning.
2529 			 */
2530 			tavor_mr_mem_unbind(state, bind);
2531 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2532 
2533 			/* Set "status" and "errormsg" and goto failure */
2534 			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
2535 			    "failed write mtt");
2536 			goto mrrereghelp_fail;
2537 		}
2538 
2539 		/* Put the updated information into the Mem Region handle */
2540 		mr->mr_bindinfo	  = *bind;
2541 		mr->mr_logmttpgsz = mtt_pgsize_bits;
2542 
2543 	} else {
2544 		/*
2545 		 * Check if the memory region MTT is shared by any other MRs.
2546 		 * Since the resource may be shared between multiple memory
2547 		 * regions (as a result of a "RegisterSharedMR()" verb) it is
2548 		 * important that we not unbind any resources prematurely.
2549 		 */
2550 		if (!TAVOR_MTT_IS_SHARED(swrc_old)) {
2551 			/*
2552 			 * Unbind the old mapping for this memory region, but
2553 			 * retain the ddi_dma_handle_t for reuse in the bind
2554 			 * operation below. Note: This can only be done here
2555 			 * because the region being reregistered is not
2556 			 * currently shared.  Also if original memory region
2557 			 * was bound for IOMMU bypass and the new region can
2558 			 * not use bypass, then a new DMA handle will be
2559 			 * necessary.
2560 			 */
2561 			if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2562 				mr->mr_bindinfo.bi_free_dmahdl = 0;
2563 				tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2564 				dmahdl = mr->mr_bindinfo.bi_dmahdl;
2565 				reuse_dmahdl = 1;
2566 			} else {
2567 				tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2568 				dmahdl = NULL;
2569 				reuse_dmahdl = 0;
2570 			}
2571 		} else {
2572 			dmahdl = NULL;
2573 			reuse_dmahdl = 0;
2574 		}
2575 
2576 		/*
2577 		 * Bind the new memory and determine the mapped addresses.
2578 		 * As described, this routine and tavor_mr_fast_mtt_write()
2579 		 * do the majority of the work for the memory registration
2580 		 * operations.  Note:  When we successfully finish the binding,
2581 		 * we will set the "bi_free_dmahdl" flag to indicate that
2582 		 * even though we may have reused the ddi_dma_handle_t we do
2583 		 * wish it to be freed up at some later time.  Note also that
2584 		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2585 		 */
2586 		bind->bi_bypass	= bind_type;
2587 		status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
2588 		if (status != DDI_SUCCESS) {
2589 			if (reuse_dmahdl) {
2590 				ddi_dma_free_handle(&dmahdl);
2591 			}
2592 
2593 			/*
2594 			 * Deregister will be called upon returning failure
2595 			 * from this routine. This will ensure that all
2596 			 * current resources get properly freed up.
2597 			 * Unnecessary to attempt to regain software ownership
2598 			 * of the MPT entry as that has already been done
2599 			 * above (in tavor_mr_reregister()).  Also unnecessary
2600 			 * to attempt to unbind the memory.
2601 			 */
2602 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2603 
2604 			/* Set "status" and "errormsg" and goto failure */
2605 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
2606 			goto mrrereghelp_fail;
2607 		}
2608 		if (reuse_dmahdl) {
2609 			bind->bi_free_dmahdl = 1;
2610 		}
2611 
2612 		/*
2613 		 * Allocate the new MTT entries resource
2614 		 */
2615 		status = tavor_rsrc_alloc(state, TAVOR_MTT,
2616 		    TAVOR_NUMMTT_TO_MTTSEG(nummtt_needed), sleep, &mtt);
2617 		if (status != DDI_SUCCESS) {
2618 			/*
2619 			 * Deregister will be called upon returning failure
2620 			 * from this routine. This will ensure that all
2621 			 * current resources get properly freed up.
2622 			 * Unnecessary to attempt to regain software ownership
2623 			 * of the MPT entry as that has already been done
2624 			 * above (in tavor_mr_reregister()).  Also unnecessary
2625 			 * to attempt to unbind the memory.
2626 			 *
2627 			 * But we do need to unbind the newly bound memory
2628 			 * before returning.
2629 			 */
2630 			tavor_mr_mem_unbind(state, bind);
2631 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2632 
2633 			/* Set "status" and "errormsg" and goto failure */
2634 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
2635 			goto mrrereghelp_fail;
2636 		}
2637 
2638 		/*
2639 		 * Allocate MTT reference count (to track shared memory
2640 		 * regions).  As mentioned elsewhere above, this reference
2641 		 * count resource may never be used on the given memory region,
2642 		 * but if it is ever later registered as a "shared" memory
2643 		 * region then this resource will be necessary.  Note:  This
2644 		 * is only necessary here if the existing memory region is
2645 		 * already being shared (because otherwise we already have
2646 		 * a useable reference count resource).
2647 		 */
2648 		if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2649 			status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1,
2650 			    sleep, &mtt_refcnt);
2651 			if (status != DDI_SUCCESS) {
2652 				/*
2653 				 * Deregister will be called upon returning
2654 				 * failure from this routine. This will ensure
2655 				 * that all current resources get properly
2656 				 * freed up.  Unnecessary to attempt to regain
2657 				 * software ownership of the MPT entry as that
2658 				 * has already been done above (in
2659 				 * tavor_mr_reregister()).  Also unnecessary
2660 				 * to attempt to unbind the memory.
2661 				 *
2662 				 * But we need to unbind the newly bound
2663 				 * memory and free up the newly allocated MTT
2664 				 * entries before returning.
2665 				 */
2666 				tavor_mr_mem_unbind(state, bind);
2667 				tavor_rsrc_free(state, &mtt);
2668 				*dereg_level =
2669 				    TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2670 
2671 				/* Set "status"/"errormsg", goto failure */
2672 				TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
2673 				    "failed reference count");
2674 				goto mrrereghelp_fail;
2675 			}
2676 			swrc_new = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
2677 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_new))
2678 			TAVOR_MTT_REFCNT_INIT(swrc_new);
2679 		} else {
2680 			mtt_refcnt = mr->mr_mttrefcntp;
2681 		}
2682 
2683 		/*
2684 		 * Using the new mapping and the new MTT resources, write the
2685 		 * updated entries to MTT
2686 		 */
2687 		status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
2688 		if (status != DDI_SUCCESS) {
2689 			/*
2690 			 * Deregister will be called upon returning failure
2691 			 * from this routine. This will ensure that all
2692 			 * current resources get properly freed up.
2693 			 * Unnecessary to attempt to regain software ownership
2694 			 * of the MPT entry as that has already been done
2695 			 * above (in tavor_mr_reregister()).  Also unnecessary
2696 			 * to attempt to unbind the memory.
2697 			 *
2698 			 * But we need to unbind the newly bound memory,
2699 			 * free up the newly allocated MTT entries, and
2700 			 * (possibly) free the new MTT reference count
2701 			 * resource before returning.
2702 			 */
2703 			if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2704 				tavor_rsrc_free(state, &mtt_refcnt);
2705 			}
2706 			tavor_mr_mem_unbind(state, bind);
2707 			tavor_rsrc_free(state, &mtt);
2708 			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2709 
2710 			/* Set "status" and "errormsg" and goto failure */
2711 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed write mtt");
2712 			goto mrrereghelp_fail;
2713 		}
2714 
2715 		/*
2716 		 * Check if the memory region MTT is shared by any other MRs.
2717 		 * Since the resource may be shared between multiple memory
2718 		 * regions (as a result of a "RegisterSharedMR()" verb) it is
2719 		 * important that we not free up any resources prematurely.
2720 		 */
2721 		if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2722 			/* Decrement MTT reference count for "old" region */
2723 			(void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
2724 		} else {
2725 			/* Free up the old MTT entries resource */
2726 			tavor_rsrc_free(state, &mr->mr_mttrsrcp);
2727 		}
2728 
2729 		/* Put the updated information into the mrhdl */
2730 		mr->mr_bindinfo	  = *bind;
2731 		mr->mr_logmttpgsz = mtt_pgsize_bits;
2732 		mr->mr_mttrsrcp   = mtt;
2733 		mr->mr_mttrefcntp = mtt_refcnt;
2734 	}
2735 
2736 	/*
2737 	 * Calculate and return the updated MTT address (in the DDR address
2738 	 * space).  This will be used by the caller (tavor_mr_reregister) in
2739 	 * the updated MPT entry
2740 	 */
2741 	rsrc_pool	= &state->ts_rsrc_hdl[TAVOR_MTT];
2742 	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
2743 	*mtt_addr	= mtt_ddrbaseaddr + (mtt->tr_indx <<
2744 	    TAVOR_MTT_SIZE_SHIFT);
2745 
2746 	TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
2747 	return (DDI_SUCCESS);
2748 
2749 mrrereghelp_fail:
2750 	TNF_PROBE_1(tavor_mr_rereg_xlat_helper_fail, TAVOR_TNF_ERROR, "",
2751 	    tnf_string, msg, errormsg);
2752 	TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
2753 	return (status);
2754 }
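/*
 * Note on "dereg_level" (summary inferred from the uses in this file):
 * callers hand tavor_mr_deregister() one of three cleanup depths:
 *
 *	TAVOR_MR_DEREG_ALL			full teardown
 *	TAVOR_MR_DEREG_NO_HW2SW_MPT		skip the MPT reclaim
 *	TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND	skip reclaim and unbind
 *
 * each level omitting the steps that have already been done (or undone)
 * by the time the failure was detected.
 */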
2755 
2756 
2757 /*
2758  * tavor_mr_nummtt_needed()
2759  *    Context: Can be called from interrupt or base context.
2760  */
2761 /* ARGSUSED */
2762 static uint64_t
2763 tavor_mr_nummtt_needed(tavor_state_t *state, tavor_bind_info_t *bind,
2764     uint_t *mtt_pgsize_bits)
2765 {
2766 	uint64_t	pg_offset_mask;
2767 	uint64_t	pg_offset, tmp_length;
2768 
2769 	/*
2770 	 * For now we specify the page size as 8Kb (the default page size for
2771 	 * the sun4u architecture), or 4Kb for x86.  Figure out optimal page
2772 	 * size by examining the dmacookies XXX
2773 	 */
2774 	*mtt_pgsize_bits = PAGESHIFT;
2775 
2776 	pg_offset_mask = ((uint64_t)1 << *mtt_pgsize_bits) - 1;
2777 	pg_offset = bind->bi_addr & pg_offset_mask;
2778 	tmp_length = pg_offset + (bind->bi_len - 1);
2779 	return ((tmp_length >> *mtt_pgsize_bits) + 1);
2780 }
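/*
 * Worked example (hypothetical values): with 4KB MTT pages
 * (*mtt_pgsize_bits == 12), bi_addr == 0x10FFC and bi_len == 0x2008
 * give
 *
 *	pg_offset  = 0x10FFC & 0xFFF      = 0xFFC
 *	tmp_length = 0xFFC + (0x2008 - 1) = 0x3003
 *	nummtt     = (0x3003 >> 12) + 1   = 4
 *
 * i.e. four MTT entries, because the unaligned start and end make the
 * region touch four pages even though its length barely exceeds two.
 */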
2781 
2782 
2783 /*
2784  * tavor_mr_mem_bind()
2785  *    Context: Can be called from interrupt or base context.
2786  */
2787 static int
2788 tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
2789     ddi_dma_handle_t dmahdl, uint_t sleep)
2790 {
2791 	ddi_dma_attr_t	dma_attr;
2792 	int		(*callback)(caddr_t);
2793 	uint_t		dma_xfer_mode;
2794 	int		status;
2795 
2796 	/* bi_type must be set to a meaningful value to get a bind handle */
2797 	ASSERT(bind->bi_type == TAVOR_BINDHDL_VADDR ||
2798 	    bind->bi_type == TAVOR_BINDHDL_BUF ||
2799 	    bind->bi_type == TAVOR_BINDHDL_UBUF);
2800 
2801 	TAVOR_TNF_ENTER(tavor_mr_mem_bind);
2802 
2803 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2804 
2805 	/* Set the callback flag appropriately */
2806 	callback = (sleep == TAVOR_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT;
2807 
2808 	/* Determine whether to map STREAMING or CONSISTENT */
2809 	dma_xfer_mode = (bind->bi_flags & IBT_MR_NONCOHERENT) ?
2810 	    DDI_DMA_STREAMING : DDI_DMA_CONSISTENT;
2811 
2812 	/*
2813 	 * Initialize many of the default DMA attributes.  Then, if we're
2814 	 * bypassing the IOMMU, set the DDI_DMA_FORCE_PHYSICAL flag.
2815 	 */
2816 	if (dmahdl == NULL) {
2817 		tavor_dma_attr_init(&dma_attr);
2818 #ifdef	__sparc
2819 		/*
2820 		 * First, disable streaming and switch to consistent if
2821 		 * configured to do so and IOMMU BYPASS is enabled.
2822 		 */
2823 		if (state->ts_cfg_profile->cp_disable_streaming_on_bypass &&
2824 		    dma_xfer_mode == DDI_DMA_STREAMING &&
2825 		    bind->bi_bypass == TAVOR_BINDMEM_BYPASS) {
2826 			dma_xfer_mode = DDI_DMA_CONSISTENT;
2827 		}
2828 
2829 		/*
2830 		 * Then, if streaming is still specified, "bypass" is not allowed;
2831 		 * DDI_DMA_FORCE_PHYSICAL is requested only for consistent mode.
2832 		 */
2833 		if ((dma_xfer_mode == DDI_DMA_CONSISTENT) &&
2834 		    (bind->bi_bypass == TAVOR_BINDMEM_BYPASS)) {
2835 			dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2836 		}
2837 #endif
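		/*
		 * Summary of the sparc-only logic above (descriptive
		 * note): a STREAMING + BYPASS request is first downgraded
		 * to CONSISTENT if the "disable streaming on bypass"
		 * tunable is set; then DDI_DMA_FORCE_PHYSICAL (true IOMMU
		 * bypass) is requested only when the final transfer mode
		 * is CONSISTENT.  A request that remains STREAMING never
		 * uses bypass.
		 */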
2838 		/* Allocate a DMA handle for the binding */
2839 		status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr,
2840 		    callback, NULL, &bind->bi_dmahdl);
2841 		if (status != DDI_SUCCESS) {
2842 			TNF_PROBE_0(tavor_mr_mem_bind_dmahdl_fail,
2843 			    TAVOR_TNF_ERROR, "");
2844 			TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2845 			return (status);
2846 		}
2847 		bind->bi_free_dmahdl = 1;
2848 
2849 	} else  {
2850 		bind->bi_dmahdl = dmahdl;
2851 		bind->bi_free_dmahdl = 0;
2852 	}
2853 
2854 	/*
2855 	 * Bind the memory to get the PCI mapped addresses.  The decision
2856 	 * to call ddi_dma_addr_bind_handle() or ddi_dma_buf_bind_handle()
2857 	 * is determined by the "bi_type" flag.  Note: if the bind operation
2858 	 * fails then we have to free up the DMA handle and return error.
2859 	 */
2860 	if (bind->bi_type == TAVOR_BINDHDL_VADDR) {
2861 		status = ddi_dma_addr_bind_handle(bind->bi_dmahdl, NULL,
2862 		    (caddr_t)(uintptr_t)bind->bi_addr, bind->bi_len,
2863 		    (DDI_DMA_RDWR | dma_xfer_mode), callback, NULL,
2864 		    &bind->bi_dmacookie, &bind->bi_cookiecnt);
2865 	} else { /* TAVOR_BINDHDL_BUF || TAVOR_BINDHDL_UBUF */
2866 		status = ddi_dma_buf_bind_handle(bind->bi_dmahdl,
2867 		    bind->bi_buf, (DDI_DMA_RDWR | dma_xfer_mode), callback,
2868 		    NULL, &bind->bi_dmacookie, &bind->bi_cookiecnt);
2869 	}
2870 
2871 	if (status != DDI_DMA_MAPPED) {
2872 		if (bind->bi_free_dmahdl != 0) {
2873 			ddi_dma_free_handle(&bind->bi_dmahdl);
2874 		}
2875 		TNF_PROBE_0(tavor_mr_mem_bind_dmabind_fail, TAVOR_TNF_ERROR,
2876 		    "");
2877 		TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2878 		return (status);
2879 	}
2880 
2881 	TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2882 	return (DDI_SUCCESS);
2883 }
2884 
2885 
2886 /*
2887  * tavor_mr_mem_unbind()
2888  *    Context: Can be called from interrupt or base context.
2889  */
2890 static void
2891 tavor_mr_mem_unbind(tavor_state_t *state, tavor_bind_info_t *bind)
2892 {
2893 	int	status;
2894 
2895 	TAVOR_TNF_ENTER(tavor_mr_mem_unbind);
2896 
2897 	/*
2898 	 * In the case of TAVOR_BINDHDL_UBUF, the memory that bi_buf points
2899 	 * to was actually allocated internally by ddi_umem_iosetup(), so it
2900 	 * must be freed here.  Reset bi_type to TAVOR_BINDHDL_NONE so that
2901 	 * it is not freed again later.
2902 	 */
2903 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2904 	if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
2905 		freerbuf(bind->bi_buf);
2906 		bind->bi_type = TAVOR_BINDHDL_NONE;
2907 	}
2908 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
2909 
2910 	/*
2911 	 * Unbind the DMA memory for the region
2912 	 *
2913 	 * Note: The only way ddi_dma_unbind_handle() currently
2914 	 * can return an error is if the handle passed in is invalid.
2915 	 * Since this should never happen, we choose to return void
2916 	 * from this function!  If this does return an error, however,
2917 	 * then we print a warning message to the console.
2918 	 */
2919 	status = ddi_dma_unbind_handle(bind->bi_dmahdl);
2920 	if (status != DDI_SUCCESS) {
2921 		TAVOR_WARNING(state, "failed to unbind DMA mapping");
2922 		TNF_PROBE_0(tavor_mr_mem_unbind_dmaunbind_fail,
2923 		    TAVOR_TNF_ERROR, "");
2924 		TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2925 		return;
2926 	}
2927 
2928 	/* Free up the DMA handle */
2929 	if (bind->bi_free_dmahdl != 0) {
2930 		ddi_dma_free_handle(&bind->bi_dmahdl);
2931 	}
2932 
2933 	TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2934 }
2935 
2936 
2937 /*
2938  * tavor_mr_fast_mtt_write()
2939  *    Context: Can be called from interrupt or base context.
2940  */
2941 static int
2942 tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
2943     uint32_t mtt_pgsize_bits)
2944 {
2945 	ddi_dma_cookie_t	dmacookie;
2946 	uint_t			cookie_cnt;
2947 	uint64_t		*mtt_table;
2948 	uint64_t		mtt_entry;
2949 	uint64_t		addr, endaddr;
2950 	uint64_t		pagesize;
2951 	int			i;
2952 
2953 	TAVOR_TNF_ENTER(tavor_mr_fast_mtt_write);
2954 
2955 	/* Calculate page size from the suggested value passed in */
2956 	pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2957 
2958 	/*
2959 	 * Walk the "cookie list" and fill in the MTT table entries
2960 	 */
2961 	i = 0;
2962 	mtt_table  = (uint64_t *)mtt->tr_addr;
2963 	dmacookie  = bind->bi_dmacookie;
2964 	cookie_cnt = bind->bi_cookiecnt;
2965 	while (cookie_cnt-- > 0) {
2966 		addr	= dmacookie.dmac_laddress;
2967 		endaddr = addr + (dmacookie.dmac_size - 1);
2968 		addr	= addr & ~((uint64_t)pagesize - 1);
2969 		while (addr <= endaddr) {
2970 			/*
2971 			 * Fill in the mapped addresses (calculated above) and
2972 			 * set TAVOR_MTT_ENTRY_PRESET flag for each MTT entry.
2973 			 */
2974 			mtt_entry = addr | TAVOR_MTT_ENTRY_PRESET;
2975 			ddi_put64(mtt->tr_acchdl, &mtt_table[i], mtt_entry);
2976 			addr += pagesize;
2977 			i++;
2978 
2979 			if (addr == 0) {
2980 				static int do_once = 1;
2981 				_NOTE(SCHEME_PROTECTS_DATA("safe sharing",
2982 				    do_once))
2983 				if (do_once) {
2984 					do_once = 0;
2985 					cmn_err(CE_NOTE, "probable error in "
2986 					    "dma_cookie address from caller\n");
2987 				}
2988 				break;
2989 			}
2990 		}
2991 
2992 		/*
2993 		 * When we've reached the end of the current DMA cookie,
2994 		 * jump to the next cookie (if there are more)
2995 		 */
2996 		if (cookie_cnt != 0) {
2997 			ddi_dma_nextcookie(bind->bi_dmahdl, &dmacookie);
2998 		}
2999 	}
3000 
3001 	TAVOR_TNF_EXIT(tavor_mr_fast_mtt_write);
3002 	return (DDI_SUCCESS);
3003 }
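/*
 * Worked example (hypothetical cookie, 8KB MTT pages): a cookie with
 * dmac_laddress == 0x70001FC0 and dmac_size == 0x5000 covers
 * [0x70001FC0, 0x70006FBF].  Rounding the start down to a page boundary
 * gives 0x70000000, so four entries are written:
 *
 *	mtt_table[0] = 0x70000000 | TAVOR_MTT_ENTRY_PRESET
 *	mtt_table[1] = 0x70002000 | TAVOR_MTT_ENTRY_PRESET
 *	mtt_table[2] = 0x70004000 | TAVOR_MTT_ENTRY_PRESET
 *	mtt_table[3] = 0x70006000 | TAVOR_MTT_ENTRY_PRESET
 */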
3004 
3005 /*
3006  * tavor_mr_fast_mtt_write_fmr()
3007  *    Context: Can be called from interrupt or base context.
3008  */
3009 static int
3010 tavor_mr_fast_mtt_write_fmr(tavor_rsrc_t *mtt, ibt_pmr_attr_t *mem_pattr,
3011     uint32_t mtt_pgsize_bits)
3012 {
3013 	uint64_t		*mtt_table;
3014 	ibt_phys_addr_t		*buf;
3015 	uint64_t		mtt_entry;
3016 	uint64_t		addr, first_addr, endaddr;
3017 	uint64_t		pagesize;
3018 	int			i;
3019 
3020 	TAVOR_TNF_ENTER(tavor_mr_fast_mtt_write_fmr);
3021 
3022 	/* Calculate page size from the suggested value passed in */
3023 	pagesize = ((uint64_t)1 << mtt_pgsize_bits);
3024 
3025 	/*
3026 	 * Walk the "buf list" and fill in the MTT table entries
3027 	 */
3028 	mtt_table  = (uint64_t *)mtt->tr_addr;
3029 	for (i = 0; i < mem_pattr->pmr_num_buf; i++) {
3030 		buf = &mem_pattr->pmr_addr_list[i];
3031 
3032 		/*
3033 		 * For first cookie, use the offset field to determine where
3034 		 * the buffer starts.  The end addr is then calculated with the
3035 		 * offset in mind.
3036 		 */
3037 		if (i == 0) {
3038 			first_addr = addr = buf->p_laddr +
3039 			    mem_pattr->pmr_offset;
3040 			endaddr = addr + (mem_pattr->pmr_buf_sz - 1) -
3041 			    mem_pattr->pmr_offset;
3042 		/*
3043 		 * For last cookie, determine end addr based on starting
3044 		 * address and size of the total buffer
3045 		 */
3046 		} else if (i == mem_pattr->pmr_num_buf - 1) {
3047 			addr = buf->p_laddr;
3048 			endaddr = addr + ((first_addr + mem_pattr->pmr_len) &
3049 			    (mem_pattr->pmr_buf_sz - 1));
3050 		/*
3051 		 * For the middle cookies case, start and end addr are
3052 		 * straightforward.  Just use the laddr, and the size, as all
3053 		 * middle cookies are a set size.
3054 		 */
3055 		} else {
3056 			addr = buf->p_laddr;
3057 			endaddr = addr + (mem_pattr->pmr_buf_sz - 1);
3058 		}
3059 
3060 		addr	= addr & ~((uint64_t)pagesize - 1);
3061 		while (addr <= endaddr) {
3062 			/*
3063 			 * Fill in the mapped addresses (calculated above) and
3064 			 * set TAVOR_MTT_ENTRY_PRESET flag for each MTT entry.
3065 			 */
3066 			mtt_entry = addr | TAVOR_MTT_ENTRY_PRESET;
3067 			ddi_put64(mtt->tr_acchdl, &mtt_table[i], mtt_entry);
3068 			addr += pagesize;
3069 		}
3070 	}
3071 
3072 	TAVOR_TNF_EXIT(tavor_mr_fast_mtt_write_fmr);
3073 	return (DDI_SUCCESS);
3074 }
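/*
 * Note (descriptive, inferred from the loop structure above): the MTT
 * table index i advances once per buffer, not once per page, so the
 * inner while loop overwrites the same entry whenever a buffer spans
 * more than one MTT page.  The routine therefore appears to assume
 * page-sized FMR buffers (pmr_buf_sz equal to the MTT page size), the
 * only case in which each buffer contributes exactly one entry.
 */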
3075 
3076 
3077 /*
3078  * tavor_mtt_refcnt_inc()
3079  *    Context: Can be called from interrupt or base context.
3080  */
3081 static int
3082 tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc)
3083 {
3084 	tavor_sw_refcnt_t *rc;
3085 	uint32_t	  cnt;
3086 
3087 	rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
3088 
3089 	/* Increment the MTT's reference count */
3090 	mutex_enter(&rc->swrc_lock);
3091 	TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_inc, TAVOR_TNF_TRACE, "",
3092 	    tnf_uint, refcnt, rc->swrc_refcnt);
3093 	cnt = rc->swrc_refcnt++;
3094 	mutex_exit(&rc->swrc_lock);
3095 
3096 	return (cnt);
3097 }
3098 
3099 
3100 /*
3101  * tavor_mtt_refcnt_dec()
3102  *    Context: Can be called from interrupt or base context.
3103  */
3104 static int
3105 tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc)
3106 {
3107 	tavor_sw_refcnt_t *rc;
3108 	uint32_t	  cnt;
3109 
3110 	rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
3111 
3112 	/* Decrement the MTT's reference count */
3113 	mutex_enter(&rc->swrc_lock);
3114 	cnt = --rc->swrc_refcnt;
3115 	TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_dec, TAVOR_TNF_TRACE, "",
3116 	    tnf_uint, refcnt, rc->swrc_refcnt);
3117 	mutex_exit(&rc->swrc_lock);
3118 
3119 	return (cnt);
3120 }
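/*
 * Note (descriptive): the two helpers above intentionally return
 * different snapshots of the count.  tavor_mtt_refcnt_inc() returns the
 * value *before* the increment (post-increment), while
 * tavor_mtt_refcnt_dec() returns the value *after* the decrement
 * (pre-decrement).  A return of zero from the decrement therefore means
 * the MTT resources are no longer referenced by any memory region.
 */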
3121