xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/tavor/tavor_mr.c (revision 17a2b317610f531d565bf4e940433aab2d9e6985)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * tavor_mr.c
28  *    Tavor Memory Region/Window Routines
29  *
30  *    Implements all the routines necessary to provide the requisite memory
31  *    registration verbs.  These include operations like RegisterMemRegion(),
32  *    DeregisterMemRegion(), ReregisterMemRegion, RegisterSharedMemRegion,
33  *    etc., that affect Memory Regions.  It also includes the verbs that
34  *    affect Memory Windows, including AllocMemWindow(), FreeMemWindow(),
35  *    and QueryMemWindow().
36  */
37 
38 #include <sys/types.h>
39 #include <sys/conf.h>
40 #include <sys/ddi.h>
41 #include <sys/sunddi.h>
42 #include <sys/modctl.h>
43 #include <sys/esunddi.h>
44 
45 #include <sys/ib/adapters/tavor/tavor.h>
46 
47 
48 /*
49  * Used by tavor_mr_keycalc() below to fill in the "unconstrained" portion
50  * of Tavor memory keys (LKeys and RKeys)
51  */
52 static uint_t tavor_debug_memkey_cnt = 0x00000000;
53 
54 static int tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
55     tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op);
56 static int tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
57     tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
58     tavor_mr_options_t *op);
59 static int tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
60     tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
61     uint_t sleep, uint_t *dereg_level);
62 static uint64_t tavor_mr_nummtt_needed(tavor_state_t *state,
63     tavor_bind_info_t *bind, uint_t *mtt_pgsize);
64 static int tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
65     ddi_dma_handle_t dmahdl, uint_t sleep);
66 static void tavor_mr_mem_unbind(tavor_state_t *state,
67     tavor_bind_info_t *bind);
68 static int tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
69     uint32_t mtt_pgsize_bits);
70 static int tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc);
71 static int tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc);
72 
/*
 * The Tavor umem_lockmemory() callback ops.  When userland memory is
 * registered, these callback ops are specified.  The tavor_umap_umemlock_cb()
 * callback will be called whenever the memory for the corresponding
 * ddi_umem_cookie_t is being freed (e.g. process exit or unmap), allowing
 * the driver to tear down the registration before the pages go away.
 */
static struct umem_callback_ops tavor_umem_cbops = {
	UMEM_CALLBACK_VERSION,
	tavor_umap_umemlock_cb,
};
83 
84 
85 /*
86  * tavor_mr_register()
87  *    Context: Can be called from interrupt or base context.
88  */
89 int
90 tavor_mr_register(tavor_state_t *state, tavor_pdhdl_t pd,
91     ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
92 {
93 	tavor_bind_info_t	bind;
94 	int			status;
95 
96 	TAVOR_TNF_ENTER(tavor_mr_register);
97 
98 	/*
99 	 * Fill in the "bind" struct.  This struct provides the majority
100 	 * of the information that will be used to distinguish between an
101 	 * "addr" binding (as is the case here) and a "buf" binding (see
102 	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
103 	 * which does most of the "heavy lifting" for the Tavor memory
104 	 * registration routines.
105 	 */
106 	bind.bi_type  = TAVOR_BINDHDL_VADDR;
107 	bind.bi_addr  = mr_attr->mr_vaddr;
108 	bind.bi_len   = mr_attr->mr_len;
109 	bind.bi_as    = mr_attr->mr_as;
110 	bind.bi_flags = mr_attr->mr_flags;
111 	status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
112 	if (status != DDI_SUCCESS) {
113 		TNF_PROBE_0(tavor_mr_register_cmnreg_fail,
114 		    TAVOR_TNF_ERROR, "");
115 		TAVOR_TNF_EXIT(tavor_mr_register);
116 		return (status);
117 	}
118 
119 	TAVOR_TNF_EXIT(tavor_mr_register);
120 	return (DDI_SUCCESS);
121 }
122 
123 
124 /*
125  * tavor_mr_register_buf()
126  *    Context: Can be called from interrupt or base context.
127  */
128 int
129 tavor_mr_register_buf(tavor_state_t *state, tavor_pdhdl_t pd,
130     ibt_smr_attr_t *mr_attr, struct buf *buf, tavor_mrhdl_t *mrhdl,
131     tavor_mr_options_t *op)
132 {
133 	tavor_bind_info_t	bind;
134 	int			status;
135 
136 	TAVOR_TNF_ENTER(tavor_mr_register_buf);
137 
138 	/*
139 	 * Fill in the "bind" struct.  This struct provides the majority
140 	 * of the information that will be used to distinguish between an
141 	 * "addr" binding (see above) and a "buf" binding (as is the case
142 	 * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
143 	 * which does most of the "heavy lifting" for the Tavor memory
144 	 * registration routines.  Note: We have chosen to provide
145 	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
146 	 * not set).  It is not critical what value we choose here as it need
147 	 * only be unique for the given RKey (which will happen by default),
148 	 * so the choice here is somewhat arbitrary.
149 	 */
150 	bind.bi_type  = TAVOR_BINDHDL_BUF;
151 	bind.bi_buf   = buf;
152 	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
153 		bind.bi_addr  = mr_attr->mr_vaddr;
154 	} else {
155 		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
156 	}
157 	bind.bi_as    = NULL;
158 	bind.bi_len   = (uint64_t)buf->b_bcount;
159 	bind.bi_flags = mr_attr->mr_flags;
160 	status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);
161 	if (status != DDI_SUCCESS) {
162 		TNF_PROBE_0(tavor_mr_register_buf_cmnreg_fail,
163 		    TAVOR_TNF_ERROR, "");
164 		TAVOR_TNF_EXIT(tavor_mr_register_buf);
165 		return (status);
166 	}
167 
168 	TAVOR_TNF_EXIT(tavor_mr_register_buf);
169 	return (DDI_SUCCESS);
170 }
171 
172 
/*
 * tavor_mr_register_shared()
 *    Context: Can be called from interrupt or base context.
 *
 * Implements "RegisterSharedMR": creates a new memory region (returned
 * through "mrhdl_new") that shares the MTT entries of the existing region
 * "mrhdl", but with its own MPT entry, keys, access flags, and protection
 * domain "pd".  On failure the IBTF status code is returned and all
 * partially-allocated resources are unwound via the mrshared_fail* labels.
 *
 * Note: TAVOR_TNF_FAIL() sets the local "status" and "errormsg" variables
 * that are consumed at the "mrshared_fail" label below.
 */
int
tavor_mr_register_shared(tavor_state_t *state, tavor_mrhdl_t mrhdl,
    tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*mpt, *mtt, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mrhdl_t		mr;
	tavor_bind_info_t	*bind;
	ddi_umem_cookie_t	umem_cookie;
	size_t			umem_len;
	caddr_t			umem_addr;
	uint64_t		mtt_addr, mtt_ddrbaseaddr, pgsize_msk;
	uint_t			sleep, mr_is_umem;
	int			status, umem_flags;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_register_shared);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (mr_attr->mr_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP :
	    TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mrshared_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry.  This will be filled in with all the
	 * necessary parameters to define the shared memory region.
	 * Specifically, it will be made to reference the currently existing
	 * MTT entries and ownership of the MPT will be passed to the hardware
	 * in the last step below.  If we fail here, we must undo the
	 * protection domain reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
		goto mrshared_fail1;
	}

	/*
	 * Allocate the software structure for tracking the shared memory
	 * region (i.e. the Tavor Memory Region handle).  If we fail here, we
	 * must undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
		goto mrshared_fail2;
	}
	mr = (tavor_mrhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/*
	 * Setup and validate the memory region access flags.  This means
	 * translating the IBTF's enable flags into the access flags that
	 * will be used in later operations.
	 */
	mr->mr_accflag = 0;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_WINDOW_BIND)
		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_READ)
		mr->mr_accflag |= IBT_MR_REMOTE_READ;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/* Grab the MR lock for the current memory region */
	mutex_enter(&mrhdl->mr_lock);

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
	 * If so, this is an error, return failure.
	 */
	if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
		mutex_exit(&mrhdl->mr_lock);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
		goto mrshared_fail3;
	}

	/*
	 * Determine if the original memory was from userland and, if so, pin
	 * the pages (again) with umem_lockmemory().  This will guarantee a
	 * separate callback for each of this shared region's MR handles.
	 * If this is userland memory, then allocate an entry in the
	 * "userland resources database".  This will later be added to
	 * the database (after all further memory registration operations are
	 * successful).  If we fail here, we must undo all the above setup.
	 */
	mr_is_umem = mrhdl->mr_is_umem;
	if (mr_is_umem) {
		/*
		 * Round the original binding out to whole pages: length is
		 * rounded up to cover the tail page, and the start address
		 * is rounded down to its page boundary.
		 */
		umem_len   = ptob(btopr(mrhdl->mr_bindinfo.bi_len +
		    ((uintptr_t)mrhdl->mr_bindinfo.bi_addr & PAGEOFFSET)));
		umem_addr  = (caddr_t)((uintptr_t)mrhdl->mr_bindinfo.bi_addr &
		    ~PAGEOFFSET);
		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
		    DDI_UMEMLOCK_LONGTERM);
		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
		    &umem_cookie, &tavor_umem_cbops, NULL);
		if (status != 0) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
			goto mrshared_fail3;
		}

		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto mrshared_fail4;
		}
	}

	/*
	 * Copy the MTT resource pointer (and additional parameters) from
	 * the original Tavor Memory Region handle.  Note: this is normally
	 * where the tavor_mr_mem_bind() routine would be called, but because
	 * we already have bound and filled-in MTT entries it is simply a
	 * matter here of managing the MTT reference count and grabbing the
	 * address of the MTT table entries (for filling in the shared region's
	 * MPT entry).
	 */
	mr->mr_mttrsrcp	  = mrhdl->mr_mttrsrcp;
	mr->mr_logmttpgsz = mrhdl->mr_logmttpgsz;
	mr->mr_bindinfo	  = mrhdl->mr_bindinfo;
	mr->mr_mttrefcntp = mrhdl->mr_mttrefcntp;
	mutex_exit(&mrhdl->mr_lock);
	bind = &mr->mr_bindinfo;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
	mtt = mr->mr_mttrsrcp;

	/*
	 * Increment the MTT reference count (to reflect the fact that
	 * the MTT is now shared)
	 */
	(void) tavor_mtt_refcnt_inc(mr->mr_mttrefcntp);

	/*
	 * Update the new "bind" virtual address.  Do some extra work here
	 * to ensure proper alignment.  That is, make sure that the page
	 * offset for the beginning of the old range is the same as the
	 * offset for this new mapping
	 */
	pgsize_msk = (((uint64_t)1 << mr->mr_logmttpgsz) - 1);
	bind->bi_addr = ((mr_attr->mr_vaddr & ~pgsize_msk) |
	    (mr->mr_bindinfo.bi_addr & pgsize_msk));

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * in the next step when we are setting up the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.lr	  = 1;
	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
	/* page_sz is encoded relative to a 4KB (2^12 = 0x1000) base page */
	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
	mpt_entry.mem_key	= mr->mr_lkey;
	mpt_entry.pd		= pd->pd_pdnum;
	mpt_entry.start_addr	= bind->bi_addr;
	mpt_entry.reg_win_len	= bind->bi_len;
	mpt_entry.win_cnt_limit	= TAVOR_UNLIMITED_WIN_BIND;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mr_register_shared_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "tavor SW2HW_MPT command");
		goto mrshared_fail5;
	}

	/*
	 * Fill in the rest of the Tavor Memory Region handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MR.
	 */
	mr->mr_mptrsrcp	  = mpt;
	mr->mr_mttrsrcp	  = mtt;
	mr->mr_pdhdl	  = pd;
	mr->mr_rsrcp	  = rsrc;
	mr->mr_is_umem	  = mr_is_umem;
	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
	mr->mr_umem_cbfunc = NULL;
	mr->mr_umem_cbarg1 = NULL;
	mr->mr_umem_cbarg2 = NULL;

	/*
	 * If this is userland memory, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later coordination between the tavor_umap_umemlock_cb()
	 * callback and tavor_mr_deregister().
	 */
	if (mr_is_umem) {
		tavor_umap_db_add(umapdb);
	}

	*mrhdl_new = mr;

	TAVOR_TNF_EXIT(tavor_mr_register_shared);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
mrshared_fail5:
	(void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
	if (mr_is_umem) {
		tavor_umap_db_free(umapdb);
	}
mrshared_fail4:
	if (mr_is_umem) {
		ddi_umem_unlock(umem_cookie);
	}
mrshared_fail3:
	tavor_rsrc_free(state, &rsrc);
mrshared_fail2:
	tavor_rsrc_free(state, &mpt);
mrshared_fail1:
	tavor_pd_refcnt_dec(pd);
mrshared_fail:
	TNF_PROBE_1(tavor_mr_register_shared_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mr_register_shared);
	return (status);
}
468 
469 
/*
 * tavor_mr_deregister()
 *    Context: Can be called from interrupt or base context.
 *
 * Deregisters the memory region "*mrhdl" and frees its resources.  The
 * "level" argument controls how much teardown is performed (as used below:
 * TAVOR_MR_DEREG_ALL also reclaims MPT ownership from the hardware, and
 * TAVOR_MR_DEREG_NO_HW2SW_MPT and above unbind the memory) so that the
 * reregister path can reuse this routine for partial teardown.  On success
 * "*mrhdl" is set to NULL.
 *
 * Note: TAVOR_TNF_FAIL() sets the local "status" and "errormsg" variables.
 */
/* ARGSUSED */
int
tavor_mr_deregister(tavor_state_t *state, tavor_mrhdl_t *mrhdl, uint_t level,
    uint_t sleep)
{
	tavor_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
	tavor_umap_db_entry_t	*umapdb;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_bind_info_t	*bind;
	uint64_t		value;
	int			status, shared_mtt;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_deregister);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
		TNF_PROBE_1(tavor_mr_deregister_fail, TAVOR_TNF_ERROR, "",
		    tnf_string, msg, errormsg);
		TAVOR_TNF_EXIT(tavor_mr_deregister);
		return (status);
	}

	/*
	 * Pull all the necessary information from the Tavor Memory Region
	 * handle.  This is necessary here because the resource for the
	 * MR handle is going to be freed up as part of the this
	 * deregistration
	 */
	mr	= *mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt	= mr->mr_mptrsrcp;
	mtt	= mr->mr_mttrsrcp;
	mtt_refcnt = mr->mr_mttrefcntp;
	rsrc	= mr->mr_rsrcp;
	pd	= mr->mr_pdhdl;
	bind	= &mr->mr_bindinfo;

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of the tavor_umap_umemlock_cb() callback.
	 * If so, then jump to the end and free the remaining resources.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		goto mrdereg_finish_cleanup;
	}

	/*
	 * We must drop the "mr_lock" here to ensure that both SLEEP and
	 * NOSLEEP calls into the firmware work as expected.  Also, if two
	 * threads are attemping to access this MR (via de-register,
	 * re-register, or otherwise), then we allow the firmware to enforce
	 * the checking, that only one deregister is valid.
	 */
	mutex_exit(&mr->mr_lock);

	/*
	 * Reclaim MPT entry from hardware (if necessary).  Since the
	 * tavor_mr_deregister() routine is used in the memory region
	 * reregistration process as well, it is possible that we will
	 * not always wish to reclaim ownership of the MPT.  Check the
	 * "level" arg and, if necessary, attempt to reclaim it.  If
	 * the ownership transfer fails for any reason, we check to see
	 * what command status was returned from the hardware.  The only
	 * "expected" error status is the one that indicates an attempt to
	 * deregister a memory region that has memory windows bound to it
	 */
	if (level >= TAVOR_MR_DEREG_ALL) {
		status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT,
		    NULL, 0, mpt->tr_indx, sleep);
		if (status != TAVOR_CMD_SUCCESS) {
			if (status == TAVOR_CMD_REG_BOUND) {
				/* Windows still bound: caller must retry */
				TAVOR_TNF_EXIT(tavor_mr_deregister);
				return (IBT_MR_IN_USE);
			} else {
				cmn_err(CE_CONT, "Tavor: HW2SW_MPT command "
				    "failed: %08x\n", status);
				TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_mr_deregister);
				return (IBT_INVALID_PARAM);
			}
		}
	}

	/*
	 * Re-grab the mr_lock here.  Since further access to the protected
	 * 'mr' structure is needed, and we would have returned previously for
	 * the multiple deregistration case, we can safely grab the lock here.
	 */
	mutex_enter(&mr->mr_lock);

	/*
	 * If the memory had come from userland, then we do a lookup in the
	 * "userland resources database".  On success, we free the entry, call
	 * ddi_umem_unlock(), and continue the cleanup.  On failure (which is
	 * an indication that the umem_lockmemory() callback has called
	 * tavor_mr_deregister()), we call ddi_umem_unlock() and invalidate
	 * the "mr_umemcookie" field in the MR handle (this will be used
	 * later to detect that only partial cleaup still remains to be done
	 * on the MR handle).
	 */
	if (mr->mr_is_umem) {
		status = tavor_umap_db_find(state->ts_instance,
		    (uint64_t)(uintptr_t)mr->mr_umemcookie,
		    MLNX_UMAP_MRMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status == DDI_SUCCESS) {
			tavor_umap_db_free(umapdb);
			ddi_umem_unlock(mr->mr_umemcookie);
		} else {
			ddi_umem_unlock(mr->mr_umemcookie);
			mr->mr_umemcookie = NULL;
		}
	}

	/* mtt_refcnt is NULL in the case of tavor_dma_mr_register() */
	if (mtt_refcnt != NULL) {
		/*
		 * Decrement the MTT reference count.  Since the MTT resource
		 * may be shared between multiple memory regions (as a result
		 * of a "RegisterSharedMR" verb) it is important that we not
		 * free up or unbind resources prematurely.  If it's not shared
		 * (as indicated by the return status), then free the resource.
		 */
		shared_mtt = tavor_mtt_refcnt_dec(mtt_refcnt);
		if (!shared_mtt) {
			tavor_rsrc_free(state, &mtt_refcnt);
		}

		/*
		 * Free up the MTT entries and unbind the memory.  Here,
		 * as above, we attempt to free these resources only if
		 * it is appropriate to do so.
		 */
		if (!shared_mtt) {
			if (level >= TAVOR_MR_DEREG_NO_HW2SW_MPT) {
				tavor_mr_mem_unbind(state, bind);
			}
			tavor_rsrc_free(state, &mtt);
		}
	}

	/*
	 * If the MR handle has been invalidated, then drop the
	 * lock and return success.  Note: This only happens because
	 * the umem_lockmemory() callback has been triggered.  The
	 * cleanup here is partial, and further cleanup (in a
	 * subsequent tavor_mr_deregister() call) will be necessary.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		mutex_exit(&mr->mr_lock);
		TAVOR_TNF_EXIT(tavor_mr_deregister);
		return (DDI_SUCCESS);
	}

mrdereg_finish_cleanup:
	mutex_exit(&mr->mr_lock);

	/* Free the Tavor Memory Region handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the MPT entry resource */
	tavor_rsrc_free(state, &mpt);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the mrhdl pointer to NULL and return success */
	*mrhdl = NULL;

	TAVOR_TNF_EXIT(tavor_mr_deregister);
	return (DDI_SUCCESS);
}
657 
658 
659 /*
660  * tavor_mr_query()
661  *    Context: Can be called from interrupt or base context.
662  */
663 /* ARGSUSED */
664 int
665 tavor_mr_query(tavor_state_t *state, tavor_mrhdl_t mr,
666     ibt_mr_query_attr_t *attr)
667 {
668 	TAVOR_TNF_ENTER(tavor_mr_query);
669 
670 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr))
671 
672 	mutex_enter(&mr->mr_lock);
673 
674 	/*
675 	 * Check here to see if the memory region has already been partially
676 	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
677 	 * If so, this is an error, return failure.
678 	 */
679 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
680 		mutex_exit(&mr->mr_lock);
681 		TNF_PROBE_0(tavor_mr_query_inv_mrhdl_fail, TAVOR_TNF_ERROR, "");
682 		TAVOR_TNF_EXIT(tavor_mr_query);
683 		return (IBT_MR_HDL_INVALID);
684 	}
685 
686 	/* Fill in the queried attributes */
687 	attr->mr_attr_flags = mr->mr_accflag;
688 	attr->mr_pd	= (ibt_pd_hdl_t)mr->mr_pdhdl;
689 
690 	/* Fill in the "local" attributes */
691 	attr->mr_lkey = (ibt_lkey_t)mr->mr_lkey;
692 	attr->mr_lbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
693 	attr->mr_lbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
694 
695 	/*
696 	 * Fill in the "remote" attributes (if necessary).  Note: the
697 	 * remote attributes are only valid if the memory region has one
698 	 * or more of the remote access flags set.
699 	 */
700 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
701 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
702 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
703 		attr->mr_rkey = (ibt_rkey_t)mr->mr_rkey;
704 		attr->mr_rbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
705 		attr->mr_rbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
706 	}
707 
708 	/*
709 	 * If region is mapped for streaming (i.e. noncoherent), then set sync
710 	 * is required
711 	 */
712 	attr->mr_sync_required = (mr->mr_bindinfo.bi_flags &
713 	    IBT_MR_NONCOHERENT) ? B_TRUE : B_FALSE;
714 
715 	mutex_exit(&mr->mr_lock);
716 	TAVOR_TNF_EXIT(tavor_mr_query);
717 	return (DDI_SUCCESS);
718 }
719 
720 
721 /*
722  * tavor_mr_reregister()
723  *    Context: Can be called from interrupt or base context.
724  */
725 int
726 tavor_mr_reregister(tavor_state_t *state, tavor_mrhdl_t mr,
727     tavor_pdhdl_t pd, ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new,
728     tavor_mr_options_t *op)
729 {
730 	tavor_bind_info_t	bind;
731 	int			status;
732 
733 	TAVOR_TNF_ENTER(tavor_mr_reregister);
734 
735 	/*
736 	 * Fill in the "bind" struct.  This struct provides the majority
737 	 * of the information that will be used to distinguish between an
738 	 * "addr" binding (as is the case here) and a "buf" binding (see
739 	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
740 	 * which does most of the "heavy lifting" for the Tavor memory
741 	 * registration (and reregistration) routines.
742 	 */
743 	bind.bi_type  = TAVOR_BINDHDL_VADDR;
744 	bind.bi_addr  = mr_attr->mr_vaddr;
745 	bind.bi_len   = mr_attr->mr_len;
746 	bind.bi_as    = mr_attr->mr_as;
747 	bind.bi_flags = mr_attr->mr_flags;
748 	status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
749 	if (status != DDI_SUCCESS) {
750 		TNF_PROBE_0(tavor_mr_reregister_cmnreg_fail,
751 		    TAVOR_TNF_ERROR, "");
752 		TAVOR_TNF_EXIT(tavor_mr_reregister);
753 		return (status);
754 	}
755 
756 	TAVOR_TNF_EXIT(tavor_mr_reregister);
757 	return (DDI_SUCCESS);
758 }
759 
760 
761 /*
762  * tavor_mr_reregister_buf()
763  *    Context: Can be called from interrupt or base context.
764  */
765 int
766 tavor_mr_reregister_buf(tavor_state_t *state, tavor_mrhdl_t mr,
767     tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, struct buf *buf,
768     tavor_mrhdl_t *mrhdl_new, tavor_mr_options_t *op)
769 {
770 	tavor_bind_info_t	bind;
771 	int			status;
772 
773 	TAVOR_TNF_ENTER(tavor_mr_reregister_buf);
774 
775 	/*
776 	 * Fill in the "bind" struct.  This struct provides the majority
777 	 * of the information that will be used to distinguish between an
778 	 * "addr" binding (see above) and a "buf" binding (as is the case
779 	 * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
780 	 * which does most of the "heavy lifting" for the Tavor memory
781 	 * registration routines.  Note: We have chosen to provide
782 	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
783 	 * not set).  It is not critical what value we choose here as it need
784 	 * only be unique for the given RKey (which will happen by default),
785 	 * so the choice here is somewhat arbitrary.
786 	 */
787 	bind.bi_type  = TAVOR_BINDHDL_BUF;
788 	bind.bi_buf   = buf;
789 	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
790 		bind.bi_addr  = mr_attr->mr_vaddr;
791 	} else {
792 		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
793 	}
794 	bind.bi_len   = (uint64_t)buf->b_bcount;
795 	bind.bi_flags = mr_attr->mr_flags;
796 	bind.bi_as = NULL;
797 	status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
798 	if (status != DDI_SUCCESS) {
799 		TNF_PROBE_0(tavor_mr_reregister_buf_cmnreg_fail,
800 		    TAVOR_TNF_ERROR, "");
801 		TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
802 		return (status);
803 	}
804 
805 	TAVOR_TNF_EXIT(tavor_mr_reregister_buf);
806 	return (DDI_SUCCESS);
807 }
808 
809 
/*
 * tavor_mr_sync()
 *    Context: Can be called from interrupt or base context.
 *
 * Performs a DMA sync for each of the "num_segs" entries in "mr_segs".
 * Each segment is validated against its memory region's bounds before the
 * corresponding ddi_dma_sync() call is made.
 *
 * Note: TAVOR_TNF_FAIL() sets the local "status" and "errormsg" variables
 * consumed at the "mrsync_fail" label below.
 */
/* ARGSUSED */
int
tavor_mr_sync(tavor_state_t *state, ibt_mr_sync_t *mr_segs, size_t num_segs)
{
	tavor_mrhdl_t		mrhdl;
	uint64_t		seg_vaddr, seg_len, seg_end;
	uint64_t		mr_start, mr_end;
	uint_t			type;
	/*
	 * NOTE(review): "i" is a signed int compared against the size_t
	 * "num_segs"; assumes num_segs <= INT_MAX — confirm with callers.
	 */
	int			status, i;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_sync);

	/* Process each of the ibt_mr_sync_t's */
	for (i = 0; i < num_segs; i++) {
		mrhdl = (tavor_mrhdl_t)mr_segs[i].ms_handle;

		/* Check for valid memory region handle */
		if (mrhdl == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
			goto mrsync_fail;
		}

		mutex_enter(&mrhdl->mr_lock);

		/*
		 * Check here to see if the memory region has already been
		 * partially deregistered as a result of a
		 * tavor_umap_umemlock_cb() callback.  If so, this is an
		 * error, return failure.
		 */
		if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl2");
			goto mrsync_fail;
		}

		/*
		 * Check for valid bounds on the sync request: both the start
		 * and the (inclusive) end of the segment must fall within
		 * the region's registered range.
		 */
		seg_vaddr = mr_segs[i].ms_vaddr;
		seg_len	  = mr_segs[i].ms_len;
		seg_end	  = seg_vaddr + seg_len - 1;
		mr_start  = mrhdl->mr_bindinfo.bi_addr;
		mr_end	  = mr_start + mrhdl->mr_bindinfo.bi_len - 1;
		if ((seg_vaddr < mr_start) || (seg_vaddr > mr_end)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_VA_INVALID, "invalid vaddr");
			goto mrsync_fail;
		}
		if ((seg_end < mr_start) || (seg_end > mr_end)) {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
			goto mrsync_fail;
		}

		/* Determine what type (i.e. direction) for sync */
		if (mr_segs[i].ms_flags & IBT_SYNC_READ) {
			type = DDI_DMA_SYNC_FORDEV;
		} else if (mr_segs[i].ms_flags & IBT_SYNC_WRITE) {
			type = DDI_DMA_SYNC_FORCPU;
		} else {
			mutex_exit(&mrhdl->mr_lock);
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sync type");
			goto mrsync_fail;
		}

		/* Sync the region, offset relative to the binding's start */
		(void) ddi_dma_sync(mrhdl->mr_bindinfo.bi_dmahdl,
		    (off_t)(seg_vaddr - mr_start), (size_t)seg_len, type);
		mutex_exit(&mrhdl->mr_lock);
	}

	TAVOR_TNF_EXIT(tavor_mr_sync);
	return (DDI_SUCCESS);

mrsync_fail:
	TNF_PROBE_1(tavor_mr_sync_fail, TAVOR_TNF_ERROR, "", tnf_string, msg,
	    errormsg);
	TAVOR_TNF_EXIT(tavor_mr_sync);
	return (status);
}
898 
899 
900 /*
901  * tavor_mw_alloc()
902  *    Context: Can be called from interrupt or base context.
903  */
int
tavor_mw_alloc(tavor_state_t *state, tavor_pdhdl_t pd, ibt_mw_flags_t flags,
    tavor_mwhdl_t *mwhdl)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mwhdl_t		mw;
	uint_t			sleep;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mw_alloc);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).  Note:
	 * "status" and "errormsg" are assigned by the TAVOR_TNF_FAIL()
	 * macro on each error path below.
	 */
	sleep = (flags & IBT_MW_NOSLEEP) ? TAVOR_NOSLEEP : TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mwalloc_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry (for use as a memory window).  Since the
	 * Tavor hardware uses the MPT entry for memory regions and for
	 * memory windows, we will fill in this MPT with all the necessary
	 * parameters for the memory window.  And then (just as we do for
	 * memory regions) ownership will be passed to the hardware in the
	 * final step below.  If we fail here, we must undo the protection
	 * domain reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
		goto mwalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the memory window (i.e.
	 * the Tavor Memory Window handle).  Note: This is actually the same
	 * software structure used for tracking memory regions, but since many
	 * of the same properties are needed, only a single structure is
	 * necessary.  If we fail here, we must undo the protection domain
	 * reference count and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
		goto mwalloc_fail2;
	}
	mw = (tavor_mwhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))

	/*
	 * Calculate an "unbound" RKey from MPT index.  In much the same way
	 * as we do for memory regions (above), this key is constructed from
	 * a "constrained" (which depends on the MPT index) and an
	 * "unconstrained" portion (which may be arbitrarily chosen).
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mw->mr_rkey);

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.  Note: fewer entries in the MPT
	 * entry are necessary to allocate a memory window (only the
	 * window/region bit, the key, and the PD number are needed here;
	 * the rest stays zero until the window is bound).
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win	= TAVOR_MPT_IS_WINDOW;
	mpt_entry.mem_key	= mw->mr_rkey;
	mpt_entry.pd		= pd->pd_pdnum;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mw_alloc_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "tavor SW2HW_MPT command");
		goto mwalloc_fail3;
	}

	/*
	 * Fill in the rest of the Tavor Memory Window handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MW.
	 */
	mw->mr_mptrsrcp	= mpt;
	mw->mr_pdhdl	= pd;
	mw->mr_rsrcp	= rsrc;
	*mwhdl = mw;

	TAVOR_TNF_EXIT(tavor_mw_alloc);
	return (DDI_SUCCESS);

mwalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
mwalloc_fail2:
	tavor_rsrc_free(state, &mpt);
mwalloc_fail1:
	tavor_pd_refcnt_dec(pd);
mwalloc_fail:
	TNF_PROBE_1(tavor_mw_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mw_alloc);
	return (status);
}
1030 
1031 
1032 /*
1033  * tavor_mw_free()
1034  *    Context: Can be called from interrupt or base context.
1035  */
int
tavor_mw_free(tavor_state_t *state, tavor_mwhdl_t *mwhdl, uint_t sleep)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_mwhdl_t		mw;
	int			status;
	char			*errormsg;
	tavor_pdhdl_t		pd;

	TAVOR_TNF_ENTER(tavor_mw_free);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).  Note:
	 * "status" and "errormsg" are assigned by the TAVOR_TNF_FAIL()
	 * macro.
	 */
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
		TNF_PROBE_1(tavor_mw_free_fail, TAVOR_TNF_ERROR, "",
		    tnf_string, msg, errormsg);
		TAVOR_TNF_EXIT(tavor_mw_free);
		return (status);
	}

	/*
	 * Pull all the necessary information from the Tavor Memory Window
	 * handle.  This is necessary here because the resource for the
	 * MW handle is going to be freed up as part of the this operation.
	 */
	mw	= *mwhdl;
	mutex_enter(&mw->mr_lock);
	mpt	= mw->mr_mptrsrcp;
	rsrc	= mw->mr_rsrcp;
	pd	= mw->mr_pdhdl;
	mutex_exit(&mw->mr_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))

	/*
	 * Reclaim the MPT entry from hardware.  Note: in general, it is
	 * unexpected for this operation to return an error.  If it does,
	 * the MW handle and its resources are left intact (nothing below
	 * runs) and the caller's *mwhdl is not cleared.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, NULL,
	    0, mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail, TAVOR_TNF_ERROR, "",
		    tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_mw_free);
		return (IBT_INVALID_PARAM);
	}

	/* Free the Tavor Memory Window handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the MPT entry resource */
	tavor_rsrc_free(state, &mpt);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the mwhdl pointer to NULL and return success */
	*mwhdl = NULL;

	TAVOR_TNF_EXIT(tavor_mw_free);
	return (DDI_SUCCESS);
}
1105 
1106 
1107 /*
1108  * tavor_mr_keycalc()
1109  *    Context: Can be called from interrupt or base context.
1110  */
1111 void
1112 tavor_mr_keycalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
1113 {
1114 	uint32_t	tmp, log_num_mpt;
1115 
1116 	/*
1117 	 * Generate a simple key from counter.  Note:  We increment this
1118 	 * static variable _intentionally_ without any kind of mutex around
1119 	 * it.  First, single-threading all operations through a single lock
1120 	 * would be a bad idea (from a performance point-of-view).  Second,
1121 	 * the upper "unconstrained" bits don't really have to be unique
1122 	 * because the lower bits are guaranteed to be (although we do make a
1123 	 * best effort to ensure that they are).  Third, the window for the
1124 	 * race (where both threads read and update the counter at the same
1125 	 * time) is incredibly small.
1126 	 * And, lastly, we'd like to make this into a "random" key XXX
1127 	 */
1128 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_memkey_cnt))
1129 	log_num_mpt = state->ts_cfg_profile->cp_log_num_mpt;
1130 	tmp = (tavor_debug_memkey_cnt++) << log_num_mpt;
1131 	*key = tmp | indx;
1132 }
1133 
1134 
1135 /*
1136  * tavor_mr_common_reg()
1137  *    Context: Can be called from interrupt or base context.
1138  */
1139 static int
1140 tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
1141     tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
1142 {
1143 	tavor_rsrc_pool_info_t	*rsrc_pool;
1144 	tavor_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
1145 	tavor_umap_db_entry_t	*umapdb;
1146 	tavor_sw_refcnt_t	*swrc_tmp;
1147 	tavor_hw_mpt_t		mpt_entry;
1148 	tavor_mrhdl_t		mr;
1149 	ibt_mr_flags_t		flags;
1150 	tavor_bind_info_t	*bh;
1151 	ddi_dma_handle_t	bind_dmahdl;
1152 	ddi_umem_cookie_t	umem_cookie;
1153 	size_t			umem_len;
1154 	caddr_t			umem_addr;
1155 	uint64_t		mtt_addr, mtt_ddrbaseaddr, max_sz;
1156 	uint_t			sleep, mtt_pgsize_bits, bind_type, mr_is_umem;
1157 	int			status, umem_flags, bind_override_addr;
1158 	char			*errormsg;
1159 
1160 	TAVOR_TNF_ENTER(tavor_mr_common_reg);
1161 
1162 	/*
1163 	 * Check the "options" flag.  Currently this flag tells the driver
1164 	 * whether or not the region should be bound normally (i.e. with
1165 	 * entries written into the PCI IOMMU), whether it should be
1166 	 * registered to bypass the IOMMU, and whether or not the resulting
1167 	 * address should be "zero-based" (to aid the alignment restrictions
1168 	 * for QPs).
1169 	 */
1170 	if (op == NULL) {
1171 		bind_type   = TAVOR_BINDMEM_NORMAL;
1172 		bind_dmahdl = NULL;
1173 		bind_override_addr = 0;
1174 	} else {
1175 		bind_type	   = op->mro_bind_type;
1176 		bind_dmahdl	   = op->mro_bind_dmahdl;
1177 		bind_override_addr = op->mro_bind_override_addr;
1178 	}
1179 
1180 	/* Extract the flags field from the tavor_bind_info_t */
1181 	flags = bind->bi_flags;
1182 
1183 	/*
1184 	 * Check for invalid length.  Check is the length is zero or if the
1185 	 * length is larger than the maximum configured value.  Return error
1186 	 * if it is.
1187 	 */
1188 	max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
1189 	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
1190 		/* Set "status" and "errormsg" and goto failure */
1191 		TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
1192 		goto mrcommon_fail;
1193 	}
1194 
1195 	/*
1196 	 * Check the sleep flag.  Ensure that it is consistent with the
1197 	 * current thread context (i.e. if we are currently in the interrupt
1198 	 * context, then we shouldn't be attempting to sleep).
1199 	 */
1200 	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1201 	if ((sleep == TAVOR_SLEEP) &&
1202 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1203 		/* Set "status" and "errormsg" and goto failure */
1204 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1205 		goto mrcommon_fail;
1206 	}
1207 
1208 	/*
1209 	 * Get the base address for the MTT table.  This will be necessary
1210 	 * below when we are setting up the MPT entry.
1211 	 */
1212 	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
1213 	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
1214 
1215 	/* Increment the reference count on the protection domain (PD) */
1216 	tavor_pd_refcnt_inc(pd);
1217 
1218 	/*
1219 	 * Allocate an MPT entry.  This will be filled in with all the
1220 	 * necessary parameters to define the memory region.  And then
1221 	 * ownership will be passed to the hardware in the final step
1222 	 * below.  If we fail here, we must undo the protection domain
1223 	 * reference count.
1224 	 */
1225 	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
1226 	if (status != DDI_SUCCESS) {
1227 		/* Set "status" and "errormsg" and goto failure */
1228 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
1229 		goto mrcommon_fail1;
1230 	}
1231 
1232 	/*
1233 	 * Allocate the software structure for tracking the memory region (i.e.
1234 	 * the Tavor Memory Region handle).  If we fail here, we must undo
1235 	 * the protection domain reference count and the previous resource
1236 	 * allocation.
1237 	 */
1238 	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
1239 	if (status != DDI_SUCCESS) {
1240 		/* Set "status" and "errormsg" and goto failure */
1241 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
1242 		goto mrcommon_fail2;
1243 	}
1244 	mr = (tavor_mrhdl_t)rsrc->tr_addr;
1245 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
1246 
1247 	/*
1248 	 * Setup and validate the memory region access flags.  This means
1249 	 * translating the IBTF's enable flags into the access flags that
1250 	 * will be used in later operations.
1251 	 */
1252 	mr->mr_accflag = 0;
1253 	if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1254 		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
1255 	if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1256 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
1257 	if (flags & IBT_MR_ENABLE_REMOTE_READ)
1258 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
1259 	if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1260 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
1261 	if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1262 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
1263 
1264 	/*
1265 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
1266 	 * from a certain number of "constrained" bits (the least significant
1267 	 * bits) and some number of "unconstrained" bits.  The constrained
1268 	 * bits must be set to the index of the entry in the MPT table, but
1269 	 * the unconstrained bits can be set to any value we wish.  Note:
1270 	 * if no remote access is required, then the RKey value is not filled
1271 	 * in.  Otherwise both Rkey and LKey are given the same value.
1272 	 */
1273 	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
1274 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1275 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1276 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1277 		mr->mr_rkey = mr->mr_lkey;
1278 	}
1279 
1280 	/*
1281 	 * Determine if the memory is from userland and pin the pages
1282 	 * with umem_lockmemory() if necessary.
1283 	 * Then, if this is userland memory, allocate an entry in the
1284 	 * "userland resources database".  This will later be added to
1285 	 * the database (after all further memory registration operations are
1286 	 * successful).  If we fail here, we must undo the reference counts
1287 	 * and the previous resource allocations.
1288 	 */
1289 	mr_is_umem = (((bind->bi_as != NULL) && (bind->bi_as != &kas)) ? 1 : 0);
1290 	if (mr_is_umem) {
1291 		umem_len   = ptob(btopr(bind->bi_len +
1292 		    ((uintptr_t)bind->bi_addr & PAGEOFFSET)));
1293 		umem_addr  = (caddr_t)((uintptr_t)bind->bi_addr & ~PAGEOFFSET);
1294 		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
1295 		    DDI_UMEMLOCK_LONGTERM);
1296 		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
1297 		    &umem_cookie, &tavor_umem_cbops, NULL);
1298 		if (status != 0) {
1299 			/* Set "status" and "errormsg" and goto failure */
1300 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin");
1301 			goto mrcommon_fail3;
1302 		}
1303 
1304 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1305 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1306 
1307 		bind->bi_buf = ddi_umem_iosetup(umem_cookie, 0, umem_len,
1308 		    B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
1309 		if (bind->bi_buf == NULL) {
1310 			/* Set "status" and "errormsg" and goto failure */
1311 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed iosetup");
1312 			goto mrcommon_fail3;
1313 		}
1314 		bind->bi_type = TAVOR_BINDHDL_UBUF;
1315 		bind->bi_buf->b_flags |= B_READ;
1316 
1317 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1318 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1319 
1320 		umapdb = tavor_umap_db_alloc(state->ts_instance,
1321 		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
1322 		    (uint64_t)(uintptr_t)rsrc);
1323 		if (umapdb == NULL) {
1324 			/* Set "status" and "errormsg" and goto failure */
1325 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
1326 			goto mrcommon_fail4;
1327 		}
1328 	}
1329 
1330 	/*
1331 	 * Setup the bindinfo for the mtt bind call
1332 	 */
1333 	bh = &mr->mr_bindinfo;
1334 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bh))
1335 	bcopy(bind, bh, sizeof (tavor_bind_info_t));
1336 	bh->bi_bypass = bind_type;
1337 	status = tavor_mr_mtt_bind(state, bh, bind_dmahdl, &mtt,
1338 	    &mtt_pgsize_bits);
1339 	if (status != DDI_SUCCESS) {
1340 		/* Set "status" and "errormsg" and goto failure */
1341 		TAVOR_TNF_FAIL(status, "failed mtt bind");
1342 		/*
1343 		 * When mtt_bind fails, freerbuf has already been done,
1344 		 * so make sure not to call it again.
1345 		 */
1346 		bind->bi_type = bh->bi_type;
1347 		goto mrcommon_fail5;
1348 	}
1349 	mr->mr_logmttpgsz = mtt_pgsize_bits;
1350 
1351 	/*
1352 	 * Allocate MTT reference count (to track shared memory regions).
1353 	 * This reference count resource may never be used on the given
1354 	 * memory region, but if it is ever later registered as "shared"
1355 	 * memory region then this resource will be necessary.  If we fail
1356 	 * here, we do pretty much the same as above to clean up.
1357 	 */
1358 	status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1, sleep,
1359 	    &mtt_refcnt);
1360 	if (status != DDI_SUCCESS) {
1361 		/* Set "status" and "errormsg" and goto failure */
1362 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed refence count");
1363 		goto mrcommon_fail6;
1364 	}
1365 	mr->mr_mttrefcntp = mtt_refcnt;
1366 	swrc_tmp = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
1367 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_tmp))
1368 	TAVOR_MTT_REFCNT_INIT(swrc_tmp);
1369 
1370 	/*
1371 	 * Fill in the MPT entry.  This is the final step before passing
1372 	 * ownership of the MPT entry to the Tavor hardware.  We use all of
1373 	 * the information collected/calculated above to fill in the
1374 	 * requisite portions of the MPT.
1375 	 */
1376 	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
1377 	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
1378 	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1379 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1380 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1381 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
1382 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1383 	mpt_entry.lr	  = 1;
1384 	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
1385 	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
1386 	mpt_entry.mem_key	= mr->mr_lkey;
1387 	mpt_entry.pd		= pd->pd_pdnum;
1388 	if (bind_override_addr == 0) {
1389 		mpt_entry.start_addr = bh->bi_addr;
1390 	} else {
1391 		bh->bi_addr = bh->bi_addr & ((1 << mr->mr_logmttpgsz) - 1);
1392 		mpt_entry.start_addr = bh->bi_addr;
1393 	}
1394 	mpt_entry.reg_win_len	= bh->bi_len;
1395 	mpt_entry.win_cnt_limit	= TAVOR_UNLIMITED_WIN_BIND;
1396 	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
1397 	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
1398 	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
1399 
1400 	/*
1401 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
1402 	 * the entry to the hardware.  Note: in general, this operation
1403 	 * shouldn't fail.  But if it does, we have to undo everything we've
1404 	 * done above before returning error.
1405 	 */
1406 	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1407 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
1408 	if (status != TAVOR_CMD_SUCCESS) {
1409 		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
1410 		    status);
1411 		TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
1412 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1413 		/* Set "status" and "errormsg" and goto failure */
1414 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
1415 		    "tavor SW2HW_MPT command");
1416 		goto mrcommon_fail7;
1417 	}
1418 
1419 	/*
1420 	 * Fill in the rest of the Tavor Memory Region handle.  Having
1421 	 * successfully transferred ownership of the MPT, we can update the
1422 	 * following fields for use in further operations on the MR.
1423 	 */
1424 	mr->mr_mptrsrcp	  = mpt;
1425 	mr->mr_mttrsrcp	  = mtt;
1426 	mr->mr_pdhdl	  = pd;
1427 	mr->mr_rsrcp	  = rsrc;
1428 	mr->mr_is_umem	  = mr_is_umem;
1429 	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
1430 	mr->mr_umem_cbfunc = NULL;
1431 	mr->mr_umem_cbarg1 = NULL;
1432 	mr->mr_umem_cbarg2 = NULL;
1433 
1434 	/*
1435 	 * If this is userland memory, then we need to insert the previously
1436 	 * allocated entry into the "userland resources database".  This will
1437 	 * allow for later coordination between the tavor_umap_umemlock_cb()
1438 	 * callback and tavor_mr_deregister().
1439 	 */
1440 	if (mr_is_umem) {
1441 		tavor_umap_db_add(umapdb);
1442 	}
1443 
1444 	*mrhdl = mr;
1445 
1446 	TAVOR_TNF_EXIT(tavor_mr_common_reg);
1447 	return (DDI_SUCCESS);
1448 
1449 /*
1450  * The following is cleanup for all possible failure cases in this routine
1451  */
1452 mrcommon_fail7:
1453 	tavor_rsrc_free(state, &mtt_refcnt);
1454 mrcommon_fail6:
1455 	tavor_rsrc_free(state, &mtt);
1456 	tavor_mr_mem_unbind(state, bh);
1457 	bind->bi_type = bh->bi_type;
1458 mrcommon_fail5:
1459 	if (mr_is_umem) {
1460 		tavor_umap_db_free(umapdb);
1461 	}
1462 mrcommon_fail4:
1463 	if (mr_is_umem) {
1464 		/*
1465 		 * Free up the memory ddi_umem_iosetup() allocates
1466 		 * internally.
1467 		 */
1468 		if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
1469 			freerbuf(bind->bi_buf);
1470 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1471 			bind->bi_type = TAVOR_BINDHDL_NONE;
1472 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1473 		}
1474 		ddi_umem_unlock(umem_cookie);
1475 	}
1476 mrcommon_fail3:
1477 	tavor_rsrc_free(state, &rsrc);
1478 mrcommon_fail2:
1479 	tavor_rsrc_free(state, &mpt);
1480 mrcommon_fail1:
1481 	tavor_pd_refcnt_dec(pd);
1482 mrcommon_fail:
1483 	TNF_PROBE_1(tavor_mr_common_reg_fail, TAVOR_TNF_ERROR, "",
1484 	    tnf_string, msg, errormsg);
1485 	TAVOR_TNF_EXIT(tavor_mr_common_reg);
1486 	return (status);
1487 }
1488 
1489 int
1490 tavor_dma_mr_register(tavor_state_t *state, tavor_pdhdl_t pd,
1491     ibt_dmr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl)
1492 {
1493 	tavor_rsrc_t		*mpt, *rsrc;
1494 	tavor_hw_mpt_t		mpt_entry;
1495 	tavor_mrhdl_t		mr;
1496 	ibt_mr_flags_t		flags;
1497 	uint_t			sleep;
1498 	int			status;
1499 
1500 	/* Extract the flags field */
1501 	flags = mr_attr->dmr_flags;
1502 
1503 	/*
1504 	 * Check the sleep flag.  Ensure that it is consistent with the
1505 	 * current thread context (i.e. if we are currently in the interrupt
1506 	 * context, then we shouldn't be attempting to sleep).
1507 	 */
1508 	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1509 	if ((sleep == TAVOR_SLEEP) &&
1510 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1511 		status = IBT_INVALID_PARAM;
1512 		goto mrcommon_fail;
1513 	}
1514 
1515 	/* Increment the reference count on the protection domain (PD) */
1516 	tavor_pd_refcnt_inc(pd);
1517 
1518 	/*
1519 	 * Allocate an MPT entry.  This will be filled in with all the
1520 	 * necessary parameters to define the memory region.  And then
1521 	 * ownership will be passed to the hardware in the final step
1522 	 * below.  If we fail here, we must undo the protection domain
1523 	 * reference count.
1524 	 */
1525 	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
1526 	if (status != DDI_SUCCESS) {
1527 		status = IBT_INSUFF_RESOURCE;
1528 		goto mrcommon_fail1;
1529 	}
1530 
1531 	/*
1532 	 * Allocate the software structure for tracking the memory region (i.e.
1533 	 * the Tavor Memory Region handle).  If we fail here, we must undo
1534 	 * the protection domain reference count and the previous resource
1535 	 * allocation.
1536 	 */
1537 	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
1538 	if (status != DDI_SUCCESS) {
1539 		status = IBT_INSUFF_RESOURCE;
1540 		goto mrcommon_fail2;
1541 	}
1542 	mr = (tavor_mrhdl_t)rsrc->tr_addr;
1543 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
1544 	bzero(mr, sizeof (*mr));
1545 
1546 	/*
1547 	 * Setup and validate the memory region access flags.  This means
1548 	 * translating the IBTF's enable flags into the access flags that
1549 	 * will be used in later operations.
1550 	 */
1551 	mr->mr_accflag = 0;
1552 	if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1553 		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
1554 	if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1555 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
1556 	if (flags & IBT_MR_ENABLE_REMOTE_READ)
1557 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
1558 	if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1559 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
1560 	if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1561 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
1562 
1563 	/*
1564 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
1565 	 * from a certain number of "constrained" bits (the least significant
1566 	 * bits) and some number of "unconstrained" bits.  The constrained
1567 	 * bits must be set to the index of the entry in the MPT table, but
1568 	 * the unconstrained bits can be set to any value we wish.  Note:
1569 	 * if no remote access is required, then the RKey value is not filled
1570 	 * in.  Otherwise both Rkey and LKey are given the same value.
1571 	 */
1572 	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
1573 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1574 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1575 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1576 		mr->mr_rkey = mr->mr_lkey;
1577 	}
1578 
1579 	/*
1580 	 * Fill in the MPT entry.  This is the final step before passing
1581 	 * ownership of the MPT entry to the Tavor hardware.  We use all of
1582 	 * the information collected/calculated above to fill in the
1583 	 * requisite portions of the MPT.
1584 	 */
1585 	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
1586 
1587 	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
1588 	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1589 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1590 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1591 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
1592 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1593 	mpt_entry.lr	  = 1;
1594 	mpt_entry.phys_addr = 1;	/* critical bit for this */
1595 	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
1596 
1597 	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
1598 	mpt_entry.mem_key	= mr->mr_lkey;
1599 	mpt_entry.pd		= pd->pd_pdnum;
1600 	mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND;
1601 
1602 	mpt_entry.start_addr = mr_attr->dmr_paddr;
1603 	mpt_entry.reg_win_len = mr_attr->dmr_len;
1604 
1605 	mpt_entry.mttseg_addr_h = 0;
1606 	mpt_entry.mttseg_addr_l = 0;
1607 
1608 	/*
1609 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
1610 	 * the entry to the hardware if needed.  Note: in general, this
1611 	 * operation shouldn't fail.  But if it does, we have to undo
1612 	 * everything we've done above before returning error.
1613 	 *
1614 	 * For Tavor, this routine (which is common to the contexts) will only
1615 	 * set the ownership if needed - the process of passing the context
1616 	 * itself to HW will take care of setting up the MPT (based on type
1617 	 * and index).
1618 	 */
1619 
1620 	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1621 	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
1622 	if (status != TAVOR_CMD_SUCCESS) {
1623 		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
1624 		    status);
1625 		status = ibc_get_ci_failure(0);
1626 		goto mrcommon_fail7;
1627 	}
1628 
1629 	/*
1630 	 * Fill in the rest of the Tavor Memory Region handle.  Having
1631 	 * successfully transferred ownership of the MPT, we can update the
1632 	 * following fields for use in further operations on the MR.
1633 	 */
1634 	mr->mr_mptrsrcp	   = mpt;
1635 	mr->mr_mttrsrcp	   = NULL;
1636 	mr->mr_pdhdl	   = pd;
1637 	mr->mr_rsrcp	   = rsrc;
1638 	mr->mr_is_umem	   = 0;
1639 	mr->mr_umemcookie  = NULL;
1640 	mr->mr_umem_cbfunc = NULL;
1641 	mr->mr_umem_cbarg1 = NULL;
1642 	mr->mr_umem_cbarg2 = NULL;
1643 
1644 	*mrhdl = mr;
1645 
1646 	return (DDI_SUCCESS);
1647 
1648 /*
1649  * The following is cleanup for all possible failure cases in this routine
1650  */
1651 mrcommon_fail7:
1652 	tavor_rsrc_free(state, &rsrc);
1653 mrcommon_fail2:
1654 	tavor_rsrc_free(state, &mpt);
1655 mrcommon_fail1:
1656 	tavor_pd_refcnt_dec(pd);
1657 mrcommon_fail:
1658 	return (status);
1659 }
1660 
1661 /*
1662  * tavor_mr_mtt_bind()
1663  *    Context: Can be called from interrupt or base context.
1664  */
1665 int
1666 tavor_mr_mtt_bind(tavor_state_t *state, tavor_bind_info_t *bind,
1667     ddi_dma_handle_t bind_dmahdl, tavor_rsrc_t **mtt, uint_t *mtt_pgsize_bits)
1668 {
1669 	uint64_t		nummtt;
1670 	uint_t			sleep;
1671 	int			status;
1672 	char			*errormsg;
1673 
1674 	TAVOR_TNF_ENTER(tavor_mr_common_reg);
1675 
1676 	/*
1677 	 * Check the sleep flag.  Ensure that it is consistent with the
1678 	 * current thread context (i.e. if we are currently in the interrupt
1679 	 * context, then we shouldn't be attempting to sleep).
1680 	 */
1681 	sleep = (bind->bi_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1682 	if ((sleep == TAVOR_SLEEP) &&
1683 	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1684 		/* Set "status" and "errormsg" and goto failure */
1685 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
1686 		goto mrmttbind_fail;
1687 	}
1688 
1689 	/*
1690 	 * Bind the memory and determine the mapped addresses.  This is
1691 	 * the first of two routines that do all the "heavy lifting" for
1692 	 * the Tavor memory registration routines.  The tavor_mr_mem_bind()
1693 	 * routine takes the "bind" struct with all its fields filled
1694 	 * in and returns a list of DMA cookies (for the PCI mapped addresses
1695 	 * corresponding to the specified address region) which are used by
1696 	 * the tavor_mr_fast_mtt_write() routine below.  If we fail here, we
1697 	 * must undo all the previous resource allocation (and PD reference
1698 	 * count).
1699 	 */
1700 	status = tavor_mr_mem_bind(state, bind, bind_dmahdl, sleep);
1701 	if (status != DDI_SUCCESS) {
1702 		/* Set "status" and "errormsg" and goto failure */
1703 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
1704 		goto mrmttbind_fail;
1705 	}
1706 
1707 	/*
1708 	 * Determine number of pages spanned.  This routine uses the
1709 	 * information in the "bind" struct to determine the required
1710 	 * number of MTT entries needed (and returns the suggested page size -
1711 	 * as a "power-of-2" - for each MTT entry).
1712 	 */
1713 	nummtt = tavor_mr_nummtt_needed(state, bind, mtt_pgsize_bits);
1714 
1715 	/*
1716 	 * Allocate the MTT entries.  Use the calculations performed above to
1717 	 * allocate the required number of MTT entries.  Note: MTT entries are
1718 	 * allocated in "MTT segments" which consist of complete cachelines
1719 	 * (i.e. 8 entries, 16 entries, etc.)  So the TAVOR_NUMMTT_TO_MTTSEG()
1720 	 * macro is used to do the proper conversion.  If we fail here, we
1721 	 * must not only undo all the previous resource allocation (and PD
1722 	 * reference count), but we must also unbind the memory.
1723 	 */
1724 	status = tavor_rsrc_alloc(state, TAVOR_MTT,
1725 	    TAVOR_NUMMTT_TO_MTTSEG(nummtt), sleep, mtt);
1726 	if (status != DDI_SUCCESS) {
1727 		/* Set "status" and "errormsg" and goto failure */
1728 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
1729 		goto mrmttbind_fail2;
1730 	}
1731 
1732 	/*
1733 	 * Write the mapped addresses into the MTT entries.  This is part two
1734 	 * of the "heavy lifting" routines that we talked about above.  Note:
1735 	 * we pass the suggested page size from the earlier operation here.
1736 	 * And if we fail here, we again do pretty much the same huge clean up.
1737 	 */
1738 	status = tavor_mr_fast_mtt_write(*mtt, bind, *mtt_pgsize_bits);
1739 	if (status != DDI_SUCCESS) {
1740 		/* Set "status" and "errormsg" and goto failure */
1741 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed write mtt");
1742 		goto mrmttbind_fail3;
1743 	}
1744 	TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
1745 	return (DDI_SUCCESS);
1746 
1747 /*
1748  * The following is cleanup for all possible failure cases in this routine
1749  */
1750 mrmttbind_fail3:
1751 	tavor_rsrc_free(state, mtt);
1752 mrmttbind_fail2:
1753 	tavor_mr_mem_unbind(state, bind);
1754 mrmttbind_fail:
1755 	TNF_PROBE_1(tavor_mr_mtt_bind_fail, TAVOR_TNF_ERROR, "",
1756 	    tnf_string, msg, errormsg);
1757 	TAVOR_TNF_EXIT(tavor_mr_mtt_bind);
1758 	return (status);
1759 }
1760 
1761 
1762 /*
1763  * tavor_mr_mtt_unbind()
1764  *    Context: Can be called from interrupt or base context.
1765  */
1766 int
1767 tavor_mr_mtt_unbind(tavor_state_t *state, tavor_bind_info_t *bind,
1768     tavor_rsrc_t *mtt)
1769 {
1770 	TAVOR_TNF_ENTER(tavor_mr_mtt_unbind);
1771 
1772 	/*
1773 	 * Free up the MTT entries and unbind the memory.  Here, as above, we
1774 	 * attempt to free these resources only if it is appropriate to do so.
1775 	 */
1776 	tavor_mr_mem_unbind(state, bind);
1777 	tavor_rsrc_free(state, &mtt);
1778 
1779 	TAVOR_TNF_EXIT(tavor_mr_mtt_unbind);
1780 	return (DDI_SUCCESS);
1781 }
1782 
1783 
1784 /*
1785  * tavor_mr_common_rereg()
1786  *    Context: Can be called from interrupt or base context.
1787  */
static int
tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
    tavor_mr_options_t *op)
{
	tavor_rsrc_t		*mpt;
	ibt_mr_attr_flags_t	acc_flags_to_use;
	ibt_mr_flags_t		flags;
	tavor_pdhdl_t		pd_to_use;
	tavor_hw_mpt_t		mpt_entry;
	uint64_t		mtt_addr_to_use, vaddr_to_use, len_to_use;
	uint_t			sleep, dereg_level;
	int			status;
	/* "errormsg" is filled in by the TAVOR_TNF_FAIL() macro below */
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_common_rereg);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))

	/*
	 * Check here to see if the memory region corresponds to a userland
	 * mapping.  Reregistration of userland memory regions is not
	 * currently supported.  Return failure. XXX
	 */
	if (mr->mr_is_umem) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
		goto mrrereg_fail;
	}

	mutex_enter(&mr->mr_lock);

	/* Pull MPT resource pointer from the Tavor Memory Region handle */
	mpt = mr->mr_mptrsrcp;

	/* Extract the flags field from the tavor_bind_info_t */
	flags = bind->bi_flags;

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		mutex_exit(&mr->mr_lock);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mrrereg_fail;
	}

	/*
	 * First step is to temporarily invalidate the MPT entry.  This
	 * regains ownership from the hardware, and gives us the opportunity
	 * to modify the entry.  Note: The HW2SW_MPT command returns the
	 * current MPT entry contents.  These are saved away here because
	 * they will be reused in a later step below.  If the region has
	 * memory windows bound to it, then we fail, returning an "in use"
	 * error code.  Otherwise, this is an unexpected error and we
	 * deregister the memory region and return error.
	 *
	 * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
	 * against holding the lock around this rereg call in all contexts.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		mutex_exit(&mr->mr_lock);
		if (status == TAVOR_CMD_REG_BOUND) {
			TAVOR_TNF_EXIT(tavor_mr_common_rereg);
			return (IBT_MR_IN_USE);
		} else {
			cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: "
			    "%08x\n", status);

			/*
			 * Call deregister and ensure that all current
			 * resources get freed up
			 */
			if (tavor_mr_deregister(state, &mr,
			    TAVOR_MR_DEREG_ALL, sleep) != DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "memory region");
			}
			TNF_PROBE_1(tavor_mr_common_rereg_hw2sw_mpt_cmd_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
			TAVOR_TNF_EXIT(tavor_mr_common_rereg);
			return (ibc_get_ci_failure(0));
		}
	}

	/*
	 * If we're changing the protection domain, then validate the new one
	 */
	if (flags & IBT_MR_CHANGE_PD) {

		/* Check for valid PD handle pointer */
		if (pd == NULL) {
			mutex_exit(&mr->mr_lock);
			/*
			 * Call deregister and ensure that all current
			 * resources get properly freed up. Unnecessary
			 * here to attempt to regain software ownership
			 * of the MPT entry as that has already been
			 * done above.
			 */
			if (tavor_mr_deregister(state, &mr,
			    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
			    DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "memory region");
			}
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
			goto mrrereg_fail;
		}

		/* Use the new PD handle in all operations below */
		pd_to_use = pd;

	} else {
		/* Use the current PD handle in all operations below */
		pd_to_use = mr->mr_pdhdl;
	}

	/*
	 * If we're changing access permissions, then validate the new ones
	 */
	if (flags & IBT_MR_CHANGE_ACCESS) {
		/*
		 * Validate the access flags.  Both remote write and remote
		 * atomic require the local write flag to be set
		 */
		if (((flags & IBT_MR_ENABLE_REMOTE_WRITE) ||
		    (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)) &&
		    !(flags & IBT_MR_ENABLE_LOCAL_WRITE)) {
			mutex_exit(&mr->mr_lock);
			/*
			 * Call deregister and ensure that all current
			 * resources get properly freed up. Unnecessary
			 * here to attempt to regain software ownership
			 * of the MPT entry as that has already been
			 * done above.
			 */
			if (tavor_mr_deregister(state, &mr,
			    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
			    DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "memory region");
			}
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_ACCESS_REQ_INVALID,
			    "invalid access flags");
			goto mrrereg_fail;
		}

		/*
		 * Setup and validate the memory region access flags.  This
		 * means translating the IBTF's enable flags into the access
		 * flags that will be used in later operations.
		 */
		acc_flags_to_use = 0;
		if (flags & IBT_MR_ENABLE_WINDOW_BIND)
			acc_flags_to_use |= IBT_MR_WINDOW_BIND;
		if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
			acc_flags_to_use |= IBT_MR_LOCAL_WRITE;
		if (flags & IBT_MR_ENABLE_REMOTE_READ)
			acc_flags_to_use |= IBT_MR_REMOTE_READ;
		if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
			acc_flags_to_use |= IBT_MR_REMOTE_WRITE;
		if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
			acc_flags_to_use |= IBT_MR_REMOTE_ATOMIC;

	} else {
		/* Not changing access; retain the region's current flags */
		acc_flags_to_use = mr->mr_accflag;
	}

	/*
	 * If we're modifying the translation, then figure out whether
	 * we can reuse the current MTT resources.  This means calling
	 * tavor_mr_rereg_xlat_helper() which does most of the heavy lifting
	 * for the reregistration.  If the current memory region contains
	 * sufficient MTT entries for the new regions, then it will be
	 * reused and filled in.  Otherwise, new entries will be allocated,
	 * the old ones will be freed, and the new entries will be filled
	 * in.  Note:  If we're not modifying the translation, then we
	 * should already have all the information we need to update the MPT.
	 * Also note: If tavor_mr_rereg_xlat_helper() fails, it will return
	 * a "dereg_level" which is the level of cleanup that needs to be
	 * passed to tavor_mr_deregister() to finish the cleanup.
	 */
	if (flags & IBT_MR_CHANGE_TRANSLATION) {
		status = tavor_mr_rereg_xlat_helper(state, mr, bind, op,
		    &mtt_addr_to_use, sleep, &dereg_level);
		if (status != DDI_SUCCESS) {
			mutex_exit(&mr->mr_lock);
			/*
			 * Call deregister and ensure that all resources get
			 * properly freed up.
			 */
			if (tavor_mr_deregister(state, &mr, dereg_level,
			    sleep) != DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "memory region");
			}

			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(status, "failed rereg helper");
			goto mrrereg_fail;
		}
		/* The helper updated mr_bindinfo for the new translation */
		vaddr_to_use = mr->mr_bindinfo.bi_addr;
		len_to_use   = mr->mr_bindinfo.bi_len;
	} else {
		/*
		 * Translation unchanged: reconstruct the MTT address from
		 * the MPT contents returned by the HW2SW_MPT command above
		 * (high 32 bits, plus low bits shifted out of the 64-byte-
		 * aligned "mttseg_addr_l" field).
		 */
		mtt_addr_to_use = (((uint64_t)mpt_entry.mttseg_addr_h << 32) |
		    ((uint64_t)mpt_entry.mttseg_addr_l << 6));
		vaddr_to_use = mr->mr_bindinfo.bi_addr;
		len_to_use   = mr->mr_bindinfo.bi_len;
	}

	/*
	 * Calculate new keys (Lkey, Rkey) from MPT index.  Just like they were
	 * when the region was first registered, each key is formed from
	 * "constrained" bits and "unconstrained" bits.  Note:  If no remote
	 * access is required, then the RKey value is not filled in.  Otherwise
	 * both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((acc_flags_to_use & IBT_MR_REMOTE_READ) ||
	    (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ||
	    (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/*
	 * Update the MPT entry with the new information.  Some of this
	 * information is retained from the previous operation, some of
	 * it is new based on request.
	 */
	mpt_entry.en_bind = (acc_flags_to_use & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (acc_flags_to_use & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (acc_flags_to_use & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (acc_flags_to_use & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	/* "page_sz" is encoded relative to a 4Kb (2^12) base page size */
	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
	mpt_entry.mem_key	= mr->mr_lkey;
	mpt_entry.pd		= pd_to_use->pd_pdnum;
	mpt_entry.start_addr	= vaddr_to_use;
	mpt_entry.reg_win_len	= len_to_use;
	mpt_entry.mttseg_addr_h = mtt_addr_to_use >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr_to_use >> 6;

	/*
	 * Write the updated MPT entry to hardware
	 *
	 * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
	 * against holding the lock around this rereg call in all contexts.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		mutex_exit(&mr->mr_lock);
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		/*
		 * Call deregister and ensure that all current resources get
		 * properly freed up. Unnecessary here to attempt to regain
		 * software ownership of the MPT entry as that has already
		 * been done above.
		 */
		if (tavor_mr_deregister(state, &mr,
		    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) != DDI_SUCCESS) {
			TAVOR_WARNING(state, "failed to deregister memory "
			    "region");
		}
		TNF_PROBE_1(tavor_mr_common_rereg_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_mr_common_rereg);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * If we're changing PD, then update their reference counts now.
	 * This means decrementing the reference count on the old PD and
	 * incrementing the reference count on the new PD.
	 */
	if (flags & IBT_MR_CHANGE_PD) {
		tavor_pd_refcnt_dec(mr->mr_pdhdl);
		tavor_pd_refcnt_inc(pd);
	}

	/*
	 * Update the contents of the Tavor Memory Region handle to reflect
	 * what has been changed.
	 */
	mr->mr_pdhdl	  = pd_to_use;
	mr->mr_accflag	  = acc_flags_to_use;
	mr->mr_is_umem	  = 0;
	mr->mr_umemcookie = NULL;

	/* New MR handle is same as the old */
	*mrhdl_new = mr;
	mutex_exit(&mr->mr_lock);

	TAVOR_TNF_EXIT(tavor_mr_common_rereg);
	return (DDI_SUCCESS);

mrrereg_fail:
	TNF_PROBE_1(tavor_mr_common_rereg_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mr_common_rereg);
	return (status);
}
2101 
2102 
2103 /*
2104  * tavor_mr_rereg_xlat_helper
2105  *    Context: Can be called from interrupt or base context.
2106  *    Note: This routine expects the "mr_lock" to be held when it
2107  *    is called.  Upon returning failure, this routine passes information
2108  *    about what "dereg_level" should be passed to tavor_mr_deregister().
2109  */
static int
tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
    uint_t sleep, uint_t *dereg_level)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*mtt, *mtt_refcnt;
	tavor_sw_refcnt_t	*swrc_old, *swrc_new;
	ddi_dma_handle_t	dmahdl;
	uint64_t		nummtt_needed, nummtt_in_currrsrc, max_sz;
	uint64_t		mtt_ddrbaseaddr;
	uint_t			mtt_pgsize_bits, bind_type, reuse_dmahdl;
	int			status;
	/* "errormsg" is filled in by the TAVOR_TNF_FAIL() macro below */
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_rereg_xlat_helper);

	ASSERT(MUTEX_HELD(&mr->mr_lock));

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the region should be bound normally (i.e. with
	 * entries written into the PCI IOMMU) or whether it should be
	 * registered to bypass the IOMMU.
	 */
	if (op == NULL) {
		bind_type = TAVOR_BINDMEM_NORMAL;
	} else {
		bind_type = op->mro_bind_type;
	}

	/*
	 * Check for invalid length.  Check is the length is zero or if the
	 * length is larger than the maximum configured value.  Return error
	 * if it is.
	 */
	max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
		/*
		 * Deregister will be called upon returning failure from this
		 * routine. This will ensure that all current resources get
		 * properly freed up. Unnecessary to attempt to regain
		 * software ownership of the MPT entry as that has already
		 * been done above (in tavor_mr_reregister())
		 */
		*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT;

		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length");
		goto mrrereghelp_fail;
	}

	/*
	 * Determine the number of pages necessary for new region and the
	 * number of pages supported by the current MTT resources
	 */
	nummtt_needed = tavor_mr_nummtt_needed(state, bind, &mtt_pgsize_bits);
	nummtt_in_currrsrc = mr->mr_mttrsrcp->tr_len >> TAVOR_MTT_SIZE_SHIFT;

	/*
	 * Depending on whether we have enough pages or not, the next step is
	 * to fill in a set of MTT entries that reflect the new mapping.  In
	 * the first case below, we already have enough entries.  This means
	 * we need to unbind the memory from the previous mapping, bind the
	 * memory for the new mapping, write the new MTT entries, and update
	 * the mr to reflect the changes.
	 * In the second case below, we do not have enough entries in the
	 * current mapping.  So, in this case, we need not only to unbind the
	 * current mapping, but we need to free up the MTT resources associated
	 * with that mapping.  After we've successfully done that, we continue
	 * by binding the new memory, allocating new MTT entries, writing the
	 * new MTT entries, and updating the mr to reflect the changes.
	 */

	/*
	 * If this region is being shared (i.e. MTT refcount != 1), then we
	 * can't reuse the current MTT resources regardless of their size.
	 * Instead we'll need to alloc new ones (below) just as if there
	 * hadn't been enough room in the current entries.
	 */
	swrc_old = (tavor_sw_refcnt_t *)mr->mr_mttrefcntp->tr_addr;
	if (TAVOR_MTT_IS_NOT_SHARED(swrc_old) &&
	    (nummtt_needed <= nummtt_in_currrsrc)) {

		/*
		 * Unbind the old mapping for this memory region, but retain
		 * the ddi_dma_handle_t (if possible) for reuse in the bind
		 * operation below.  Note:  If original memory region was
		 * bound for IOMMU bypass and the new region can not use
		 * bypass, then a new DMA handle will be necessary.
		 */
		if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
			/* Clear bi_free_dmahdl so unbind keeps the handle */
			mr->mr_bindinfo.bi_free_dmahdl = 0;
			tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
			dmahdl = mr->mr_bindinfo.bi_dmahdl;
			reuse_dmahdl = 1;
		} else {
			tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
			dmahdl = NULL;
			reuse_dmahdl = 0;
		}

		/*
		 * Bind the new memory and determine the mapped addresses.
		 * As described, this routine and tavor_mr_fast_mtt_write()
		 * do the majority of the work for the memory registration
		 * operations.  Note:  When we successfully finish the binding,
		 * we will set the "bi_free_dmahdl" flag to indicate that
		 * even though we may have reused the ddi_dma_handle_t we do
		 * wish it to be freed up at some later time.  Note also that
		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
		 */
		bind->bi_bypass	= bind_type;
		status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
		if (status != DDI_SUCCESS) {
			if (reuse_dmahdl) {
				ddi_dma_free_handle(&dmahdl);
			}

			/*
			 * Deregister will be called upon returning failure
			 * from this routine. This will ensure that all
			 * current resources get properly freed up.
			 * Unnecessary to attempt to regain software ownership
			 * of the MPT entry as that has already been done
			 * above (in tavor_mr_reregister()).  Also unnecessary
			 * to attempt to unbind the memory.
			 */
			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
			goto mrrereghelp_fail;
		}
		if (reuse_dmahdl) {
			bind->bi_free_dmahdl = 1;
		}

		/*
		 * Using the new mapping, but reusing the current MTT
		 * resources, write the updated entries to MTT
		 */
		mtt    = mr->mr_mttrsrcp;
		status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
		if (status != DDI_SUCCESS) {
			/*
			 * Deregister will be called upon returning failure
			 * from this routine. This will ensure that all
			 * current resources get properly freed up.
			 * Unnecessary to attempt to regain software ownership
			 * of the MPT entry as that has already been done
			 * above (in tavor_mr_reregister()).  Also unnecessary
			 * to attempt to unbind the memory.
			 *
			 * But we do need to unbind the newly bound memory
			 * before returning.
			 */
			tavor_mr_mem_unbind(state, bind);
			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
			    "failed write mtt");
			goto mrrereghelp_fail;
		}

		/* Put the updated information into the Mem Region handle */
		mr->mr_bindinfo	  = *bind;
		mr->mr_logmttpgsz = mtt_pgsize_bits;

	} else {
		/*
		 * Check if the memory region MTT is shared by any other MRs.
		 * Since the resource may be shared between multiple memory
		 * regions (as a result of a "RegisterSharedMR()" verb) it is
		 * important that we not unbind any resources prematurely.
		 */
		if (!TAVOR_MTT_IS_SHARED(swrc_old)) {
			/*
			 * Unbind the old mapping for this memory region, but
			 * retain the ddi_dma_handle_t for reuse in the bind
			 * operation below. Note: This can only be done here
			 * because the region being reregistered is not
			 * currently shared.  Also if original memory region
			 * was bound for IOMMU bypass and the new region can
			 * not use bypass, then a new DMA handle will be
			 * necessary.
			 */
			if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
				/* Keep the handle across the unbind */
				mr->mr_bindinfo.bi_free_dmahdl = 0;
				tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
				dmahdl = mr->mr_bindinfo.bi_dmahdl;
				reuse_dmahdl = 1;
			} else {
				tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
				dmahdl = NULL;
				reuse_dmahdl = 0;
			}
		} else {
			/* Shared MTT: leave the old binding untouched */
			dmahdl = NULL;
			reuse_dmahdl = 0;
		}

		/*
		 * Bind the new memory and determine the mapped addresses.
		 * As described, this routine and tavor_mr_fast_mtt_write()
		 * do the majority of the work for the memory registration
		 * operations.  Note:  When we successfully finish the binding,
		 * we will set the "bi_free_dmahdl" flag to indicate that
		 * even though we may have reused the ddi_dma_handle_t we do
		 * wish it to be freed up at some later time.  Note also that
		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
		 */
		bind->bi_bypass	= bind_type;
		status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
		if (status != DDI_SUCCESS) {
			if (reuse_dmahdl) {
				ddi_dma_free_handle(&dmahdl);
			}

			/*
			 * Deregister will be called upon returning failure
			 * from this routine. This will ensure that all
			 * current resources get properly freed up.
			 * Unnecessary to attempt to regain software ownership
			 * of the MPT entry as that has already been done
			 * above (in tavor_mr_reregister()).  Also unnecessary
			 * to attempt to unbind the memory.
			 */
			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind");
			goto mrrereghelp_fail;
		}
		if (reuse_dmahdl) {
			bind->bi_free_dmahdl = 1;
		}

		/*
		 * Allocate the new MTT entries resource
		 */
		status = tavor_rsrc_alloc(state, TAVOR_MTT,
		    TAVOR_NUMMTT_TO_MTTSEG(nummtt_needed), sleep, &mtt);
		if (status != DDI_SUCCESS) {
			/*
			 * Deregister will be called upon returning failure
			 * from this routine. This will ensure that all
			 * current resources get properly freed up.
			 * Unnecessary to attempt to regain software ownership
			 * of the MPT entry as that has already been done
			 * above (in tavor_mr_reregister()).  Also unnecessary
			 * to attempt to unbind the memory.
			 *
			 * But we do need to unbind the newly bound memory
			 * before returning.
			 */
			tavor_mr_mem_unbind(state, bind);
			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT");
			goto mrrereghelp_fail;
		}

		/*
		 * Allocate MTT reference count (to track shared memory
		 * regions).  As mentioned elsewhere above, this reference
		 * count resource may never be used on the given memory region,
		 * but if it is ever later registered as a "shared" memory
		 * region then this resource will be necessary.  Note:  This
		 * is only necessary here if the existing memory region is
		 * already being shared (because otherwise we already have
		 * a useable reference count resource).
		 */
		if (TAVOR_MTT_IS_SHARED(swrc_old)) {
			status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1,
			    sleep, &mtt_refcnt);
			if (status != DDI_SUCCESS) {
				/*
				 * Deregister will be called upon returning
				 * failure from this routine. This will ensure
				 * that all current resources get properly
				 * freed up.  Unnecessary to attempt to regain
				 * software ownership of the MPT entry as that
				 * has already been done above (in
				 * tavor_mr_reregister()).  Also unnecessary
				 * to attempt to unbind the memory.
				 *
				 * But we need to unbind the newly bound
				 * memory and free up the newly allocated MTT
				 * entries before returning.
				 */
				tavor_mr_mem_unbind(state, bind);
				tavor_rsrc_free(state, &mtt);
				*dereg_level =
				    TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

				/* Set "status"/"errormsg", goto failure */
				TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
				    "failed reference count");
				goto mrrereghelp_fail;
			}
			swrc_new = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_new))
			TAVOR_MTT_REFCNT_INIT(swrc_new);
		} else {
			/* Region not shared: keep its current refcnt rsrc */
			mtt_refcnt = mr->mr_mttrefcntp;
		}

		/*
		 * Using the new mapping and the new MTT resources, write the
		 * updated entries to MTT
		 */
		status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
		if (status != DDI_SUCCESS) {
			/*
			 * Deregister will be called upon returning failure
			 * from this routine. This will ensure that all
			 * current resources get properly freed up.
			 * Unnecessary to attempt to regain software ownership
			 * of the MPT entry as that has already been done
			 * above (in tavor_mr_reregister()).  Also unnecessary
			 * to attempt to unbind the memory.
			 *
			 * But we need to unbind the newly bound memory,
			 * free up the newly allocated MTT entries, and
			 * (possibly) free the new MTT reference count
			 * resource before returning.
			 */
			if (TAVOR_MTT_IS_SHARED(swrc_old)) {
				tavor_rsrc_free(state, &mtt_refcnt);
			}
			tavor_mr_mem_unbind(state, bind);
			tavor_rsrc_free(state, &mtt);
			*dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed write mtt");
			goto mrrereghelp_fail;
		}

		/*
		 * Check if the memory region MTT is shared by any other MRs.
		 * Since the resource may be shared between multiple memory
		 * regions (as a result of a "RegisterSharedMR()" verb) it is
		 * important that we not free up any resources prematurely.
		 */
		if (TAVOR_MTT_IS_SHARED(swrc_old)) {
			/* Decrement MTT reference count for "old" region */
			(void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
		} else {
			/* Free up the old MTT entries resource */
			tavor_rsrc_free(state, &mr->mr_mttrsrcp);
		}

		/* Put the updated information into the mrhdl */
		mr->mr_bindinfo	  = *bind;
		mr->mr_logmttpgsz = mtt_pgsize_bits;
		mr->mr_mttrsrcp   = mtt;
		mr->mr_mttrefcntp = mtt_refcnt;
	}

	/*
	 * Calculate and return the updated MTT address (in the DDR address
	 * space).  This will be used by the caller (tavor_mr_reregister) in
	 * the updated MPT entry
	 */
	rsrc_pool	= &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
	*mtt_addr	= mtt_ddrbaseaddr + (mtt->tr_indx <<
	    TAVOR_MTT_SIZE_SHIFT);

	TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
	return (DDI_SUCCESS);

mrrereghelp_fail:
	TNF_PROBE_1(tavor_mr_rereg_xlat_helper_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper);
	return (status);
}
2492 
2493 
2494 /*
2495  * tavor_mr_nummtt_needed()
2496  *    Context: Can be called from interrupt or base context.
2497  */
2498 /* ARGSUSED */
2499 static uint64_t
2500 tavor_mr_nummtt_needed(tavor_state_t *state, tavor_bind_info_t *bind,
2501     uint_t *mtt_pgsize_bits)
2502 {
2503 	uint64_t	pg_offset_mask;
2504 	uint64_t	pg_offset, tmp_length;
2505 
2506 	/*
2507 	 * For now we specify the page size as 8Kb (the default page size for
2508 	 * the sun4u architecture), or 4Kb for x86.  Figure out optimal page
2509 	 * size by examining the dmacookies XXX
2510 	 */
2511 	*mtt_pgsize_bits = PAGESHIFT;
2512 
2513 	pg_offset_mask = ((uint64_t)1 << *mtt_pgsize_bits) - 1;
2514 	pg_offset = bind->bi_addr & pg_offset_mask;
2515 	tmp_length = pg_offset + (bind->bi_len - 1);
2516 	return ((tmp_length >> *mtt_pgsize_bits) + 1);
2517 }
2518 
2519 
2520 /*
2521  * tavor_mr_mem_bind()
2522  *    Context: Can be called from interrupt or base context.
2523  */
2524 static int
2525 tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
2526     ddi_dma_handle_t dmahdl, uint_t sleep)
2527 {
2528 	ddi_dma_attr_t	dma_attr;
2529 	int		(*callback)(caddr_t);
2530 	uint_t		dma_xfer_mode;
2531 	int		status;
2532 
2533 	/* bi_type must be set to a meaningful value to get a bind handle */
2534 	ASSERT(bind->bi_type == TAVOR_BINDHDL_VADDR ||
2535 	    bind->bi_type == TAVOR_BINDHDL_BUF ||
2536 	    bind->bi_type == TAVOR_BINDHDL_UBUF);
2537 
2538 	TAVOR_TNF_ENTER(tavor_mr_mem_bind);
2539 
2540 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2541 
2542 	/* Set the callback flag appropriately */
2543 	callback = (sleep == TAVOR_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT;
2544 
2545 	/* Determine whether to map STREAMING or CONSISTENT */
2546 	dma_xfer_mode = (bind->bi_flags & IBT_MR_NONCOHERENT) ?
2547 	    DDI_DMA_STREAMING : DDI_DMA_CONSISTENT;
2548 
2549 	/*
2550 	 * Initialize many of the default DMA attributes.  Then, if we're
2551 	 * bypassing the IOMMU, set the DDI_DMA_FORCE_PHYSICAL flag.
2552 	 */
2553 	if (dmahdl == NULL) {
2554 		tavor_dma_attr_init(&dma_attr);
2555 #ifdef	__sparc
2556 		/*
2557 		 * First, disable streaming and switch to consistent if
2558 		 * configured to do so and IOMMU BYPASS is enabled.
2559 		 */
2560 		if (state->ts_cfg_profile->cp_disable_streaming_on_bypass &&
2561 		    dma_xfer_mode == DDI_DMA_STREAMING &&
2562 		    bind->bi_bypass == TAVOR_BINDMEM_BYPASS) {
2563 			dma_xfer_mode = DDI_DMA_CONSISTENT;
2564 		}
2565 
2566 		/*
2567 		 * Then, if streaming is still specified, then "bypass" is not
2568 		 * allowed.
2569 		 */
2570 		if ((dma_xfer_mode == DDI_DMA_CONSISTENT) &&
2571 		    (bind->bi_bypass == TAVOR_BINDMEM_BYPASS)) {
2572 			dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2573 		}
2574 #endif
2575 		/* Allocate a DMA handle for the binding */
2576 		status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr,
2577 		    callback, NULL, &bind->bi_dmahdl);
2578 		if (status != DDI_SUCCESS) {
2579 			TNF_PROBE_0(tavor_mr_mem_bind_dmahdl_fail,
2580 			    TAVOR_TNF_ERROR, "");
2581 			TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2582 			return (status);
2583 		}
2584 		bind->bi_free_dmahdl = 1;
2585 
2586 	} else  {
2587 		bind->bi_dmahdl = dmahdl;
2588 		bind->bi_free_dmahdl = 0;
2589 	}
2590 
2591 	/*
2592 	 * Bind the memory to get the PCI mapped addresses.  The decision
2593 	 * to call ddi_dma_addr_bind_handle() or ddi_dma_buf_bind_handle()
2594 	 * is determined by the "bi_type" flag.  Note: if the bind operation
2595 	 * fails then we have to free up the DMA handle and return error.
2596 	 */
2597 	if (bind->bi_type == TAVOR_BINDHDL_VADDR) {
2598 		status = ddi_dma_addr_bind_handle(bind->bi_dmahdl, NULL,
2599 		    (caddr_t)(uintptr_t)bind->bi_addr, bind->bi_len,
2600 		    (DDI_DMA_RDWR | dma_xfer_mode), callback, NULL,
2601 		    &bind->bi_dmacookie, &bind->bi_cookiecnt);
2602 	} else { /* TAVOR_BINDHDL_BUF || TAVOR_BINDHDL_UBUF */
2603 		status = ddi_dma_buf_bind_handle(bind->bi_dmahdl,
2604 		    bind->bi_buf, (DDI_DMA_RDWR | dma_xfer_mode), callback,
2605 		    NULL, &bind->bi_dmacookie, &bind->bi_cookiecnt);
2606 	}
2607 
2608 	if (status != DDI_DMA_MAPPED) {
2609 		if (bind->bi_free_dmahdl != 0) {
2610 			ddi_dma_free_handle(&bind->bi_dmahdl);
2611 		}
2612 		TNF_PROBE_0(tavor_mr_mem_bind_dmabind_fail, TAVOR_TNF_ERROR,
2613 		    "");
2614 		TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2615 		return (status);
2616 	}
2617 
2618 	TAVOR_TNF_EXIT(tavor_mr_mem_bind);
2619 	return (DDI_SUCCESS);
2620 }
2621 
2622 
2623 /*
2624  * tavor_mr_mem_unbind()
2625  *    Context: Can be called from interrupt or base context.
2626  */
2627 static void
2628 tavor_mr_mem_unbind(tavor_state_t *state, tavor_bind_info_t *bind)
2629 {
2630 	int	status;
2631 
2632 	TAVOR_TNF_ENTER(tavor_mr_mem_unbind);
2633 
2634 	/*
2635 	 * In case of TAVOR_BINDHDL_UBUF, the memory bi_buf points to
2636 	 * is actually allocated by ddi_umem_iosetup() internally, then
2637 	 * it's required to free it here. Reset bi_type to TAVOR_BINDHDL_NONE
2638 	 * not to free it again later.
2639 	 */
2640 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2641 	if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
2642 		freerbuf(bind->bi_buf);
2643 		bind->bi_type = TAVOR_BINDHDL_NONE;
2644 	}
2645 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
2646 
2647 	/*
2648 	 * Unbind the DMA memory for the region
2649 	 *
2650 	 * Note: The only way ddi_dma_unbind_handle() currently
2651 	 * can return an error is if the handle passed in is invalid.
2652 	 * Since this should never happen, we choose to return void
2653 	 * from this function!  If this does return an error, however,
2654 	 * then we print a warning message to the console.
2655 	 */
2656 	status = ddi_dma_unbind_handle(bind->bi_dmahdl);
2657 	if (status != DDI_SUCCESS) {
2658 		TAVOR_WARNING(state, "failed to unbind DMA mapping");
2659 		TNF_PROBE_0(tavor_mr_mem_unbind_dmaunbind_fail,
2660 		    TAVOR_TNF_ERROR, "");
2661 		TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2662 		return;
2663 	}
2664 
2665 	/* Free up the DMA handle */
2666 	if (bind->bi_free_dmahdl != 0) {
2667 		ddi_dma_free_handle(&bind->bi_dmahdl);
2668 	}
2669 
2670 	TAVOR_TNF_EXIT(tavor_mr_mem_unbind);
2671 }
2672 
2673 
2674 /*
2675  * tavor_mr_fast_mtt_write()
2676  *    Context: Can be called from interrupt or base context.
2677  */
2678 static int
2679 tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
2680     uint32_t mtt_pgsize_bits)
2681 {
2682 	ddi_dma_cookie_t	dmacookie;
2683 	uint_t			cookie_cnt;
2684 	uint64_t		*mtt_table;
2685 	uint64_t		mtt_entry;
2686 	uint64_t		addr, endaddr;
2687 	uint64_t		pagesize;
2688 	int			i;
2689 
2690 	TAVOR_TNF_ENTER(tavor_mr_fast_mtt_write);
2691 
2692 	/* Calculate page size from the suggested value passed in */
2693 	pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2694 
2695 	/*
2696 	 * Walk the "cookie list" and fill in the MTT table entries
2697 	 */
2698 	i = 0;
2699 	mtt_table  = (uint64_t *)mtt->tr_addr;
2700 	dmacookie  = bind->bi_dmacookie;
2701 	cookie_cnt = bind->bi_cookiecnt;
2702 	while (cookie_cnt-- > 0) {
2703 		addr	= dmacookie.dmac_laddress;
2704 		endaddr = addr + (dmacookie.dmac_size - 1);
2705 		addr	= addr & ~((uint64_t)pagesize - 1);
2706 		while (addr <= endaddr) {
2707 			/*
2708 			 * Fill in the mapped addresses (calculated above) and
2709 			 * set TAVOR_MTT_ENTRY_PRESET flag for each MTT entry.
2710 			 */
2711 			mtt_entry = addr | TAVOR_MTT_ENTRY_PRESET;
2712 			ddi_put64(mtt->tr_acchdl, &mtt_table[i], mtt_entry);
2713 			addr += pagesize;
2714 			i++;
2715 
2716 			if (addr == 0) {
2717 				static int do_once = 1;
2718 				_NOTE(SCHEME_PROTECTS_DATA("safe sharing",
2719 				    do_once))
2720 				if (do_once) {
2721 					do_once = 0;
2722 					cmn_err(CE_NOTE, "probable error in "
2723 					    "dma_cookie address from caller\n");
2724 				}
2725 				break;
2726 			}
2727 		}
2728 
2729 		/*
2730 		 * When we've reached the end of the current DMA cookie,
2731 		 * jump to the next cookie (if there are more)
2732 		 */
2733 		if (cookie_cnt != 0) {
2734 			ddi_dma_nextcookie(bind->bi_dmahdl, &dmacookie);
2735 		}
2736 	}
2737 
2738 	TAVOR_TNF_EXIT(tavor_mr_fast_mtt_write);
2739 	return (DDI_SUCCESS);
2740 }
2741 
2742 /*
2743  * tavor_mtt_refcnt_inc()
2744  *    Context: Can be called from interrupt or base context.
2745  */
2746 static int
2747 tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc)
2748 {
2749 	tavor_sw_refcnt_t *rc;
2750 	uint32_t	  cnt;
2751 
2752 	rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
2753 
2754 	/* Increment the MTT's reference count */
2755 	mutex_enter(&rc->swrc_lock);
2756 	TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_inc, TAVOR_TNF_TRACE, "",
2757 	    tnf_uint, refcnt, rc->swrc_refcnt);
2758 	cnt = rc->swrc_refcnt++;
2759 	mutex_exit(&rc->swrc_lock);
2760 
2761 	return (cnt);
2762 }
2763 
2764 
2765 /*
2766  * tavor_mtt_refcnt_dec()
2767  *    Context: Can be called from interrupt or base context.
2768  */
2769 static int
2770 tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc)
2771 {
2772 	tavor_sw_refcnt_t *rc;
2773 	uint32_t	  cnt;
2774 
2775 	rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
2776 
2777 	/* Decrement the MTT's reference count */
2778 	mutex_enter(&rc->swrc_lock);
2779 	cnt = --rc->swrc_refcnt;
2780 	TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_dec, TAVOR_TNF_TRACE, "",
2781 	    tnf_uint, refcnt, rc->swrc_refcnt);
2782 	mutex_exit(&rc->swrc_lock);
2783 
2784 	return (cnt);
2785 }
2786