xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/hermon/hermon_mr.c (revision 4bbb904c4a2c2409e0f6cca1c7e540a129337678)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * hermon_mr.c
29  *    Hermon Memory Region/Window Routines
30  *
31  *    Implements all the routines necessary to provide the requisite memory
32  *    registration verbs.  These include operations like RegisterMemRegion(),
33  *    DeregisterMemRegion(), ReregisterMemRegion, RegisterSharedMemRegion,
34  *    etc., that affect Memory Regions.  It also includes the verbs that
35  *    affect Memory Windows, including AllocMemWindow(), FreeMemWindow(),
36  *    and QueryMemWindow().
37  */
38 
39 #include <sys/types.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/modctl.h>
44 #include <sys/esunddi.h>
45 
46 #include <sys/ib/adapters/hermon/hermon.h>
47 
48 extern uint32_t hermon_kernel_data_ro;
49 extern uint32_t hermon_user_data_ro;
50 
51 /*
52  * Used by hermon_mr_keycalc() below to fill in the "unconstrained" portion
53  * of Hermon memory keys (LKeys and RKeys)
54  */
55 static	uint_t hermon_memkey_cnt = 0x00;
56 #define	HERMON_MEMKEY_SHIFT	 24
57 #define	HERMON_MPT_SW_OWNERSHIP	 0xF
58 
59 static int hermon_mr_common_reg(hermon_state_t *state, hermon_pdhdl_t pd,
60     hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl, hermon_mr_options_t *op,
61     hermon_mpt_rsrc_type_t mpt_type);
62 static int hermon_mr_common_rereg(hermon_state_t *state, hermon_mrhdl_t mr,
63     hermon_pdhdl_t pd, hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl_new,
64     hermon_mr_options_t *op);
65 static int hermon_mr_rereg_xlat_helper(hermon_state_t *state, hermon_mrhdl_t mr,
66     hermon_bind_info_t *bind, hermon_mr_options_t *op, uint64_t *mtt_addr,
67     uint_t sleep, uint_t *dereg_level);
68 static uint64_t hermon_mr_nummtt_needed(hermon_state_t *state,
69     hermon_bind_info_t *bind, uint_t *mtt_pgsize);
70 static int hermon_mr_mem_bind(hermon_state_t *state, hermon_bind_info_t *bind,
71     ddi_dma_handle_t dmahdl, uint_t sleep, uint_t is_buffer);
72 static void hermon_mr_mem_unbind(hermon_state_t *state,
73     hermon_bind_info_t *bind);
74 static int hermon_mr_fast_mtt_write(hermon_state_t *state, hermon_rsrc_t *mtt,
75     hermon_bind_info_t *bind, uint32_t mtt_pgsize_bits);
76 static int hermon_mr_fast_mtt_write_fmr(hermon_rsrc_t *mtt,
77     ibt_pmr_attr_t *mem_pattr, uint32_t mtt_pgsize_bits);
78 static uint_t hermon_mtt_refcnt_inc(hermon_rsrc_t *rsrc);
79 static uint_t hermon_mtt_refcnt_dec(hermon_rsrc_t *rsrc);
80 
81 
82 /*
83  * The Hermon umem_lockmemory() callback ops.  When userland memory is
84  * registered, these callback ops are specified.  The hermon_umap_umemlock_cb()
85  * callback will be called whenever the memory for the corresponding
86  * ddi_umem_cookie_t is being freed.
87  */
88 static struct umem_callback_ops hermon_umem_cbops = {
89 	UMEM_CALLBACK_VERSION,
90 	hermon_umap_umemlock_cb,
91 };
92 
93 
94 
95 /*
96  * hermon_mr_register()
97  *    Context: Can be called from interrupt or base context.
98  */
99 int
100 hermon_mr_register(hermon_state_t *state, hermon_pdhdl_t pd,
101     ibt_mr_attr_t *mr_attr, hermon_mrhdl_t *mrhdl, hermon_mr_options_t *op,
102     hermon_mpt_rsrc_type_t mpt_type)
103 {
104 	hermon_bind_info_t	bind;
105 	int			status;
106 
107 	/*
108 	 * Fill in the "bind" struct.  This struct provides the majority
109 	 * of the information that will be used to distinguish between an
110 	 * "addr" binding (as is the case here) and a "buf" binding (see
111 	 * below).  The "bind" struct is later passed to hermon_mr_mem_bind()
112 	 * which does most of the "heavy lifting" for the Hermon memory
113 	 * registration routines.
114 	 */
115 	bind.bi_type  = HERMON_BINDHDL_VADDR;
116 	bind.bi_addr  = mr_attr->mr_vaddr;
117 	bind.bi_len   = mr_attr->mr_len;
118 	bind.bi_as    = mr_attr->mr_as;
119 	bind.bi_flags = mr_attr->mr_flags;
120 	status = hermon_mr_common_reg(state, pd, &bind, mrhdl, op,
121 	    mpt_type);
122 	return (status);
123 }
124 
125 
126 /*
127  * hermon_mr_register_buf()
128  *    Context: Can be called from interrupt or base context.
129  */
130 int
131 hermon_mr_register_buf(hermon_state_t *state, hermon_pdhdl_t pd,
132     ibt_smr_attr_t *mr_attr, struct buf *buf, hermon_mrhdl_t *mrhdl,
133     hermon_mr_options_t *op, hermon_mpt_rsrc_type_t mpt_type)
134 {
135 	hermon_bind_info_t	bind;
136 	int			status;
137 
138 	/*
139 	 * Fill in the "bind" struct.  This struct provides the majority
140 	 * of the information that will be used to distinguish between an
141 	 * "addr" binding (see above) and a "buf" binding (as is the case
142 	 * here).  The "bind" struct is later passed to hermon_mr_mem_bind()
143 	 * which does most of the "heavy lifting" for the Hermon memory
144 	 * registration routines.  Note: We have chosen to provide
145 	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
146 	 * not set).  It is not critical what value we choose here as it need
147 	 * only be unique for the given RKey (which will happen by default),
148 	 * so the choice here is somewhat arbitrary.
149 	 */
150 	bind.bi_type  = HERMON_BINDHDL_BUF;
151 	bind.bi_buf   = buf;
152 	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
153 		bind.bi_addr  = mr_attr->mr_vaddr;
154 	} else {
155 		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
156 	}
157 	bind.bi_as    = NULL;
158 	bind.bi_len   = (uint64_t)buf->b_bcount;
159 	bind.bi_flags = mr_attr->mr_flags;
160 	status = hermon_mr_common_reg(state, pd, &bind, mrhdl, op, mpt_type);
161 	return (status);
162 }
163 
164 
/*
 * hermon_mr_register_shared()
 *    Context: Can be called from interrupt or base context.
 *
 *    Registers a new memory region that shares the already-bound MTT
 *    entries of an existing region ("mrhdl").  A new MPT entry (with new
 *    keys) is allocated, the shared MTT reference count is incremented,
 *    and ownership of the new MPT is passed to the hardware.  For
 *    userland memory the pages are pinned a second time so that this
 *    handle gets its own umem callback.
 */
int
hermon_mr_register_shared(hermon_state_t *state, hermon_mrhdl_t mrhdl,
    hermon_pdhdl_t pd, ibt_smr_attr_t *mr_attr, hermon_mrhdl_t *mrhdl_new)
{
	hermon_rsrc_t		*mpt, *mtt, *rsrc;
	hermon_umap_db_entry_t	*umapdb;
	hermon_hw_dmpt_t	mpt_entry;
	hermon_mrhdl_t		mr;
	hermon_bind_info_t	*bind;
	ddi_umem_cookie_t	umem_cookie;
	size_t			umem_len;
	caddr_t			umem_addr;
	uint64_t		mtt_addr, pgsize_msk;
	uint_t			sleep, mr_is_umem;
	int			status, umem_flags;

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (mr_attr->mr_flags & IBT_MR_NOSLEEP) ? HERMON_NOSLEEP :
	    HERMON_SLEEP;
	if ((sleep == HERMON_SLEEP) &&
	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
		status = IBT_INVALID_PARAM;
		goto mrshared_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	hermon_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry.  This will be filled in with all the
	 * necessary parameters to define the shared memory region.
	 * Specifically, it will be made to reference the currently existing
	 * MTT entries and ownership of the MPT will be passed to the hardware
	 * in the last step below.  If we fail here, we must undo the
	 * protection domain reference count.
	 */
	status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		status = IBT_INSUFF_RESOURCE;
		goto mrshared_fail1;
	}

	/*
	 * Allocate the software structure for tracking the shared memory
	 * region (i.e. the Hermon Memory Region handle).  If we fail here, we
	 * must undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		status = IBT_INSUFF_RESOURCE;
		goto mrshared_fail2;
	}
	mr = (hermon_mrhdl_t)rsrc->hr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/*
	 * Setup and validate the memory region access flags.  This means
	 * translating the IBTF's enable flags into the access flags that
	 * will be used in later operations.
	 */
	mr->mr_accflag = 0;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_WINDOW_BIND)
		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_READ)
		mr->mr_accflag |= IBT_MR_REMOTE_READ;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/* Grab the MR lock for the current memory region */
	mutex_enter(&mrhdl->mr_lock);

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
	 * If so, this is an error, return failure.
	 */
	if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
		mutex_exit(&mrhdl->mr_lock);
		status = IBT_MR_HDL_INVALID;
		goto mrshared_fail3;
	}

	/*
	 * Determine if the original memory was from userland and, if so, pin
	 * the pages (again) with umem_lockmemory().  This will guarantee a
	 * separate callback for each of this shared region's MR handles.
	 * If this is userland memory, then allocate an entry in the
	 * "userland resources database".  This will later be added to
	 * the database (after all further memory registration operations are
	 * successful).  If we fail here, we must undo all the above setup.
	 */
	mr_is_umem = mrhdl->mr_is_umem;
	if (mr_is_umem) {
		/* Round the lock request out to whole pages */
		umem_len   = ptob(btopr(mrhdl->mr_bindinfo.bi_len));
		umem_addr  = (caddr_t)((uintptr_t)mrhdl->mr_bindinfo.bi_addr &
		    ~PAGEOFFSET);
		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
		    DDI_UMEMLOCK_LONGTERM);
		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
		    &umem_cookie, &hermon_umem_cbops, curproc);
		if (status != 0) {
			mutex_exit(&mrhdl->mr_lock);
			status = IBT_INSUFF_RESOURCE;
			goto mrshared_fail3;
		}

		umapdb = hermon_umap_db_alloc(state->hs_instance,
		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			mutex_exit(&mrhdl->mr_lock);
			status = IBT_INSUFF_RESOURCE;
			goto mrshared_fail4;
		}
	}

	/*
	 * Copy the MTT resource pointer (and additional parameters) from
	 * the original Hermon Memory Region handle.  Note: this is normally
	 * where the hermon_mr_mem_bind() routine would be called, but because
	 * we already have bound and filled-in MTT entries it is simply a
	 * matter here of managing the MTT reference count and grabbing the
	 * address of the MTT table entries (for filling in the shared region's
	 * MPT entry).
	 */
	mr->mr_mttrsrcp	  = mrhdl->mr_mttrsrcp;
	mr->mr_logmttpgsz = mrhdl->mr_logmttpgsz;
	mr->mr_bindinfo	  = mrhdl->mr_bindinfo;
	mr->mr_mttrefcntp = mrhdl->mr_mttrefcntp;
	mutex_exit(&mrhdl->mr_lock);
	bind = &mr->mr_bindinfo;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
	mtt = mr->mr_mttrsrcp;

	/*
	 * Increment the MTT reference count (to reflect the fact that
	 * the MTT is now shared)
	 */
	(void) hermon_mtt_refcnt_inc(mr->mr_mttrefcntp);

	/*
	 * Update the new "bind" virtual address.  Do some extra work here
	 * to ensure proper alignment.  That is, make sure that the page
	 * offset for the beginning of the old range is the same as the
	 * offset for this new mapping
	 */
	pgsize_msk = (((uint64_t)1 << mr->mr_logmttpgsz) - 1);
	bind->bi_addr = ((mr_attr->mr_vaddr & ~pgsize_msk) |
	    (mr->mr_bindinfo.bi_addr & pgsize_msk));

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Hermon hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.lr	  = 1;
	mpt_entry.reg_win = HERMON_MPT_IS_REGION;
	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
	mpt_entry.mem_key	= mr->mr_lkey;
	mpt_entry.pd		= pd->pd_pdnum;
	mpt_entry.start_addr	= bind->bi_addr;
	mpt_entry.reg_win_len	= bind->bi_len;
	/*
	 * Convert the (shared) MTT index into the MTT table address that is
	 * programmed into the dMPT.  The low word is stored shifted down by
	 * 3 -- presumably because MTT entries are 8-byte aligned and the
	 * hardware implies the low-order bits (NOTE(review): confirm
	 * against the dMPT layout in hermon_hw_dmpt_t / the PRM).
	 */
	mtt_addr = (mtt->hr_indx << HERMON_MTT_SIZE_SHIFT);
	mpt_entry.mtt_addr_h = mtt_addr >> 32;
	mpt_entry.mtt_addr_l = mtt_addr >> 3;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
	if (status != HERMON_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
		    status);
		if (status == HERMON_CMD_INVALID_STATUS) {
			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
		}
		status = ibc_get_ci_failure(0);
		goto mrshared_fail5;
	}

	/*
	 * Fill in the rest of the Hermon Memory Region handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MR.
	 */
	mr->mr_mptrsrcp	  = mpt;
	mr->mr_mttrsrcp	  = mtt;
	mr->mr_mpt_type	  = HERMON_MPT_DMPT;
	mr->mr_pdhdl	  = pd;
	mr->mr_rsrcp	  = rsrc;
	mr->mr_is_umem	  = mr_is_umem;
	mr->mr_is_fmr	  = 0;
	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
	mr->mr_umem_cbfunc = NULL;
	mr->mr_umem_cbarg1 = NULL;
	mr->mr_umem_cbarg2 = NULL;
	/*
	 * NOTE(review): hermon_mr_key_swap() is defined elsewhere; it is
	 * assumed to put the keys into their externally visible form --
	 * confirm its semantics before relying on the raw key values above
	 * this point.
	 */
	mr->mr_lkey	   = hermon_mr_key_swap(mr->mr_lkey);
	mr->mr_rkey	   = hermon_mr_key_swap(mr->mr_rkey);

	/*
	 * If this is userland memory, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later coordination between the hermon_umap_umemlock_cb()
	 * callback and hermon_mr_deregister().
	 */
	if (mr_is_umem) {
		hermon_umap_db_add(umapdb);
	}

	*mrhdl_new = mr;

	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
mrshared_fail5:
	(void) hermon_mtt_refcnt_dec(mr->mr_mttrefcntp);
	if (mr_is_umem) {
		hermon_umap_db_free(umapdb);
	}
mrshared_fail4:
	if (mr_is_umem) {
		ddi_umem_unlock(umem_cookie);
	}
mrshared_fail3:
	hermon_rsrc_free(state, &rsrc);
mrshared_fail2:
	hermon_rsrc_free(state, &mpt);
mrshared_fail1:
	hermon_pd_refcnt_dec(pd);
mrshared_fail:
	return (status);
}
439 
440 /*
441  * hermon_mr_alloc_fmr()
442  *    Context: Can be called from interrupt or base context.
443  */
444 int
445 hermon_mr_alloc_fmr(hermon_state_t *state, hermon_pdhdl_t pd,
446     hermon_fmrhdl_t fmr_pool, hermon_mrhdl_t *mrhdl)
447 {
448 	hermon_rsrc_t		*mpt, *mtt, *rsrc;
449 	hermon_hw_dmpt_t		mpt_entry;
450 	hermon_mrhdl_t		mr;
451 	hermon_bind_info_t	bind;
452 	uint64_t		mtt_addr;
453 	uint64_t		nummtt;
454 	uint_t			sleep, mtt_pgsize_bits;
455 	int			status;
456 
457 	/*
458 	 * Check the sleep flag.  Ensure that it is consistent with the
459 	 * current thread context (i.e. if we are currently in the interrupt
460 	 * context, then we shouldn't be attempting to sleep).
461 	 */
462 	sleep = (fmr_pool->fmr_flags & IBT_MR_SLEEP) ? HERMON_SLEEP :
463 	    HERMON_NOSLEEP;
464 	if ((sleep == HERMON_SLEEP) &&
465 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
466 		return (IBT_INVALID_PARAM);
467 	}
468 
469 	/* Increment the reference count on the protection domain (PD) */
470 	hermon_pd_refcnt_inc(pd);
471 
472 	/*
473 	 * Allocate an MPT entry.  This will be filled in with all the
474 	 * necessary parameters to define the FMR.  Specifically, it will be
475 	 * made to reference the currently existing MTT entries and ownership
476 	 * of the MPT will be passed to the hardware in the last step below.
477 	 * If we fail here, we must undo the protection domain reference count.
478 	 */
479 
480 	status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
481 	if (status != DDI_SUCCESS) {
482 		status = IBT_INSUFF_RESOURCE;
483 		goto fmralloc_fail1;
484 	}
485 
486 	/*
487 	 * Allocate the software structure for tracking the fmr memory
488 	 * region (i.e. the Hermon Memory Region handle).  If we fail here, we
489 	 * must undo the protection domain reference count and the previous
490 	 * resource allocation.
491 	 */
492 	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
493 	if (status != DDI_SUCCESS) {
494 		status = IBT_INSUFF_RESOURCE;
495 		goto fmralloc_fail2;
496 	}
497 	mr = (hermon_mrhdl_t)rsrc->hr_addr;
498 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
499 
500 	/*
501 	 * Setup and validate the memory region access flags.  This means
502 	 * translating the IBTF's enable flags into the access flags that
503 	 * will be used in later operations.
504 	 */
505 	mr->mr_accflag = 0;
506 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
507 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
508 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_READ)
509 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
510 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
511 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
512 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
513 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
514 
515 	/*
516 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
517 	 * from a certain number of "constrained" bits (the least significant
518 	 * bits) and some number of "unconstrained" bits.  The constrained
519 	 * bits must be set to the index of the entry in the MPT table, but
520 	 * the unconstrained bits can be set to any value we wish.  Note:
521 	 * if no remote access is required, then the RKey value is not filled
522 	 * in.  Otherwise both Rkey and LKey are given the same value.
523 	 */
524 	mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
525 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
526 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
527 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
528 		mr->mr_rkey = mr->mr_lkey;
529 	}
530 
531 	/*
532 	 * Determine number of pages spanned.  This routine uses the
533 	 * information in the "bind" struct to determine the required
534 	 * number of MTT entries needed (and returns the suggested page size -
535 	 * as a "power-of-2" - for each MTT entry).
536 	 */
537 	/* Assume address will be page aligned later */
538 	bind.bi_addr = 0;
539 	/* Calculate size based on given max pages */
540 	bind.bi_len = fmr_pool->fmr_max_pages << PAGESHIFT;
541 	nummtt = hermon_mr_nummtt_needed(state, &bind, &mtt_pgsize_bits);
542 
543 	/*
544 	 * Allocate the MTT entries.  Use the calculations performed above to
545 	 * allocate the required number of MTT entries.  If we fail here, we
546 	 * must not only undo all the previous resource allocation (and PD
547 	 * reference count), but we must also unbind the memory.
548 	 */
549 	status = hermon_rsrc_alloc(state, HERMON_MTT, nummtt, sleep, &mtt);
550 	if (status != DDI_SUCCESS) {
551 		status = IBT_INSUFF_RESOURCE;
552 		goto fmralloc_fail3;
553 	}
554 	mr->mr_logmttpgsz = mtt_pgsize_bits;
555 
556 	/*
557 	 * Fill in the MPT entry.  This is the final step before passing
558 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
559 	 * the information collected/calculated above to fill in the
560 	 * requisite portions of the MPT.
561 	 */
562 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
563 	mpt_entry.en_bind = 0;
564 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
565 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
566 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
567 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
568 	mpt_entry.lr	  = 1;
569 	mpt_entry.reg_win = HERMON_MPT_IS_REGION;
570 	mpt_entry.pd		= pd->pd_pdnum;
571 
572 	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
573 	mtt_addr = (mtt->hr_indx << HERMON_MTT_SIZE_SHIFT);
574 	mpt_entry.mtt_addr_h = mtt_addr >> 32;
575 	mpt_entry.mtt_addr_l = mtt_addr >> 3;
576 	mpt_entry.mem_key = mr->mr_lkey;
577 
578 	/*
579 	 * FMR sets these to 0 for now.  Later during actual fmr registration
580 	 * these values are filled in.
581 	 */
582 	mpt_entry.start_addr	= 0;
583 	mpt_entry.reg_win_len	= 0;
584 
585 	/*
586 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
587 	 * the entry to the hardware.  Note: in general, this operation
588 	 * shouldn't fail.  But if it does, we have to undo everything we've
589 	 * done above before returning error.
590 	 */
591 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
592 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
593 	if (status != HERMON_CMD_SUCCESS) {
594 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
595 		    status);
596 		if (status == HERMON_CMD_INVALID_STATUS) {
597 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
598 		}
599 		status = ibc_get_ci_failure(0);
600 		goto fmralloc_fail4;
601 	}
602 
603 	/*
604 	 * Fill in the rest of the Hermon Memory Region handle.  Having
605 	 * successfully transferred ownership of the MPT, we can update the
606 	 * following fields for use in further operations on the MR.  Also, set
607 	 * that this is an FMR region.
608 	 */
609 	mr->mr_mptrsrcp	  = mpt;
610 	mr->mr_mttrsrcp	  = mtt;
611 	mr->mr_mpt_type   = HERMON_MPT_DMPT;
612 	mr->mr_pdhdl	  = pd;
613 	mr->mr_rsrcp	  = rsrc;
614 	mr->mr_is_fmr	  = 1;
615 	mr->mr_lkey	   = hermon_mr_key_swap(mr->mr_lkey);
616 	mr->mr_rkey	   = hermon_mr_key_swap(mr->mr_rkey);
617 	(void) memcpy(&mr->mr_bindinfo, &bind, sizeof (hermon_bind_info_t));
618 
619 	*mrhdl = mr;
620 
621 	return (DDI_SUCCESS);
622 
623 /*
624  * The following is cleanup for all possible failure cases in this routine
625  */
626 fmralloc_fail4:
627 	kmem_free(mtt, sizeof (hermon_rsrc_t) * nummtt);
628 fmralloc_fail3:
629 	hermon_rsrc_free(state, &rsrc);
630 fmralloc_fail2:
631 	hermon_rsrc_free(state, &mpt);
632 fmralloc_fail1:
633 	hermon_pd_refcnt_dec(pd);
634 fmralloc_fail:
635 	return (status);
636 }
637 
/*
 * hermon_mr_register_physical_fmr()
 *    Context: Can be called from interrupt or base context.
 *
 *    Performs the "fast" (re-)registration of a pre-allocated FMR.  The
 *    dMPT entry is updated in place with ddi_put*() accesses rather than
 *    a mailbox ownership command, bracketed by writes of the MPT status
 *    byte: software ownership (0xF, see HERMON_MPT_SW_OWNERSHIP) is set
 *    first and hardware ownership (0x0) restored last.
 */
/*ARGSUSED*/
int
hermon_mr_register_physical_fmr(hermon_state_t *state,
    ibt_pmr_attr_t *mem_pattr_p, hermon_mrhdl_t mr, ibt_pmr_desc_t *mem_desc_p)
{
	hermon_rsrc_t		*mpt;
	uint64_t		*mpt_table;
	int			status;

	mutex_enter(&mr->mr_lock);
	mpt = mr->mr_mptrsrcp;
	/* Treat the mapped dMPT entry as an array of 64-bit words */
	mpt_table = (uint64_t *)mpt->hr_addr;

	/*
	 * Write MPT status to SW bit.  Claiming software ownership (0xF)
	 * must happen before any of the other dMPT fields are modified
	 * below.
	 */
	ddi_put8(mpt->hr_acchdl, (uint8_t *)&mpt_table[0], 0xF);

	/*
	 * Write the mapped addresses into the MTT entries.  FMR needs to do
	 * this a little differently, so we call the fmr specific fast mtt
	 * write here.
	 */
	status = hermon_mr_fast_mtt_write_fmr(mr->mr_mttrsrcp, mem_pattr_p,
	    mr->mr_logmttpgsz);
	if (status != DDI_SUCCESS) {
		mutex_exit(&mr->mr_lock);
		status = ibc_get_ci_failure(0);
		goto fmr_reg_fail1;
	}

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/*
	 * Update the individual dMPT fields in place.  NOTE(review): the
	 * word offsets used below (mem_key at [1], start addr at [2],
	 * length at [3], lkey at [4]) are assumed to match the
	 * hermon_hw_dmpt_t layout -- confirm against the structure
	 * definition before changing them.
	 */
	/* write mem key value */
	ddi_put32(mpt->hr_acchdl, (uint32_t *)&mpt_table[1], mr->mr_lkey);

	/* write length value */
	ddi_put64(mpt->hr_acchdl, &mpt_table[3], mem_pattr_p->pmr_len);

	/* write start addr value */
	ddi_put64(mpt->hr_acchdl, &mpt_table[2], mem_pattr_p->pmr_iova);

	/* write lkey value */
	ddi_put32(mpt->hr_acchdl, (uint32_t *)&mpt_table[4], mr->mr_lkey);

	/*
	 * Write MPT status to HW bit.  This must be the last dMPT write:
	 * clearing the status byte returns ownership of the entry to the
	 * hardware.
	 */
	ddi_put8(mpt->hr_acchdl, (uint8_t *)&mpt_table[0], 0x0);

	/* Fill in return parameters */
	mem_desc_p->pmd_lkey = mr->mr_lkey;
	mem_desc_p->pmd_rkey = mr->mr_rkey;
	mem_desc_p->pmd_iova = mem_pattr_p->pmr_iova;
	mem_desc_p->pmd_phys_buf_list_sz = mem_pattr_p->pmr_len;

	/*
	 * Fill in MR bindinfo struct for later sync or query operations.
	 * NOTE(review): only bi_addr and bi_flags are refreshed here;
	 * bi_len is left as previously set -- confirm that later consumers
	 * of the bindinfo do not rely on it reflecting pmr_len.
	 */
	mr->mr_bindinfo.bi_addr = mem_pattr_p->pmr_iova;
	mr->mr_bindinfo.bi_flags = mem_pattr_p->pmr_flags & IBT_MR_NONCOHERENT;

	mutex_exit(&mr->mr_lock);

	return (DDI_SUCCESS);

fmr_reg_fail1:
	/*
	 * Note, we fail here, and purposely leave the memory ownership in
	 * software.  The memory tables may be corrupt, so we leave the region
	 * unregistered.
	 */
	return (DDI_FAILURE);
}
724 
725 
726 /*
727  * hermon_mr_deregister()
728  *    Context: Can be called from interrupt or base context.
729  */
730 /* ARGSUSED */
731 int
732 hermon_mr_deregister(hermon_state_t *state, hermon_mrhdl_t *mrhdl, uint_t level,
733     uint_t sleep)
734 {
735 	hermon_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
736 	hermon_umap_db_entry_t	*umapdb;
737 	hermon_pdhdl_t		pd;
738 	hermon_mrhdl_t		mr;
739 	hermon_bind_info_t	*bind;
740 	uint64_t		value;
741 	int			status;
742 	uint_t			shared_mtt;
743 
744 	/*
745 	 * Check the sleep flag.  Ensure that it is consistent with the
746 	 * current thread context (i.e. if we are currently in the interrupt
747 	 * context, then we shouldn't be attempting to sleep).
748 	 */
749 	if ((sleep == HERMON_SLEEP) &&
750 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
751 		status = IBT_INVALID_PARAM;
752 		return (status);
753 	}
754 
755 	/*
756 	 * Pull all the necessary information from the Hermon Memory Region
757 	 * handle.  This is necessary here because the resource for the
758 	 * MR handle is going to be freed up as part of the this
759 	 * deregistration
760 	 */
761 	mr	= *mrhdl;
762 	mutex_enter(&mr->mr_lock);
763 	mpt	= mr->mr_mptrsrcp;
764 	mtt	= mr->mr_mttrsrcp;
765 	mtt_refcnt = mr->mr_mttrefcntp;
766 	rsrc	= mr->mr_rsrcp;
767 	pd	= mr->mr_pdhdl;
768 	bind	= &mr->mr_bindinfo;
769 
770 	/*
771 	 * Check here if the memory region is really an FMR.  If so, this is a
772 	 * bad thing and we shouldn't be here.  Return failure.
773 	 */
774 	if (mr->mr_is_fmr) {
775 		mutex_exit(&mr->mr_lock);
776 		return (IBT_INVALID_PARAM);
777 	}
778 
779 	/*
780 	 * Check here to see if the memory region has already been partially
781 	 * deregistered as a result of the hermon_umap_umemlock_cb() callback.
782 	 * If so, then jump to the end and free the remaining resources.
783 	 */
784 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
785 		goto mrdereg_finish_cleanup;
786 	}
787 
788 	/*
789 	 * We must drop the "mr_lock" here to ensure that both SLEEP and
790 	 * NOSLEEP calls into the firmware work as expected.  Also, if two
791 	 * threads are attemping to access this MR (via de-register,
792 	 * re-register, or otherwise), then we allow the firmware to enforce
793 	 * the checking, that only one deregister is valid.
794 	 */
795 	mutex_exit(&mr->mr_lock);
796 
797 	/*
798 	 * Reclaim MPT entry from hardware (if necessary).  Since the
799 	 * hermon_mr_deregister() routine is used in the memory region
800 	 * reregistration process as well, it is possible that we will
801 	 * not always wish to reclaim ownership of the MPT.  Check the
802 	 * "level" arg and, if necessary, attempt to reclaim it.  If
803 	 * the ownership transfer fails for any reason, we check to see
804 	 * what command status was returned from the hardware.  The only
805 	 * "expected" error status is the one that indicates an attempt to
806 	 * deregister a memory region that has memory windows bound to it
807 	 */
808 	if (level >= HERMON_MR_DEREG_ALL) {
809 		if (mr->mr_mpt_type >= HERMON_MPT_DMPT) {
810 			status = hermon_cmn_ownership_cmd_post(state, HW2SW_MPT,
811 			    NULL, 0, mpt->hr_indx, sleep);
812 			if (status != HERMON_CMD_SUCCESS) {
813 				if (status == HERMON_CMD_REG_BOUND) {
814 					return (IBT_MR_IN_USE);
815 				} else {
816 					cmn_err(CE_CONT, "Hermon: HW2SW_MPT "
817 					    "command failed: %08x\n", status);
818 					if (status ==
819 					    HERMON_CMD_INVALID_STATUS) {
820 						hermon_fm_ereport(state,
821 						    HCA_SYS_ERR,
822 						    DDI_SERVICE_LOST);
823 					}
824 					return (IBT_INVALID_PARAM);
825 				}
826 			}
827 		}
828 	}
829 
830 	/*
831 	 * Re-grab the mr_lock here.  Since further access to the protected
832 	 * 'mr' structure is needed, and we would have returned previously for
833 	 * the multiple deregistration case, we can safely grab the lock here.
834 	 */
835 	mutex_enter(&mr->mr_lock);
836 
837 	/*
838 	 * If the memory had come from userland, then we do a lookup in the
839 	 * "userland resources database".  On success, we free the entry, call
840 	 * ddi_umem_unlock(), and continue the cleanup.  On failure (which is
841 	 * an indication that the umem_lockmemory() callback has called
842 	 * hermon_mr_deregister()), we call ddi_umem_unlock() and invalidate
843 	 * the "mr_umemcookie" field in the MR handle (this will be used
	 * later to detect that only partial cleanup still remains to be done
845 	 * on the MR handle).
846 	 */
847 	if (mr->mr_is_umem) {
848 		status = hermon_umap_db_find(state->hs_instance,
849 		    (uint64_t)(uintptr_t)mr->mr_umemcookie,
850 		    MLNX_UMAP_MRMEM_RSRC, &value, HERMON_UMAP_DB_REMOVE,
851 		    &umapdb);
852 		if (status == DDI_SUCCESS) {
853 			hermon_umap_db_free(umapdb);
854 			ddi_umem_unlock(mr->mr_umemcookie);
855 		} else {
856 			ddi_umem_unlock(mr->mr_umemcookie);
857 			mr->mr_umemcookie = NULL;
858 		}
859 	}
860 
861 	/*
862 	 * Decrement the MTT reference count.  Since the MTT resource
863 	 * may be shared between multiple memory regions (as a result
864 	 * of a "RegisterSharedMR" verb) it is important that we not
865 	 * free up or unbind resources prematurely.  If it's not shared (as
866 	 * indicated by the return status), then free the resource.
867 	 */
868 	shared_mtt = hermon_mtt_refcnt_dec(mtt_refcnt);
869 	if (!shared_mtt) {
870 		hermon_rsrc_free(state, &mtt_refcnt);
871 	}
872 
873 	/*
874 	 * Free up the MTT entries and unbind the memory.  Here, as above, we
875 	 * attempt to free these resources only if it is appropriate to do so.
876 	 */
877 	if (!shared_mtt) {
878 		if (level >= HERMON_MR_DEREG_NO_HW2SW_MPT) {
879 			hermon_mr_mem_unbind(state, bind);
880 		}
881 		hermon_rsrc_free(state, &mtt);
882 	}
883 
884 	/*
885 	 * If the MR handle has been invalidated, then drop the
886 	 * lock and return success.  Note: This only happens because
887 	 * the umem_lockmemory() callback has been triggered.  The
888 	 * cleanup here is partial, and further cleanup (in a
889 	 * subsequent hermon_mr_deregister() call) will be necessary.
890 	 */
891 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
892 		mutex_exit(&mr->mr_lock);
893 		return (DDI_SUCCESS);
894 	}
895 
896 mrdereg_finish_cleanup:
897 	mutex_exit(&mr->mr_lock);
898 
899 	/* Free the Hermon Memory Region handle */
900 	hermon_rsrc_free(state, &rsrc);
901 
902 	/* Free up the MPT entry resource */
903 	if (mpt != NULL)
904 		hermon_rsrc_free(state, &mpt);
905 
906 	/* Decrement the reference count on the protection domain (PD) */
907 	hermon_pd_refcnt_dec(pd);
908 
909 	/* Set the mrhdl pointer to NULL and return success */
910 	*mrhdl = NULL;
911 
912 	return (DDI_SUCCESS);
913 }
914 
915 /*
916  * hermon_mr_dealloc_fmr()
917  *    Context: Can be called from interrupt or base context.
918  */
919 /* ARGSUSED */
920 int
921 hermon_mr_dealloc_fmr(hermon_state_t *state, hermon_mrhdl_t *mrhdl)
922 {
923 	hermon_rsrc_t		*mpt, *mtt, *rsrc;
924 	hermon_pdhdl_t		pd;
925 	hermon_mrhdl_t		mr;
926 
927 	/*
928 	 * Pull all the necessary information from the Hermon Memory Region
929 	 * handle.  This is necessary here because the resource for the
930 	 * MR handle is going to be freed up as part of the this
931 	 * deregistration
932 	 */
933 	mr	= *mrhdl;
934 	mutex_enter(&mr->mr_lock);
935 	mpt	= mr->mr_mptrsrcp;
936 	mtt	= mr->mr_mttrsrcp;
937 	rsrc	= mr->mr_rsrcp;
938 	pd	= mr->mr_pdhdl;
939 	mutex_exit(&mr->mr_lock);
940 
941 	/* Free the MTT entries */
942 	hermon_rsrc_free(state, &mtt);
943 
944 	/* Free the Hermon Memory Region handle */
945 	hermon_rsrc_free(state, &rsrc);
946 
947 	/* Free up the MPT entry resource */
948 	hermon_rsrc_free(state, &mpt);
949 
950 	/* Decrement the reference count on the protection domain (PD) */
951 	hermon_pd_refcnt_dec(pd);
952 
953 	/* Set the mrhdl pointer to NULL and return success */
954 	*mrhdl = NULL;
955 
956 	return (DDI_SUCCESS);
957 }
958 
959 /*
960  * hermon_mr_invalidate_fmr()
961  *    Context: Can be called from interrupt or base context.
962  */
963 /* ARGSUSED */
964 int
965 hermon_mr_invalidate_fmr(hermon_state_t *state, hermon_mrhdl_t mr)
966 {
967 	hermon_rsrc_t		*mpt;
968 	uint64_t		*mpt_table;
969 
970 	mutex_enter(&mr->mr_lock);
971 	mpt = mr->mr_mptrsrcp;
972 	mpt_table = (uint64_t *)mpt->hr_addr;
973 
974 	/* Write MPT status to SW bit */
975 	ddi_put8(mpt->hr_acchdl, (uint8_t *)&mpt_table[0], 0xF);
976 
977 	/* invalidate mem key value */
978 	ddi_put32(mpt->hr_acchdl, (uint32_t *)&mpt_table[1], 0);
979 
980 	/* invalidate lkey value */
981 	ddi_put32(mpt->hr_acchdl, (uint32_t *)&mpt_table[4], 0);
982 
983 	/* Write MPT status to HW bit */
984 	ddi_put8(mpt->hr_acchdl, (uint8_t *)&mpt_table[0], 0x0);
985 
986 	mutex_exit(&mr->mr_lock);
987 
988 	return (DDI_SUCCESS);
989 }
990 
991 /*
992  * hermon_mr_deregister_fmr()
993  *    Context: Can be called from interrupt or base context.
994  */
995 /* ARGSUSED */
996 int
997 hermon_mr_deregister_fmr(hermon_state_t *state, hermon_mrhdl_t mr)
998 {
999 	hermon_rsrc_t		*mpt;
1000 	uint64_t		*mpt_table;
1001 
1002 	mutex_enter(&mr->mr_lock);
1003 	mpt = mr->mr_mptrsrcp;
1004 	mpt_table = (uint64_t *)mpt->hr_addr;
1005 
1006 	/* Write MPT status to SW bit */
1007 	ddi_put8(mpt->hr_acchdl, (uint8_t *)&mpt_table[0], 0xF);
1008 	mutex_exit(&mr->mr_lock);
1009 
1010 	return (DDI_SUCCESS);
1011 }
1012 
1013 
1014 /*
1015  * hermon_mr_query()
1016  *    Context: Can be called from interrupt or base context.
1017  */
1018 /* ARGSUSED */
1019 int
1020 hermon_mr_query(hermon_state_t *state, hermon_mrhdl_t mr,
1021     ibt_mr_query_attr_t *attr)
1022 {
1023 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr))
1024 
1025 	mutex_enter(&mr->mr_lock);
1026 
1027 	/*
1028 	 * Check here to see if the memory region has already been partially
1029 	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
1030 	 * If so, this is an error, return failure.
1031 	 */
1032 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
1033 		mutex_exit(&mr->mr_lock);
1034 		return (IBT_MR_HDL_INVALID);
1035 	}
1036 
1037 	/* Fill in the queried attributes */
1038 	attr->mr_attr_flags = mr->mr_accflag;
1039 	attr->mr_pd	= (ibt_pd_hdl_t)mr->mr_pdhdl;
1040 
1041 	/* Fill in the "local" attributes */
1042 	attr->mr_lkey = (ibt_lkey_t)mr->mr_lkey;
1043 	attr->mr_lbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
1044 	attr->mr_lbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
1045 
1046 	/*
1047 	 * Fill in the "remote" attributes (if necessary).  Note: the
1048 	 * remote attributes are only valid if the memory region has one
1049 	 * or more of the remote access flags set.
1050 	 */
1051 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1052 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1053 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1054 		attr->mr_rkey = (ibt_rkey_t)mr->mr_rkey;
1055 		attr->mr_rbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
1056 		attr->mr_rbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
1057 	}
1058 
1059 	/*
1060 	 * If region is mapped for streaming (i.e. noncoherent), then set sync
1061 	 * is required
1062 	 */
1063 	attr->mr_sync_required = (mr->mr_bindinfo.bi_flags &
1064 	    IBT_MR_NONCOHERENT) ? B_TRUE : B_FALSE;
1065 
1066 	mutex_exit(&mr->mr_lock);
1067 	return (DDI_SUCCESS);
1068 }
1069 
1070 
1071 /*
1072  * hermon_mr_reregister()
1073  *    Context: Can be called from interrupt or base context.
1074  */
1075 int
1076 hermon_mr_reregister(hermon_state_t *state, hermon_mrhdl_t mr,
1077     hermon_pdhdl_t pd, ibt_mr_attr_t *mr_attr, hermon_mrhdl_t *mrhdl_new,
1078     hermon_mr_options_t *op)
1079 {
1080 	hermon_bind_info_t	bind;
1081 	int			status;
1082 
1083 	/*
1084 	 * Fill in the "bind" struct.  This struct provides the majority
1085 	 * of the information that will be used to distinguish between an
1086 	 * "addr" binding (as is the case here) and a "buf" binding (see
1087 	 * below).  The "bind" struct is later passed to hermon_mr_mem_bind()
1088 	 * which does most of the "heavy lifting" for the Hermon memory
1089 	 * registration (and reregistration) routines.
1090 	 */
1091 	bind.bi_type  = HERMON_BINDHDL_VADDR;
1092 	bind.bi_addr  = mr_attr->mr_vaddr;
1093 	bind.bi_len   = mr_attr->mr_len;
1094 	bind.bi_as    = mr_attr->mr_as;
1095 	bind.bi_flags = mr_attr->mr_flags;
1096 	status = hermon_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
1097 	return (status);
1098 }
1099 
1100 
1101 /*
1102  * hermon_mr_reregister_buf()
1103  *    Context: Can be called from interrupt or base context.
1104  */
1105 int
1106 hermon_mr_reregister_buf(hermon_state_t *state, hermon_mrhdl_t mr,
1107     hermon_pdhdl_t pd, ibt_smr_attr_t *mr_attr, struct buf *buf,
1108     hermon_mrhdl_t *mrhdl_new, hermon_mr_options_t *op)
1109 {
1110 	hermon_bind_info_t	bind;
1111 	int			status;
1112 
1113 	/*
1114 	 * Fill in the "bind" struct.  This struct provides the majority
1115 	 * of the information that will be used to distinguish between an
1116 	 * "addr" binding (see above) and a "buf" binding (as is the case
1117 	 * here).  The "bind" struct is later passed to hermon_mr_mem_bind()
1118 	 * which does most of the "heavy lifting" for the Hermon memory
1119 	 * registration routines.  Note: We have chosen to provide
1120 	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
1121 	 * not set).  It is not critical what value we choose here as it need
1122 	 * only be unique for the given RKey (which will happen by default),
1123 	 * so the choice here is somewhat arbitrary.
1124 	 */
1125 	bind.bi_type  = HERMON_BINDHDL_BUF;
1126 	bind.bi_buf   = buf;
1127 	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
1128 		bind.bi_addr  = mr_attr->mr_vaddr;
1129 	} else {
1130 		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
1131 	}
1132 	bind.bi_len   = (uint64_t)buf->b_bcount;
1133 	bind.bi_flags = mr_attr->mr_flags;
1134 	bind.bi_as    = NULL;
1135 	status = hermon_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
1136 	return (status);
1137 }
1138 
1139 
1140 /*
1141  * hermon_mr_sync()
1142  *    Context: Can be called from interrupt or base context.
1143  */
1144 /* ARGSUSED */
1145 int
1146 hermon_mr_sync(hermon_state_t *state, ibt_mr_sync_t *mr_segs, size_t num_segs)
1147 {
1148 	hermon_mrhdl_t		mrhdl;
1149 	uint64_t		seg_vaddr, seg_len, seg_end;
1150 	uint64_t		mr_start, mr_end;
1151 	uint_t			type;
1152 	int			status, i;
1153 
1154 	/* Process each of the ibt_mr_sync_t's */
1155 	for (i = 0; i < num_segs; i++) {
1156 		mrhdl = (hermon_mrhdl_t)mr_segs[i].ms_handle;
1157 
1158 		/* Check for valid memory region handle */
1159 		if (mrhdl == NULL) {
1160 			status = IBT_MR_HDL_INVALID;
1161 			goto mrsync_fail;
1162 		}
1163 
1164 		mutex_enter(&mrhdl->mr_lock);
1165 
1166 		/*
1167 		 * Check here to see if the memory region has already been
1168 		 * partially deregistered as a result of a
1169 		 * hermon_umap_umemlock_cb() callback.  If so, this is an
1170 		 * error, return failure.
1171 		 */
1172 		if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
1173 			mutex_exit(&mrhdl->mr_lock);
1174 			status = IBT_MR_HDL_INVALID;
1175 			goto mrsync_fail;
1176 		}
1177 
1178 		/* Check for valid bounds on sync request */
1179 		seg_vaddr = mr_segs[i].ms_vaddr;
1180 		seg_len	  = mr_segs[i].ms_len;
1181 		seg_end	  = seg_vaddr + seg_len - 1;
1182 		mr_start  = mrhdl->mr_bindinfo.bi_addr;
1183 		mr_end	  = mr_start + mrhdl->mr_bindinfo.bi_len - 1;
1184 		if ((seg_vaddr < mr_start) || (seg_vaddr > mr_end)) {
1185 			mutex_exit(&mrhdl->mr_lock);
1186 			status = IBT_MR_VA_INVALID;
1187 			goto mrsync_fail;
1188 		}
1189 		if ((seg_end < mr_start) || (seg_end > mr_end)) {
1190 			mutex_exit(&mrhdl->mr_lock);
1191 			status = IBT_MR_LEN_INVALID;
1192 			goto mrsync_fail;
1193 		}
1194 
1195 		/* Determine what type (i.e. direction) for sync */
1196 		if (mr_segs[i].ms_flags & IBT_SYNC_READ) {
1197 			type = DDI_DMA_SYNC_FORDEV;
1198 		} else if (mr_segs[i].ms_flags & IBT_SYNC_WRITE) {
1199 			type = DDI_DMA_SYNC_FORCPU;
1200 		} else {
1201 			mutex_exit(&mrhdl->mr_lock);
1202 			status = IBT_INVALID_PARAM;
1203 			goto mrsync_fail;
1204 		}
1205 
1206 		(void) ddi_dma_sync(mrhdl->mr_bindinfo.bi_dmahdl,
1207 		    (off_t)(seg_vaddr - mr_start), (size_t)seg_len, type);
1208 
1209 		mutex_exit(&mrhdl->mr_lock);
1210 	}
1211 
1212 	return (DDI_SUCCESS);
1213 
1214 mrsync_fail:
1215 	return (status);
1216 }
1217 
1218 
1219 /*
1220  * hermon_mw_alloc()
1221  *    Context: Can be called from interrupt or base context.
1222  */
1223 int
1224 hermon_mw_alloc(hermon_state_t *state, hermon_pdhdl_t pd, ibt_mw_flags_t flags,
1225     hermon_mwhdl_t *mwhdl)
1226 {
1227 	hermon_rsrc_t		*mpt, *rsrc;
1228 	hermon_hw_dmpt_t		mpt_entry;
1229 	hermon_mwhdl_t		mw;
1230 	uint_t			sleep;
1231 	int			status;
1232 
1233 	if (state != NULL)	/* XXX - bogus test that is always TRUE */
1234 		return (IBT_INSUFF_RESOURCE);
1235 
1236 	/*
1237 	 * Check the sleep flag.  Ensure that it is consistent with the
1238 	 * current thread context (i.e. if we are currently in the interrupt
1239 	 * context, then we shouldn't be attempting to sleep).
1240 	 */
1241 	sleep = (flags & IBT_MW_NOSLEEP) ? HERMON_NOSLEEP : HERMON_SLEEP;
1242 	if ((sleep == HERMON_SLEEP) &&
1243 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1244 		status = IBT_INVALID_PARAM;
1245 		goto mwalloc_fail;
1246 	}
1247 
1248 	/* Increment the reference count on the protection domain (PD) */
1249 	hermon_pd_refcnt_inc(pd);
1250 
1251 	/*
1252 	 * Allocate an MPT entry (for use as a memory window).  Since the
1253 	 * Hermon hardware uses the MPT entry for memory regions and for
1254 	 * memory windows, we will fill in this MPT with all the necessary
1255 	 * parameters for the memory window.  And then (just as we do for
1256 	 * memory regions) ownership will be passed to the hardware in the
1257 	 * final step below.  If we fail here, we must undo the protection
1258 	 * domain reference count.
1259 	 */
1260 	status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
1261 	if (status != DDI_SUCCESS) {
1262 		status = IBT_INSUFF_RESOURCE;
1263 		goto mwalloc_fail1;
1264 	}
1265 
1266 	/*
1267 	 * Allocate the software structure for tracking the memory window (i.e.
1268 	 * the Hermon Memory Window handle).  Note: This is actually the same
1269 	 * software structure used for tracking memory regions, but since many
1270 	 * of the same properties are needed, only a single structure is
1271 	 * necessary.  If we fail here, we must undo the protection domain
1272 	 * reference count and the previous resource allocation.
1273 	 */
1274 	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
1275 	if (status != DDI_SUCCESS) {
1276 		status = IBT_INSUFF_RESOURCE;
1277 		goto mwalloc_fail2;
1278 	}
1279 	mw = (hermon_mwhdl_t)rsrc->hr_addr;
1280 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))
1281 
1282 	/*
1283 	 * Calculate an "unbound" RKey from MPT index.  In much the same way
1284 	 * as we do for memory regions (above), this key is constructed from
1285 	 * a "constrained" (which depends on the MPT index) and an
1286 	 * "unconstrained" portion (which may be arbitrarily chosen).
1287 	 */
1288 	mw->mr_rkey = hermon_mr_keycalc(mpt->hr_indx);
1289 
1290 	/*
1291 	 * Fill in the MPT entry.  This is the final step before passing
1292 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
1293 	 * the information collected/calculated above to fill in the
1294 	 * requisite portions of the MPT.  Note: fewer entries in the MPT
1295 	 * entry are necessary to allocate a memory window.
1296 	 */
1297 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
1298 	mpt_entry.reg_win	= HERMON_MPT_IS_WINDOW;
1299 	mpt_entry.mem_key	= mw->mr_rkey;
1300 	mpt_entry.pd		= pd->pd_pdnum;
1301 	mpt_entry.lr		= 1;
1302 
1303 	/*
1304 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
1305 	 * the entry to the hardware.  Note: in general, this operation
1306 	 * shouldn't fail.  But if it does, we have to undo everything we've
1307 	 * done above before returning error.
1308 	 */
1309 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1310 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
1311 	if (status != HERMON_CMD_SUCCESS) {
1312 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
1313 		    status);
1314 		if (status == HERMON_CMD_INVALID_STATUS) {
1315 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1316 		}
1317 		status = ibc_get_ci_failure(0);
1318 		goto mwalloc_fail3;
1319 	}
1320 
1321 	/*
1322 	 * Fill in the rest of the Hermon Memory Window handle.  Having
1323 	 * successfully transferred ownership of the MPT, we can update the
1324 	 * following fields for use in further operations on the MW.
1325 	 */
1326 	mw->mr_mptrsrcp	= mpt;
1327 	mw->mr_pdhdl	= pd;
1328 	mw->mr_rsrcp	= rsrc;
1329 	mw->mr_rkey	= hermon_mr_key_swap(mw->mr_rkey);
1330 	*mwhdl = mw;
1331 
1332 	return (DDI_SUCCESS);
1333 
1334 mwalloc_fail3:
1335 	hermon_rsrc_free(state, &rsrc);
1336 mwalloc_fail2:
1337 	hermon_rsrc_free(state, &mpt);
1338 mwalloc_fail1:
1339 	hermon_pd_refcnt_dec(pd);
1340 mwalloc_fail:
1341 	return (status);
1342 }
1343 
1344 
1345 /*
1346  * hermon_mw_free()
1347  *    Context: Can be called from interrupt or base context.
1348  */
1349 int
1350 hermon_mw_free(hermon_state_t *state, hermon_mwhdl_t *mwhdl, uint_t sleep)
1351 {
1352 	hermon_rsrc_t		*mpt, *rsrc;
1353 	hermon_mwhdl_t		mw;
1354 	int			status;
1355 	hermon_pdhdl_t		pd;
1356 
1357 	/*
1358 	 * Check the sleep flag.  Ensure that it is consistent with the
1359 	 * current thread context (i.e. if we are currently in the interrupt
1360 	 * context, then we shouldn't be attempting to sleep).
1361 	 */
1362 	if ((sleep == HERMON_SLEEP) &&
1363 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1364 		status = IBT_INVALID_PARAM;
1365 		return (status);
1366 	}
1367 
1368 	/*
1369 	 * Pull all the necessary information from the Hermon Memory Window
1370 	 * handle.  This is necessary here because the resource for the
1371 	 * MW handle is going to be freed up as part of the this operation.
1372 	 */
1373 	mw	= *mwhdl;
1374 	mutex_enter(&mw->mr_lock);
1375 	mpt	= mw->mr_mptrsrcp;
1376 	rsrc	= mw->mr_rsrcp;
1377 	pd	= mw->mr_pdhdl;
1378 	mutex_exit(&mw->mr_lock);
1379 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))
1380 
1381 	/*
1382 	 * Reclaim the MPT entry from hardware.  Note: in general, it is
1383 	 * unexpected for this operation to return an error.
1384 	 */
1385 	status = hermon_cmn_ownership_cmd_post(state, HW2SW_MPT, NULL,
1386 	    0, mpt->hr_indx, sleep);
1387 	if (status != HERMON_CMD_SUCCESS) {
1388 		cmn_err(CE_CONT, "Hermon: HW2SW_MPT command failed: %08x\n",
1389 		    status);
1390 		if (status == HERMON_CMD_INVALID_STATUS) {
1391 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1392 		}
1393 		return (ibc_get_ci_failure(0));
1394 	}
1395 
1396 	/* Free the Hermon Memory Window handle */
1397 	hermon_rsrc_free(state, &rsrc);
1398 
1399 	/* Free up the MPT entry resource */
1400 	hermon_rsrc_free(state, &mpt);
1401 
1402 	/* Decrement the reference count on the protection domain (PD) */
1403 	hermon_pd_refcnt_dec(pd);
1404 
1405 	/* Set the mwhdl pointer to NULL and return success */
1406 	*mwhdl = NULL;
1407 
1408 	return (DDI_SUCCESS);
1409 }
1410 
1411 
1412 /*
1413  * hermon_mr_keycalc()
1414  *    Context: Can be called from interrupt or base context.
1415  *    NOTE:  Produces a key in the form of
1416  *		KKKKKKKK IIIIIIII IIIIIIII IIIIIIIII
1417  *    where K == the arbitrary bits and I == the index
1418  */
1419 uint32_t
1420 hermon_mr_keycalc(uint32_t indx)
1421 {
1422 	uint32_t tmp_key, tmp_indx;
1423 
1424 	/*
1425 	 * Generate a simple key from counter.  Note:  We increment this
1426 	 * static variable _intentionally_ without any kind of mutex around
1427 	 * it.  First, single-threading all operations through a single lock
1428 	 * would be a bad idea (from a performance point-of-view).  Second,
1429 	 * the upper "unconstrained" bits don't really have to be unique
1430 	 * because the lower bits are guaranteed to be (although we do make a
1431 	 * best effort to ensure that they are).  Third, the window for the
1432 	 * race (where both threads read and update the counter at the same
1433 	 * time) is incredibly small.
1434 	 * And, lastly, we'd like to make this into a "random" key
1435 	 */
1436 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(hermon_memkey_cnt))
1437 	tmp_key = (hermon_memkey_cnt++) << HERMON_MEMKEY_SHIFT;
1438 	tmp_indx = indx & 0xffffff;
1439 	return (tmp_key | tmp_indx);
1440 }
1441 
1442 
1443 /*
1444  * hermon_mr_key_swap()
1445  *    Context: Can be called from interrupt or base context.
1446  *    NOTE:  Produces a key in the form of
1447  *		IIIIIIII IIIIIIII IIIIIIIII KKKKKKKK
1448  *    where K == the arbitrary bits and I == the index
1449  */
1450 uint32_t
1451 hermon_mr_key_swap(uint32_t indx)
1452 {
1453 	/*
1454 	 * The memory key format to pass down to the hardware is
1455 	 * (key[7:0],index[23:0]), which defines the index to the
1456 	 * hardware resource. When the driver passes this as a memory
1457 	 * key, (i.e. to retrieve a resource) the format is
1458 	 * (index[23:0],key[7:0]).
1459 	 */
1460 	return (((indx >> 24) & 0x000000ff) | ((indx << 8) & 0xffffff00));
1461 }
1462 
1463 /*
1464  * hermon_mr_common_reg()
1465  *    Context: Can be called from interrupt or base context.
1466  */
1467 static int
1468 hermon_mr_common_reg(hermon_state_t *state, hermon_pdhdl_t pd,
1469     hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl, hermon_mr_options_t *op,
1470     hermon_mpt_rsrc_type_t mpt_type)
1471 {
1472 	hermon_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
1473 	hermon_umap_db_entry_t	*umapdb;
1474 	hermon_sw_refcnt_t	*swrc_tmp;
1475 	hermon_hw_dmpt_t	mpt_entry;
1476 	hermon_mrhdl_t		mr;
1477 	ibt_mr_flags_t		flags;
1478 	hermon_bind_info_t	*bh;
1479 	ddi_dma_handle_t	bind_dmahdl;
1480 	ddi_umem_cookie_t	umem_cookie;
1481 	size_t			umem_len;
1482 	caddr_t			umem_addr;
1483 	uint64_t		mtt_addr, max_sz;
1484 	uint_t			sleep, mtt_pgsize_bits, bind_type, mr_is_umem;
1485 	int			status, umem_flags, bind_override_addr;
1486 
1487 	/*
1488 	 * Check the "options" flag.  Currently this flag tells the driver
1489 	 * whether or not the region should be bound normally (i.e. with
1490 	 * entries written into the PCI IOMMU), whether it should be
1491 	 * registered to bypass the IOMMU, and whether or not the resulting
1492 	 * address should be "zero-based" (to aid the alignment restrictions
1493 	 * for QPs).
1494 	 */
1495 	if (op == NULL) {
1496 		bind_type   = HERMON_BINDMEM_NORMAL;
1497 		bind_dmahdl = NULL;
1498 		bind_override_addr = 0;
1499 	} else {
1500 		bind_type	   = op->mro_bind_type;
1501 		bind_dmahdl	   = op->mro_bind_dmahdl;
1502 		bind_override_addr = op->mro_bind_override_addr;
1503 	}
1504 
1505 	/* check what kind of mpt to use */
1506 
1507 	/* Extract the flags field from the hermon_bind_info_t */
1508 	flags = bind->bi_flags;
1509 
1510 	/*
1511 	 * Check for invalid length.  Check is the length is zero or if the
1512 	 * length is larger than the maximum configured value.  Return error
1513 	 * if it is.
1514 	 */
1515 	max_sz = ((uint64_t)1 << state->hs_cfg_profile->cp_log_max_mrw_sz);
1516 	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
1517 		status = IBT_MR_LEN_INVALID;
1518 		goto mrcommon_fail;
1519 	}
1520 
1521 	/*
1522 	 * Check the sleep flag.  Ensure that it is consistent with the
1523 	 * current thread context (i.e. if we are currently in the interrupt
1524 	 * context, then we shouldn't be attempting to sleep).
1525 	 */
1526 	sleep = (flags & IBT_MR_NOSLEEP) ? HERMON_NOSLEEP: HERMON_SLEEP;
1527 	if ((sleep == HERMON_SLEEP) &&
1528 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1529 		status = IBT_INVALID_PARAM;
1530 		goto mrcommon_fail;
1531 	}
1532 
1533 	/* Increment the reference count on the protection domain (PD) */
1534 	hermon_pd_refcnt_inc(pd);
1535 
1536 	/*
1537 	 * Allocate an MPT entry.  This will be filled in with all the
1538 	 * necessary parameters to define the memory region.  And then
1539 	 * ownership will be passed to the hardware in the final step
1540 	 * below.  If we fail here, we must undo the protection domain
1541 	 * reference count.
1542 	 */
1543 	if (mpt_type == HERMON_MPT_DMPT) {
1544 		status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
1545 		if (status != DDI_SUCCESS) {
1546 			status = IBT_INSUFF_RESOURCE;
1547 			goto mrcommon_fail1;
1548 		}
1549 	} else {
1550 		mpt = NULL;
1551 	}
1552 
1553 	/*
1554 	 * Allocate the software structure for tracking the memory region (i.e.
1555 	 * the Hermon Memory Region handle).  If we fail here, we must undo
1556 	 * the protection domain reference count and the previous resource
1557 	 * allocation.
1558 	 */
1559 	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
1560 	if (status != DDI_SUCCESS) {
1561 		status = IBT_INSUFF_RESOURCE;
1562 		goto mrcommon_fail2;
1563 	}
1564 	mr = (hermon_mrhdl_t)rsrc->hr_addr;
1565 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
1566 
1567 	/*
1568 	 * Setup and validate the memory region access flags.  This means
1569 	 * translating the IBTF's enable flags into the access flags that
1570 	 * will be used in later operations.
1571 	 */
1572 	mr->mr_accflag = 0;
1573 	if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1574 		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
1575 	if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1576 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
1577 	if (flags & IBT_MR_ENABLE_REMOTE_READ)
1578 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
1579 	if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1580 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
1581 	if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1582 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
1583 
1584 	/*
1585 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
1586 	 * from a certain number of "constrained" bits (the least significant
1587 	 * bits) and some number of "unconstrained" bits.  The constrained
1588 	 * bits must be set to the index of the entry in the MPT table, but
1589 	 * the unconstrained bits can be set to any value we wish.  Note:
1590 	 * if no remote access is required, then the RKey value is not filled
1591 	 * in.  Otherwise both Rkey and LKey are given the same value.
1592 	 */
1593 	if (mpt)
1594 		mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
1595 
1596 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1597 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1598 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1599 		mr->mr_rkey = mr->mr_lkey;
1600 	}
1601 
1602 	/*
1603 	 * Determine if the memory is from userland and pin the pages
1604 	 * with umem_lockmemory() if necessary.
1605 	 * Then, if this is userland memory, allocate an entry in the
1606 	 * "userland resources database".  This will later be added to
1607 	 * the database (after all further memory registration operations are
1608 	 * successful).  If we fail here, we must undo the reference counts
1609 	 * and the previous resource allocations.
1610 	 */
1611 	mr_is_umem = (((bind->bi_as != NULL) && (bind->bi_as != &kas)) ? 1 : 0);
1612 	if (mr_is_umem) {
1613 		umem_len   = ptob(btopr(bind->bi_len +
1614 		    ((uintptr_t)bind->bi_addr & PAGEOFFSET)));
1615 		umem_addr  = (caddr_t)((uintptr_t)bind->bi_addr & ~PAGEOFFSET);
1616 		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
1617 		    DDI_UMEMLOCK_LONGTERM);
1618 		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
1619 		    &umem_cookie, &hermon_umem_cbops, curproc);
1620 		if (status != 0) {
1621 			status = IBT_INSUFF_RESOURCE;
1622 			goto mrcommon_fail3;
1623 		}
1624 
1625 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1626 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1627 
1628 		bind->bi_buf = ddi_umem_iosetup(umem_cookie, 0, umem_len,
1629 		    B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
1630 		if (bind->bi_buf == NULL) {
1631 			status = IBT_INSUFF_RESOURCE;
1632 			goto mrcommon_fail3;
1633 		}
1634 		bind->bi_type = HERMON_BINDHDL_UBUF;
1635 		bind->bi_buf->b_flags |= B_READ;
1636 
1637 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1638 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1639 
1640 		umapdb = hermon_umap_db_alloc(state->hs_instance,
1641 		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
1642 		    (uint64_t)(uintptr_t)rsrc);
1643 		if (umapdb == NULL) {
1644 			status = IBT_INSUFF_RESOURCE;
1645 			goto mrcommon_fail4;
1646 		}
1647 	}
1648 
1649 	/*
1650 	 * Setup the bindinfo for the mtt bind call
1651 	 */
1652 	bh = &mr->mr_bindinfo;
1653 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bh))
1654 	bcopy(bind, bh, sizeof (hermon_bind_info_t));
1655 	bh->bi_bypass = bind_type;
1656 	status = hermon_mr_mtt_bind(state, bh, bind_dmahdl, &mtt,
1657 	    &mtt_pgsize_bits, mpt != NULL);
1658 	if (status != DDI_SUCCESS) {
1659 		goto mrcommon_fail5;
1660 	}
1661 	mr->mr_logmttpgsz = mtt_pgsize_bits;
1662 
1663 	/*
1664 	 * Allocate MTT reference count (to track shared memory regions).
1665 	 * This reference count resource may never be used on the given
1666 	 * memory region, but if it is ever later registered as "shared"
1667 	 * memory region then this resource will be necessary.  If we fail
1668 	 * here, we do pretty much the same as above to clean up.
1669 	 */
1670 	status = hermon_rsrc_alloc(state, HERMON_REFCNT, 1, sleep,
1671 	    &mtt_refcnt);
1672 	if (status != DDI_SUCCESS) {
1673 		status = IBT_INSUFF_RESOURCE;
1674 		goto mrcommon_fail6;
1675 	}
1676 	mr->mr_mttrefcntp = mtt_refcnt;
1677 	swrc_tmp = (hermon_sw_refcnt_t *)mtt_refcnt->hr_addr;
1678 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_tmp))
1679 	HERMON_MTT_REFCNT_INIT(swrc_tmp);
1680 
1681 	mtt_addr = (mtt->hr_indx << HERMON_MTT_SIZE_SHIFT);
1682 
1683 	/*
1684 	 * Fill in the MPT entry.  This is the final step before passing
1685 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
1686 	 * the information collected/calculated above to fill in the
1687 	 * requisite portions of the MPT.  Do this ONLY for DMPTs.
1688 	 */
1689 	if (mpt == NULL)
1690 		goto no_passown;
1691 
1692 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
1693 
1694 	mpt_entry.status  = HERMON_MPT_SW_OWNERSHIP;
1695 	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1696 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1697 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1698 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
1699 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1700 	mpt_entry.lr	  = 1;
1701 	mpt_entry.phys_addr = 0;
1702 	mpt_entry.reg_win = HERMON_MPT_IS_REGION;
1703 
1704 	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
1705 	mpt_entry.mem_key	= mr->mr_lkey;
1706 	mpt_entry.pd		= pd->pd_pdnum;
1707 	mpt_entry.rem_acc_en = 0;
1708 	mpt_entry.fast_reg_en = 0;
1709 	mpt_entry.en_inval = 0;
1710 	mpt_entry.lkey = 0;
1711 	mpt_entry.win_cnt = 0;
1712 
1713 	if (bind_override_addr == 0) {
1714 		mpt_entry.start_addr = bh->bi_addr;
1715 	} else {
1716 		bh->bi_addr = bh->bi_addr & ((1 << mr->mr_logmttpgsz) - 1);
1717 		mpt_entry.start_addr = bh->bi_addr;
1718 	}
1719 	mpt_entry.reg_win_len	= bh->bi_len;
1720 
1721 	mpt_entry.mtt_addr_h = mtt_addr >> 32;  /* only 8 more bits */
1722 	mpt_entry.mtt_addr_l = mtt_addr >> 3;	/* only 29 bits */
1723 
1724 	/*
1725 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
1726 	 * the entry to the hardware if needed.  Note: in general, this
1727 	 * operation shouldn't fail.  But if it does, we have to undo
1728 	 * everything we've done above before returning error.
1729 	 *
1730 	 * For Hermon, this routine (which is common to the contexts) will only
1731 	 * set the ownership if needed - the process of passing the context
1732 	 * itself to HW will take care of setting up the MPT (based on type
1733 	 * and index).
1734 	 */
1735 
1736 	mpt_entry.bnd_qp = 0;	/* dMPT for a qp, check for window */
1737 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1738 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
1739 	if (status != HERMON_CMD_SUCCESS) {
1740 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
1741 		    status);
1742 		if (status == HERMON_CMD_INVALID_STATUS) {
1743 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1744 		}
1745 		status = ibc_get_ci_failure(0);
1746 		goto mrcommon_fail7;
1747 	}
1748 no_passown:
1749 
1750 	/*
1751 	 * Fill in the rest of the Hermon Memory Region handle.  Having
1752 	 * successfully transferred ownership of the MPT, we can update the
1753 	 * following fields for use in further operations on the MR.
1754 	 */
1755 	mr->mr_mttaddr	   = mtt_addr;
1756 
1757 	mr->mr_log2_pgsz   = (mr->mr_logmttpgsz - HERMON_PAGESHIFT);
1758 	mr->mr_mptrsrcp	   = mpt;
1759 	mr->mr_mttrsrcp	   = mtt;
1760 	mr->mr_pdhdl	   = pd;
1761 	mr->mr_rsrcp	   = rsrc;
1762 	mr->mr_is_umem	   = mr_is_umem;
1763 	mr->mr_is_fmr	   = 0;
1764 	mr->mr_umemcookie  = (mr_is_umem != 0) ? umem_cookie : NULL;
1765 	mr->mr_umem_cbfunc = NULL;
1766 	mr->mr_umem_cbarg1 = NULL;
1767 	mr->mr_umem_cbarg2 = NULL;
1768 	mr->mr_lkey	   = hermon_mr_key_swap(mr->mr_lkey);
1769 	mr->mr_rkey	   = hermon_mr_key_swap(mr->mr_rkey);
1770 	mr->mr_mpt_type	   = mpt_type;
1771 
1772 	/*
1773 	 * If this is userland memory, then we need to insert the previously
1774 	 * allocated entry into the "userland resources database".  This will
1775 	 * allow for later coordination between the hermon_umap_umemlock_cb()
1776 	 * callback and hermon_mr_deregister().
1777 	 */
1778 	if (mr_is_umem) {
1779 		hermon_umap_db_add(umapdb);
1780 	}
1781 
1782 	*mrhdl = mr;
1783 
1784 	return (DDI_SUCCESS);
1785 
1786 /*
1787  * The following is cleanup for all possible failure cases in this routine
1788  */
1789 mrcommon_fail7:
1790 	hermon_rsrc_free(state, &mtt_refcnt);
1791 mrcommon_fail6:
1792 	hermon_mr_mem_unbind(state, bh);
1793 mrcommon_fail5:
1794 	if (mr_is_umem) {
1795 		hermon_umap_db_free(umapdb);
1796 	}
1797 mrcommon_fail4:
1798 	if (mr_is_umem) {
1799 		/*
1800 		 * Free up the memory ddi_umem_iosetup() allocates
1801 		 * internally.
1802 		 */
1803 		if (bind->bi_type == HERMON_BINDHDL_UBUF) {
1804 			freerbuf(bind->bi_buf);
1805 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1806 			bind->bi_type = HERMON_BINDHDL_NONE;
1807 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1808 		}
1809 		ddi_umem_unlock(umem_cookie);
1810 	}
1811 mrcommon_fail3:
1812 	hermon_rsrc_free(state, &rsrc);
1813 mrcommon_fail2:
1814 	if (mpt != NULL)
1815 		hermon_rsrc_free(state, &mpt);
1816 mrcommon_fail1:
1817 	hermon_pd_refcnt_dec(pd);
1818 mrcommon_fail:
1819 	return (status);
1820 }
1821 
1822 /*
1823  * hermon_mr_mtt_bind()
1824  *    Context: Can be called from interrupt or base context.
1825  */
1826 int
1827 hermon_mr_mtt_bind(hermon_state_t *state, hermon_bind_info_t *bind,
1828     ddi_dma_handle_t bind_dmahdl, hermon_rsrc_t **mtt, uint_t *mtt_pgsize_bits,
1829     uint_t is_buffer)
1830 {
1831 	uint64_t		nummtt;
1832 	uint_t			sleep;
1833 	int			status;
1834 
1835 	/*
1836 	 * Check the sleep flag.  Ensure that it is consistent with the
1837 	 * current thread context (i.e. if we are currently in the interrupt
1838 	 * context, then we shouldn't be attempting to sleep).
1839 	 */
1840 	sleep = (bind->bi_flags & IBT_MR_NOSLEEP) ?
1841 	    HERMON_NOSLEEP : HERMON_SLEEP;
1842 	if ((sleep == HERMON_SLEEP) &&
1843 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1844 		status = IBT_INVALID_PARAM;
1845 		goto mrmttbind_fail;
1846 	}
1847 
1848 	/*
1849 	 * Bind the memory and determine the mapped addresses.  This is
1850 	 * the first of two routines that do all the "heavy lifting" for
1851 	 * the Hermon memory registration routines.  The hermon_mr_mem_bind()
1852 	 * routine takes the "bind" struct with all its fields filled
1853 	 * in and returns a list of DMA cookies (for the PCI mapped addresses
1854 	 * corresponding to the specified address region) which are used by
1855 	 * the hermon_mr_fast_mtt_write() routine below.  If we fail here, we
1856 	 * must undo all the previous resource allocation (and PD reference
1857 	 * count).
1858 	 */
1859 	status = hermon_mr_mem_bind(state, bind, bind_dmahdl, sleep, is_buffer);
1860 	if (status != DDI_SUCCESS) {
1861 		status = IBT_INSUFF_RESOURCE;
1862 		goto mrmttbind_fail;
1863 	}
1864 
1865 	/*
1866 	 * Determine number of pages spanned.  This routine uses the
1867 	 * information in the "bind" struct to determine the required
1868 	 * number of MTT entries needed (and returns the suggested page size -
1869 	 * as a "power-of-2" - for each MTT entry).
1870 	 */
1871 	nummtt = hermon_mr_nummtt_needed(state, bind, mtt_pgsize_bits);
1872 
1873 	/*
1874 	 * Allocate the MTT entries.  Use the calculations performed above to
1875 	 * allocate the required number of MTT entries. If we fail here, we
1876 	 * must not only undo all the previous resource allocation (and PD
1877 	 * reference count), but we must also unbind the memory.
1878 	 */
1879 	status = hermon_rsrc_alloc(state, HERMON_MTT, nummtt, sleep, mtt);
1880 	if (status != DDI_SUCCESS) {
1881 		status = IBT_INSUFF_RESOURCE;
1882 		goto mrmttbind_fail2;
1883 	}
1884 
1885 	/*
1886 	 * Write the mapped addresses into the MTT entries.  This is part two
1887 	 * of the "heavy lifting" routines that we talked about above.  Note:
1888 	 * we pass the suggested page size from the earlier operation here.
1889 	 * And if we fail here, we again do pretty much the same huge clean up.
1890 	 */
1891 	status = hermon_mr_fast_mtt_write(state, *mtt, bind, *mtt_pgsize_bits);
1892 	if (status != DDI_SUCCESS) {
1893 		/*
1894 		 * hermon_mr_fast_mtt_write() returns DDI_FAILURE
1895 		 * only if it detects a HW error during DMA.
1896 		 */
1897 		hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1898 		status = ibc_get_ci_failure(0);
1899 		goto mrmttbind_fail3;
1900 	}
1901 	return (DDI_SUCCESS);
1902 
1903 /*
1904  * The following is cleanup for all possible failure cases in this routine
1905  */
1906 mrmttbind_fail3:
1907 	hermon_rsrc_free(state, mtt);
1908 mrmttbind_fail2:
1909 	hermon_mr_mem_unbind(state, bind);
1910 mrmttbind_fail:
1911 	return (status);
1912 }
1913 
1914 
/*
 * hermon_mr_mtt_unbind()
 *    Context: Can be called from interrupt or base context.
 *
 *    Inverse of hermon_mr_mtt_bind(): unbinds the memory described by
 *    "bind" and frees the MTT entries in "mtt".  Always returns
 *    DDI_SUCCESS.
 */
int
hermon_mr_mtt_unbind(hermon_state_t *state, hermon_bind_info_t *bind,
    hermon_rsrc_t *mtt)
{
	/*
	 * Free up the MTT entries and unbind the memory.  Here, as above, we
	 * attempt to free these resources only if it is appropriate to do so.
	 */
	hermon_mr_mem_unbind(state, bind);
	hermon_rsrc_free(state, &mtt);

	return (DDI_SUCCESS);
}
1932 
1933 
/*
 * hermon_mr_common_rereg()
 *    Context: Can be called from interrupt or base context.
 *
 *    Common implementation behind the reregister-memory-region verbs.
 *    Regains software ownership of the region's MPT entry (HW2SW_MPT),
 *    applies any requested changes to the protection domain
 *    (IBT_MR_CHANGE_PD), access flags (IBT_MR_CHANGE_ACCESS), and/or
 *    address translation (IBT_MR_CHANGE_TRANSLATION, via
 *    hermon_mr_rereg_xlat_helper()), rewrites the MPT entry, and passes
 *    ownership back to the hardware (SW2HW_MPT).
 *
 *    On success, returns DDI_SUCCESS and sets "*mrhdl_new" to the same
 *    handle as "mr" (the handle is updated in place).  On most failure
 *    paths the region is deregistered entirely before an IBTF status
 *    code is returned (see the inline comments below for the level of
 *    cleanup chosen on each path).
 */
static int
hermon_mr_common_rereg(hermon_state_t *state, hermon_mrhdl_t mr,
    hermon_pdhdl_t pd, hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl_new,
    hermon_mr_options_t *op)
{
	hermon_rsrc_t		*mpt;
	ibt_mr_attr_flags_t	acc_flags_to_use;
	ibt_mr_flags_t		flags;
	hermon_pdhdl_t		pd_to_use;
	hermon_hw_dmpt_t	mpt_entry;
	uint64_t		mtt_addr_to_use, vaddr_to_use, len_to_use;
	uint_t			sleep, dereg_level;
	int			status;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))

	/*
	 * Check here to see if the memory region corresponds to a userland
	 * mapping.  Reregistration of userland memory regions is not
	 * currently supported.  Return failure.
	 */
	if (mr->mr_is_umem) {
		status = IBT_MR_HDL_INVALID;
		goto mrrereg_fail;
	}

	/* Held across all MPT/MTT manipulation below; released on all paths */
	mutex_enter(&mr->mr_lock);

	/* Pull MPT resource pointer from the Hermon Memory Region handle */
	mpt = mr->mr_mptrsrcp;

	/* Extract the flags field from the hermon_bind_info_t */
	flags = bind->bi_flags;

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (flags & IBT_MR_NOSLEEP) ? HERMON_NOSLEEP: HERMON_SLEEP;
	if ((sleep == HERMON_SLEEP) &&
	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
		mutex_exit(&mr->mr_lock);
		status = IBT_INVALID_PARAM;
		goto mrrereg_fail;
	}

	/*
	 * First step is to temporarily invalidate the MPT entry.  This
	 * regains ownership from the hardware, and gives us the opportunity
	 * to modify the entry.  Note: The HW2SW_MPT command returns the
	 * current MPT entry contents.  These are saved away here because
	 * they will be reused in a later step below.  If the region has
	 * bound memory windows that we fail returning an "in use" error code.
	 * Otherwise, this is an unexpected error and we deregister the
	 * memory region and return error.
	 *
	 * We use HERMON_CMD_NOSLEEP_SPIN here always because we must protect
	 * against holding the lock around this rereg call in all contexts.
	 */
	status = hermon_cmn_ownership_cmd_post(state, HW2SW_MPT, &mpt_entry,
	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, HERMON_CMD_NOSLEEP_SPIN);
	if (status != HERMON_CMD_SUCCESS) {
		mutex_exit(&mr->mr_lock);
		if (status == HERMON_CMD_REG_BOUND) {
			return (IBT_MR_IN_USE);
		} else {
			cmn_err(CE_CONT, "Hermon: HW2SW_MPT command failed: "
			    "%08x\n", status);
			if (status == HERMON_CMD_INVALID_STATUS) {
				hermon_fm_ereport(state, HCA_SYS_ERR,
				    HCA_ERR_SRV_LOST);
			}
			/*
			 * Call deregister and ensure that all current
			 * resources get freed up
			 */
			if (hermon_mr_deregister(state, &mr,
			    HERMON_MR_DEREG_ALL, sleep) != DDI_SUCCESS) {
				HERMON_WARNING(state, "failed to deregister "
				    "memory region");
			}
			return (ibc_get_ci_failure(0));
		}
	}

	/*
	 * If we're changing the protection domain, then validate the new one
	 */
	if (flags & IBT_MR_CHANGE_PD) {

		/* Check for valid PD handle pointer */
		if (pd == NULL) {
			mutex_exit(&mr->mr_lock);
			/*
			 * Call deregister and ensure that all current
			 * resources get properly freed up. Unnecessary
			 * here to attempt to regain software ownership
			 * of the MPT entry as that has already been
			 * done above.
			 */
			if (hermon_mr_deregister(state, &mr,
			    HERMON_MR_DEREG_NO_HW2SW_MPT, sleep) !=
			    DDI_SUCCESS) {
				HERMON_WARNING(state, "failed to deregister "
				    "memory region");
			}
			status = IBT_PD_HDL_INVALID;
			goto mrrereg_fail;
		}

		/* Use the new PD handle in all operations below */
		pd_to_use = pd;

	} else {
		/* Use the current PD handle in all operations below */
		pd_to_use = mr->mr_pdhdl;
	}

	/*
	 * If we're changing access permissions, then validate the new ones
	 */
	if (flags & IBT_MR_CHANGE_ACCESS) {
		/*
		 * Validate the access flags.  Both remote write and remote
		 * atomic require the local write flag to be set
		 */
		if (((flags & IBT_MR_ENABLE_REMOTE_WRITE) ||
		    (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)) &&
		    !(flags & IBT_MR_ENABLE_LOCAL_WRITE)) {
			mutex_exit(&mr->mr_lock);
			/*
			 * Call deregister and ensure that all current
			 * resources get properly freed up. Unnecessary
			 * here to attempt to regain software ownership
			 * of the MPT entry as that has already been
			 * done above.
			 */
			if (hermon_mr_deregister(state, &mr,
			    HERMON_MR_DEREG_NO_HW2SW_MPT, sleep) !=
			    DDI_SUCCESS) {
				HERMON_WARNING(state, "failed to deregister "
				    "memory region");
			}
			status = IBT_MR_ACCESS_REQ_INVALID;
			goto mrrereg_fail;
		}

		/*
		 * Setup and validate the memory region access flags.  This
		 * means translating the IBTF's enable flags into the access
		 * flags that will be used in later operations.
		 */
		acc_flags_to_use = 0;
		if (flags & IBT_MR_ENABLE_WINDOW_BIND)
			acc_flags_to_use |= IBT_MR_WINDOW_BIND;
		if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
			acc_flags_to_use |= IBT_MR_LOCAL_WRITE;
		if (flags & IBT_MR_ENABLE_REMOTE_READ)
			acc_flags_to_use |= IBT_MR_REMOTE_READ;
		if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
			acc_flags_to_use |= IBT_MR_REMOTE_WRITE;
		if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
			acc_flags_to_use |= IBT_MR_REMOTE_ATOMIC;

	} else {
		acc_flags_to_use = mr->mr_accflag;
	}

	/*
	 * If we're modifying the translation, then figure out whether
	 * we can reuse the current MTT resources.  This means calling
	 * hermon_mr_rereg_xlat_helper() which does most of the heavy lifting
	 * for the reregistration.  If the current memory region contains
	 * sufficient MTT entries for the new regions, then it will be
	 * reused and filled in.  Otherwise, new entries will be allocated,
	 * the old ones will be freed, and the new entries will be filled
	 * in.  Note:  If we're not modifying the translation, then we
	 * should already have all the information we need to update the MPT.
	 * Also note: If hermon_mr_rereg_xlat_helper() fails, it will return
	 * a "dereg_level" which is the level of cleanup that needs to be
	 * passed to hermon_mr_deregister() to finish the cleanup.
	 */
	if (flags & IBT_MR_CHANGE_TRANSLATION) {
		status = hermon_mr_rereg_xlat_helper(state, mr, bind, op,
		    &mtt_addr_to_use, sleep, &dereg_level);
		if (status != DDI_SUCCESS) {
			mutex_exit(&mr->mr_lock);
			/*
			 * Call deregister and ensure that all resources get
			 * properly freed up.
			 */
			if (hermon_mr_deregister(state, &mr, dereg_level,
			    sleep) != DDI_SUCCESS) {
				HERMON_WARNING(state, "failed to deregister "
				    "memory region");
			}
			goto mrrereg_fail;
		}
		/* helper updated mr_bindinfo; pick up the new vaddr/len */
		vaddr_to_use = mr->mr_bindinfo.bi_addr;
		len_to_use   = mr->mr_bindinfo.bi_len;
	} else {
		mtt_addr_to_use = mr->mr_mttaddr;
		vaddr_to_use = mr->mr_bindinfo.bi_addr;
		len_to_use   = mr->mr_bindinfo.bi_len;
	}

	/*
	 * Calculate new keys (Lkey, Rkey) from MPT index.  Just like they were
	 * when the region was first registered, each key is formed from
	 * "constrained" bits and "unconstrained" bits.  Note:  If no remote
	 * access is required, then the RKey value is not filled in.  Otherwise
	 * both Rkey and LKey are given the same value.
	 */
	mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
	if ((acc_flags_to_use & IBT_MR_REMOTE_READ) ||
	    (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ||
	    (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	} else
		mr->mr_rkey = 0;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Hermon hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));

	mpt_entry.status  = HERMON_MPT_SW_OWNERSHIP;
	mpt_entry.en_bind = (acc_flags_to_use & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (acc_flags_to_use & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (acc_flags_to_use & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (acc_flags_to_use & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.lr	  = 1;	/* local read is always enabled */
	mpt_entry.phys_addr = 0;
	mpt_entry.reg_win = HERMON_MPT_IS_REGION;

	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
	mpt_entry.mem_key	= mr->mr_lkey;
	mpt_entry.pd		= pd_to_use->pd_pdnum;

	mpt_entry.start_addr	= vaddr_to_use;
	mpt_entry.reg_win_len	= len_to_use;
	/* MTT address is split across two MPT fields (high/low) */
	mpt_entry.mtt_addr_h = mtt_addr_to_use >> 32;
	mpt_entry.mtt_addr_l = mtt_addr_to_use >> 3;

	/*
	 * Write the updated MPT entry to hardware
	 *
	 * We use HERMON_CMD_NOSLEEP_SPIN here always because we must protect
	 * against holding the lock around this rereg call in all contexts.
	 */
	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, HERMON_CMD_NOSLEEP_SPIN);
	if (status != HERMON_CMD_SUCCESS) {
		mutex_exit(&mr->mr_lock);
		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
		    status);
		if (status == HERMON_CMD_INVALID_STATUS) {
			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
		}
		/*
		 * Call deregister and ensure that all current resources get
		 * properly freed up. Unnecessary here to attempt to regain
		 * software ownership of the MPT entry as that has already
		 * been done above.
		 */
		if (hermon_mr_deregister(state, &mr,
		    HERMON_MR_DEREG_NO_HW2SW_MPT, sleep) != DDI_SUCCESS) {
			HERMON_WARNING(state, "failed to deregister memory "
			    "region");
		}
		return (ibc_get_ci_failure(0));
	}

	/*
	 * If we're changing PD, then update their reference counts now.
	 * This means decrementing the reference count on the old PD and
	 * incrementing the reference count on the new PD.
	 */
	if (flags & IBT_MR_CHANGE_PD) {
		hermon_pd_refcnt_dec(mr->mr_pdhdl);
		hermon_pd_refcnt_inc(pd);
	}

	/*
	 * Update the contents of the Hermon Memory Region handle to reflect
	 * what has been changed.
	 */
	mr->mr_pdhdl	  = pd_to_use;
	mr->mr_accflag	  = acc_flags_to_use;
	mr->mr_is_umem	  = 0;
	mr->mr_is_fmr	  = 0;
	mr->mr_umemcookie = NULL;
	mr->mr_lkey	  = hermon_mr_key_swap(mr->mr_lkey);
	mr->mr_rkey	  = hermon_mr_key_swap(mr->mr_rkey);

	/* New MR handle is same as the old */
	*mrhdl_new = mr;
	mutex_exit(&mr->mr_lock);

	return (DDI_SUCCESS);

mrrereg_fail:
	return (status);
}
2247 
2248 
/*
 * hermon_mr_rereg_xlat_helper
 *    Context: Can be called from interrupt or base context.
 *    Note: This routine expects the "mr_lock" to be held when it
 *    is called.  Upon returning failure, this routine passes information
 *    about what "dereg_level" should be passed to hermon_mr_deregister().
 *
 *    Handles the IBT_MR_CHANGE_TRANSLATION portion of a reregister
 *    operation.  Rebinds the memory described by "bind" and either reuses
 *    the region's current MTT resources (when they are unshared and large
 *    enough for the new mapping) or allocates replacement MTT entries
 *    (and, if the old ones were shared, a new MTT reference count
 *    resource), then writes the mapped addresses into the MTT.  On
 *    success, the memory region handle ("mr") is updated in place and
 *    "*mtt_addr" is set to the device address of the MTT entries for use
 *    in the caller's updated MPT entry.
 */
static int
hermon_mr_rereg_xlat_helper(hermon_state_t *state, hermon_mrhdl_t mr,
    hermon_bind_info_t *bind, hermon_mr_options_t *op, uint64_t *mtt_addr,
    uint_t sleep, uint_t *dereg_level)
{
	hermon_rsrc_t		*mtt, *mtt_refcnt;
	hermon_sw_refcnt_t	*swrc_old, *swrc_new;
	ddi_dma_handle_t	dmahdl;
	uint64_t		nummtt_needed, nummtt_in_currrsrc, max_sz;
	uint_t			mtt_pgsize_bits, bind_type, reuse_dmahdl;
	int			status;

	ASSERT(MUTEX_HELD(&mr->mr_lock));

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the region should be bound normally (i.e. with
	 * entries written into the PCI IOMMU) or whether it should be
	 * registered to bypass the IOMMU.
	 */
	if (op == NULL) {
		bind_type = HERMON_BINDMEM_NORMAL;
	} else {
		bind_type = op->mro_bind_type;
	}

	/*
	 * Check for invalid length.  Check is the length is zero or if the
	 * length is larger than the maximum configured value.  Return error
	 * if it is.
	 */
	max_sz = ((uint64_t)1 << state->hs_cfg_profile->cp_log_max_mrw_sz);
	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
		/*
		 * Deregister will be called upon returning failure from this
		 * routine. This will ensure that all current resources get
		 * properly freed up. Unnecessary to attempt to regain
		 * software ownership of the MPT entry as that has already
		 * been done above (in hermon_mr_reregister())
		 */
		*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT;

		status = IBT_MR_LEN_INVALID;
		goto mrrereghelp_fail;
	}

	/*
	 * Determine the number of pages necessary for new region and the
	 * number of pages supported by the current MTT resources
	 */
	nummtt_needed = hermon_mr_nummtt_needed(state, bind, &mtt_pgsize_bits);
	nummtt_in_currrsrc = mr->mr_mttrsrcp->hr_len >> HERMON_MTT_SIZE_SHIFT;

	/*
	 * Depending on whether we have enough pages or not, the next step is
	 * to fill in a set of MTT entries that reflect the new mapping.  In
	 * the first case below, we already have enough entries.  This means
	 * we need to unbind the memory from the previous mapping, bind the
	 * memory for the new mapping, write the new MTT entries, and update
	 * the mr to reflect the changes.
	 * In the second case below, we do not have enough entries in the
	 * current mapping.  So, in this case, we need not only to unbind the
	 * current mapping, but we need to free up the MTT resources associated
	 * with that mapping.  After we've successfully done that, we continue
	 * by binding the new memory, allocating new MTT entries, writing the
	 * new MTT entries, and updating the mr to reflect the changes.
	 */

	/*
	 * If this region is being shared (i.e. MTT refcount != 1), then we
	 * can't reuse the current MTT resources regardless of their size.
	 * Instead we'll need to alloc new ones (below) just as if there
	 * hadn't been enough room in the current entries.
	 */
	swrc_old = (hermon_sw_refcnt_t *)mr->mr_mttrefcntp->hr_addr;
	if (HERMON_MTT_IS_NOT_SHARED(swrc_old) &&
	    (nummtt_needed <= nummtt_in_currrsrc)) {

		/*
		 * Unbind the old mapping for this memory region, but retain
		 * the ddi_dma_handle_t (if possible) for reuse in the bind
		 * operation below.  Note:  If original memory region was
		 * bound for IOMMU bypass and the new region can not use
		 * bypass, then a new DMA handle will be necessary.
		 */
		if (HERMON_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
			/* clear bi_free_dmahdl so unbind keeps the handle */
			mr->mr_bindinfo.bi_free_dmahdl = 0;
			hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
			dmahdl = mr->mr_bindinfo.bi_dmahdl;
			reuse_dmahdl = 1;
		} else {
			hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
			dmahdl = NULL;
			reuse_dmahdl = 0;
		}

		/*
		 * Bind the new memory and determine the mapped addresses.
		 * As described, this routine and hermon_mr_fast_mtt_write()
		 * do the majority of the work for the memory registration
		 * operations.  Note:  When we successfully finish the binding,
		 * we will set the "bi_free_dmahdl" flag to indicate that
		 * even though we may have reused the ddi_dma_handle_t we do
		 * wish it to be freed up at some later time.  Note also that
		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
		 */
		bind->bi_bypass	= bind_type;
		status = hermon_mr_mem_bind(state, bind, dmahdl, sleep, 1);
		if (status != DDI_SUCCESS) {
			if (reuse_dmahdl) {
				/* retained handle was not consumed; free it */
				ddi_dma_free_handle(&dmahdl);
			}

			/*
			 * Deregister will be called upon returning failure
			 * from this routine. This will ensure that all
			 * current resources get properly freed up.
			 * Unnecessary to attempt to regain software ownership
			 * of the MPT entry as that has already been done
			 * above (in hermon_mr_reregister()).  Also unnecessary
			 * to attempt to unbind the memory.
			 */
			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

			status = IBT_INSUFF_RESOURCE;
			goto mrrereghelp_fail;
		}
		if (reuse_dmahdl) {
			bind->bi_free_dmahdl = 1;
		}

		/*
		 * Using the new mapping, but reusing the current MTT
		 * resources, write the updated entries to MTT
		 */
		mtt    = mr->mr_mttrsrcp;
		status = hermon_mr_fast_mtt_write(state, mtt, bind,
		    mtt_pgsize_bits);
		if (status != DDI_SUCCESS) {
			/*
			 * Deregister will be called upon returning failure
			 * from this routine. This will ensure that all
			 * current resources get properly freed up.
			 * Unnecessary to attempt to regain software ownership
			 * of the MPT entry as that has already been done
			 * above (in hermon_mr_reregister()).  Also unnecessary
			 * to attempt to unbind the memory.
			 *
			 * But we do need to unbind the newly bound memory
			 * before returning.
			 */
			hermon_mr_mem_unbind(state, bind);
			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

			/*
			 * hermon_mr_fast_mtt_write() returns DDI_FAILURE
			 * only if it detects a HW error during DMA.
			 */
			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
			status = ibc_get_ci_failure(0);
			goto mrrereghelp_fail;
		}

		/* Put the updated information into the Mem Region handle */
		mr->mr_bindinfo	  = *bind;
		mr->mr_logmttpgsz = mtt_pgsize_bits;

	} else {
		/*
		 * Check if the memory region MTT is shared by any other MRs.
		 * Since the resource may be shared between multiple memory
		 * regions (as a result of a "RegisterSharedMR()" verb) it is
		 * important that we not unbind any resources prematurely.
		 */
		if (!HERMON_MTT_IS_SHARED(swrc_old)) {
			/*
			 * Unbind the old mapping for this memory region, but
			 * retain the ddi_dma_handle_t for reuse in the bind
			 * operation below. Note: This can only be done here
			 * because the region being reregistered is not
			 * currently shared.  Also if original memory region
			 * was bound for IOMMU bypass and the new region can
			 * not use bypass, then a new DMA handle will be
			 * necessary.
			 */
			if (HERMON_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
				mr->mr_bindinfo.bi_free_dmahdl = 0;
				hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
				dmahdl = mr->mr_bindinfo.bi_dmahdl;
				reuse_dmahdl = 1;
			} else {
				hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
				dmahdl = NULL;
				reuse_dmahdl = 0;
			}
		} else {
			/* shared: leave the old mapping bound for other MRs */
			dmahdl = NULL;
			reuse_dmahdl = 0;
		}

		/*
		 * Bind the new memory and determine the mapped addresses.
		 * As described, this routine and hermon_mr_fast_mtt_write()
		 * do the majority of the work for the memory registration
		 * operations.  Note:  When we successfully finish the binding,
		 * we will set the "bi_free_dmahdl" flag to indicate that
		 * even though we may have reused the ddi_dma_handle_t we do
		 * wish it to be freed up at some later time.  Note also that
		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
		 */
		bind->bi_bypass	= bind_type;
		status = hermon_mr_mem_bind(state, bind, dmahdl, sleep, 1);
		if (status != DDI_SUCCESS) {
			if (reuse_dmahdl) {
				ddi_dma_free_handle(&dmahdl);
			}

			/*
			 * Deregister will be called upon returning failure
			 * from this routine. This will ensure that all
			 * current resources get properly freed up.
			 * Unnecessary to attempt to regain software ownership
			 * of the MPT entry as that has already been done
			 * above (in hermon_mr_reregister()).  Also unnecessary
			 * to attempt to unbind the memory.
			 */
			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

			status = IBT_INSUFF_RESOURCE;
			goto mrrereghelp_fail;
		}
		if (reuse_dmahdl) {
			bind->bi_free_dmahdl = 1;
		}

		/*
		 * Allocate the new MTT entries resource
		 */
		status = hermon_rsrc_alloc(state, HERMON_MTT, nummtt_needed,
		    sleep, &mtt);
		if (status != DDI_SUCCESS) {
			/*
			 * Deregister will be called upon returning failure
			 * from this routine. This will ensure that all
			 * current resources get properly freed up.
			 * Unnecessary to attempt to regain software ownership
			 * of the MPT entry as that has already been done
			 * above (in hermon_mr_reregister()).  Also unnecessary
			 * to attempt to unbind the memory.
			 *
			 * But we do need to unbind the newly bound memory
			 * before returning.
			 */
			hermon_mr_mem_unbind(state, bind);
			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

			status = IBT_INSUFF_RESOURCE;
			goto mrrereghelp_fail;
		}

		/*
		 * Allocate MTT reference count (to track shared memory
		 * regions).  As mentioned elsewhere above, this reference
		 * count resource may never be used on the given memory region,
		 * but if it is ever later registered as a "shared" memory
		 * region then this resource will be necessary.  Note:  This
		 * is only necessary here if the existing memory region is
		 * already being shared (because otherwise we already have
		 * a useable reference count resource).
		 */
		if (HERMON_MTT_IS_SHARED(swrc_old)) {
			status = hermon_rsrc_alloc(state, HERMON_REFCNT, 1,
			    sleep, &mtt_refcnt);
			if (status != DDI_SUCCESS) {
				/*
				 * Deregister will be called upon returning
				 * failure from this routine. This will ensure
				 * that all current resources get properly
				 * freed up.  Unnecessary to attempt to regain
				 * software ownership of the MPT entry as that
				 * has already been done above (in
				 * hermon_mr_reregister()).  Also unnecessary
				 * to attempt to unbind the memory.
				 *
				 * But we need to unbind the newly bound
				 * memory and free up the newly allocated MTT
				 * entries before returning.
				 */
				hermon_mr_mem_unbind(state, bind);
				hermon_rsrc_free(state, &mtt);
				*dereg_level =
				    HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

				status = IBT_INSUFF_RESOURCE;
				goto mrrereghelp_fail;
			}
			swrc_new = (hermon_sw_refcnt_t *)mtt_refcnt->hr_addr;
			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_new))
			HERMON_MTT_REFCNT_INIT(swrc_new);
		} else {
			/* existing (unshared) refcount resource is reusable */
			mtt_refcnt = mr->mr_mttrefcntp;
		}

		/*
		 * Using the new mapping and the new MTT resources, write the
		 * updated entries to MTT
		 */
		status = hermon_mr_fast_mtt_write(state, mtt, bind,
		    mtt_pgsize_bits);
		if (status != DDI_SUCCESS) {
			/*
			 * Deregister will be called upon returning failure
			 * from this routine. This will ensure that all
			 * current resources get properly freed up.
			 * Unnecessary to attempt to regain software ownership
			 * of the MPT entry as that has already been done
			 * above (in hermon_mr_reregister()).  Also unnecessary
			 * to attempt to unbind the memory.
			 *
			 * But we need to unbind the newly bound memory,
			 * free up the newly allocated MTT entries, and
			 * (possibly) free the new MTT reference count
			 * resource before returning.
			 */
			if (HERMON_MTT_IS_SHARED(swrc_old)) {
				hermon_rsrc_free(state, &mtt_refcnt);
			}
			hermon_mr_mem_unbind(state, bind);
			hermon_rsrc_free(state, &mtt);
			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;

			status = IBT_INSUFF_RESOURCE;
			goto mrrereghelp_fail;
		}

		/*
		 * Check if the memory region MTT is shared by any other MRs.
		 * Since the resource may be shared between multiple memory
		 * regions (as a result of a "RegisterSharedMR()" verb) it is
		 * important that we not free up any resources prematurely.
		 */
		if (HERMON_MTT_IS_SHARED(swrc_old)) {
			/* Decrement MTT reference count for "old" region */
			(void) hermon_mtt_refcnt_dec(mr->mr_mttrefcntp);
		} else {
			/* Free up the old MTT entries resource */
			hermon_rsrc_free(state, &mr->mr_mttrsrcp);
		}

		/* Put the updated information into the mrhdl */
		mr->mr_bindinfo	  = *bind;
		mr->mr_logmttpgsz = mtt_pgsize_bits;
		mr->mr_mttrsrcp   = mtt;
		mr->mr_mttrefcntp = mtt_refcnt;
	}

	/*
	 * Calculate and return the updated MTT address (in the DDR address
	 * space).  This will be used by the caller (hermon_mr_reregister) in
	 * the updated MPT entry
	 */
	*mtt_addr = mtt->hr_indx << HERMON_MTT_SIZE_SHIFT;

	return (DDI_SUCCESS);

mrrereghelp_fail:
	return (status);
}
2624 
2625 
2626 /*
2627  * hermon_mr_nummtt_needed()
2628  *    Context: Can be called from interrupt or base context.
2629  */
2630 /* ARGSUSED */
2631 static uint64_t
2632 hermon_mr_nummtt_needed(hermon_state_t *state, hermon_bind_info_t *bind,
2633     uint_t *mtt_pgsize_bits)
2634 {
2635 	uint64_t	pg_offset_mask;
2636 	uint64_t	pg_offset, tmp_length;
2637 
2638 	/*
2639 	 * For now we specify the page size as 8Kb (the default page size for
2640 	 * the sun4u architecture), or 4Kb for x86.  Figure out optimal page
2641 	 * size by examining the dmacookies
2642 	 */
2643 	*mtt_pgsize_bits = PAGESHIFT;
2644 
2645 	pg_offset_mask = ((uint64_t)1 << *mtt_pgsize_bits) - 1;
2646 	pg_offset = bind->bi_addr & pg_offset_mask;
2647 	tmp_length = pg_offset + (bind->bi_len - 1);
2648 	return ((tmp_length >> *mtt_pgsize_bits) + 1);
2649 }
2650 
2651 
2652 /*
2653  * hermon_mr_mem_bind()
2654  *    Context: Can be called from interrupt or base context.
2655  */
2656 static int
2657 hermon_mr_mem_bind(hermon_state_t *state, hermon_bind_info_t *bind,
2658     ddi_dma_handle_t dmahdl, uint_t sleep, uint_t is_buffer)
2659 {
2660 	ddi_dma_attr_t	dma_attr;
2661 	int		(*callback)(caddr_t);
2662 	int		status;
2663 
2664 	/* bi_type must be set to a meaningful value to get a bind handle */
2665 	ASSERT(bind->bi_type == HERMON_BINDHDL_VADDR ||
2666 	    bind->bi_type == HERMON_BINDHDL_BUF ||
2667 	    bind->bi_type == HERMON_BINDHDL_UBUF);
2668 
2669 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2670 
2671 	/* Set the callback flag appropriately */
2672 	callback = (sleep == HERMON_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT;
2673 
2674 	/*
2675 	 * Initialize many of the default DMA attributes.  Then, if we're
2676 	 * bypassing the IOMMU, set the DDI_DMA_FORCE_PHYSICAL flag.
2677 	 */
2678 	if (dmahdl == NULL) {
2679 		hermon_dma_attr_init(state, &dma_attr);
2680 #ifdef	__sparc
2681 		if (bind->bi_bypass == HERMON_BINDMEM_BYPASS) {
2682 			dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2683 		}
2684 #endif
2685 
2686 		/* set RO if needed - tunable set and 'is_buffer' is non-0 */
2687 		if (is_buffer) {
2688 			if (! (bind->bi_flags & IBT_MR_DISABLE_RO)) {
2689 				if ((bind->bi_type != HERMON_BINDHDL_UBUF) &&
2690 				    (hermon_kernel_data_ro ==
2691 				    HERMON_RO_ENABLED)) {
2692 					dma_attr.dma_attr_flags |=
2693 					    DDI_DMA_RELAXED_ORDERING;
2694 				}
2695 				if (((bind->bi_type == HERMON_BINDHDL_UBUF) &&
2696 				    (hermon_user_data_ro ==
2697 				    HERMON_RO_ENABLED))) {
2698 					dma_attr.dma_attr_flags |=
2699 					    DDI_DMA_RELAXED_ORDERING;
2700 				}
2701 			}
2702 		}
2703 
2704 		/* Allocate a DMA handle for the binding */
2705 		status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
2706 		    callback, NULL, &bind->bi_dmahdl);
2707 		if (status != DDI_SUCCESS) {
2708 			return (status);
2709 		}
2710 		bind->bi_free_dmahdl = 1;
2711 
2712 	} else  {
2713 		bind->bi_dmahdl = dmahdl;
2714 		bind->bi_free_dmahdl = 0;
2715 	}
2716 
2717 
2718 	/*
2719 	 * Bind the memory to get the PCI mapped addresses.  The decision
2720 	 * to call ddi_dma_addr_bind_handle() or ddi_dma_buf_bind_handle()
2721 	 * is determined by the "bi_type" flag.  Note: if the bind operation
2722 	 * fails then we have to free up the DMA handle and return error.
2723 	 */
2724 	if (bind->bi_type == HERMON_BINDHDL_VADDR) {
2725 		status = ddi_dma_addr_bind_handle(bind->bi_dmahdl, NULL,
2726 		    (caddr_t)(uintptr_t)bind->bi_addr, bind->bi_len,
2727 		    (DDI_DMA_RDWR | DDI_DMA_CONSISTENT), callback, NULL,
2728 		    &bind->bi_dmacookie, &bind->bi_cookiecnt);
2729 
2730 	} else {  /* HERMON_BINDHDL_BUF or HERMON_BINDHDL_UBUF */
2731 
2732 		status = ddi_dma_buf_bind_handle(bind->bi_dmahdl,
2733 		    bind->bi_buf, (DDI_DMA_RDWR | DDI_DMA_CONSISTENT), callback,
2734 		    NULL, &bind->bi_dmacookie, &bind->bi_cookiecnt);
2735 	}
2736 	if (status != DDI_DMA_MAPPED) {
2737 		if (bind->bi_free_dmahdl != 0) {
2738 			ddi_dma_free_handle(&bind->bi_dmahdl);
2739 		}
2740 		return (status);
2741 	}
2742 
2743 	return (DDI_SUCCESS);
2744 }
2745 
2746 
2747 /*
2748  * hermon_mr_mem_unbind()
2749  *    Context: Can be called from interrupt or base context.
2750  */
2751 static void
2752 hermon_mr_mem_unbind(hermon_state_t *state, hermon_bind_info_t *bind)
2753 {
2754 	int	status;
2755 
2756 	/*
2757 	 * In case of HERMON_BINDHDL_UBUF, the memory bi_buf points to
2758 	 * is actually allocated by ddi_umem_iosetup() internally, then
2759 	 * it's required to free it here. Reset bi_type to HERMON_BINDHDL_NONE
2760 	 * not to free it again later.
2761 	 */
2762 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2763 	if (bind->bi_type == HERMON_BINDHDL_UBUF) {
2764 		freerbuf(bind->bi_buf);
2765 		bind->bi_type = HERMON_BINDHDL_NONE;
2766 	}
2767 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
2768 
2769 	/*
2770 	 * Unbind the DMA memory for the region
2771 	 *
2772 	 * Note: The only way ddi_dma_unbind_handle() currently
2773 	 * can return an error is if the handle passed in is invalid.
2774 	 * Since this should never happen, we choose to return void
2775 	 * from this function!  If this does return an error, however,
2776 	 * then we print a warning message to the console.
2777 	 */
2778 	status = ddi_dma_unbind_handle(bind->bi_dmahdl);
2779 	if (status != DDI_SUCCESS) {
2780 		HERMON_WARNING(state, "failed to unbind DMA mapping");
2781 		return;
2782 	}
2783 
2784 	/* Free up the DMA handle */
2785 	if (bind->bi_free_dmahdl != 0) {
2786 		ddi_dma_free_handle(&bind->bi_dmahdl);
2787 	}
2788 }
2789 
2790 
2791 /*
2792  * hermon_mr_fast_mtt_write()
2793  *    Context: Can be called from interrupt or base context.
2794  */
2795 static int
2796 hermon_mr_fast_mtt_write(hermon_state_t *state, hermon_rsrc_t *mtt,
2797     hermon_bind_info_t *bind, uint32_t mtt_pgsize_bits)
2798 {
2799 	hermon_icm_table_t	*icm_table;
2800 	hermon_dma_info_t	*dma_info;
2801 	uint32_t		index1, index2, rindx;
2802 	ddi_dma_cookie_t	dmacookie;
2803 	uint_t			cookie_cnt;
2804 	uint64_t		*mtt_table;
2805 	uint64_t		mtt_entry;
2806 	uint64_t		addr, endaddr;
2807 	uint64_t		pagesize;
2808 	offset_t		i, start;
2809 	uint_t			per_span;
2810 	int			sync_needed;
2811 
2812 	/*
2813 	 * XXX According to the PRM, we are to use the WRITE_MTT
2814 	 * command to write out MTTs. Tavor does not do this,
2815 	 * instead taking advantage of direct access to the MTTs,
2816 	 * and knowledge that Mellanox FMR relies on our ability
2817 	 * to write directly to the MTTs without any further
2818 	 * notification to the firmware. Likewise, we will choose
2819 	 * to not use the WRITE_MTT command, but to simply write
2820 	 * out the MTTs.
2821 	 */
2822 
2823 	/* Calculate page size from the suggested value passed in */
2824 	pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2825 
2826 	/* Walk the "cookie list" and fill in the MTT table entries */
2827 	dmacookie  = bind->bi_dmacookie;
2828 	cookie_cnt = bind->bi_cookiecnt;
2829 
2830 	icm_table = &state->hs_icm[HERMON_MTT];
2831 	rindx = mtt->hr_indx;
2832 	hermon_index(index1, index2, rindx, icm_table, i);
2833 	start = i;
2834 
2835 	per_span   = icm_table->span;
2836 	dma_info   = icm_table->icm_dma[index1] + index2;
2837 	mtt_table  = (uint64_t *)(uintptr_t)dma_info->vaddr;
2838 
2839 	sync_needed = 0;
2840 	while (cookie_cnt-- > 0) {
2841 		addr    = dmacookie.dmac_laddress;
2842 		endaddr = addr + (dmacookie.dmac_size - 1);
2843 		addr    = addr & ~((uint64_t)pagesize - 1);
2844 
2845 		while (addr <= endaddr) {
2846 
2847 			/*
2848 			 * Fill in the mapped addresses (calculated above) and
2849 			 * set HERMON_MTT_ENTRY_PRESENT flag for each MTT entry.
2850 			 */
2851 			mtt_entry = addr | HERMON_MTT_ENTRY_PRESENT;
2852 			mtt_table[i] = htonll(mtt_entry);
2853 			i++;
2854 			rindx++;
2855 
2856 			if (i == per_span) {
2857 
2858 				(void) ddi_dma_sync(dma_info->dma_hdl,
2859 				    start * sizeof (hermon_hw_mtt_t),
2860 				    (i - start) * sizeof (hermon_hw_mtt_t),
2861 				    DDI_DMA_SYNC_FORDEV);
2862 
2863 				if ((addr + pagesize > endaddr) &&
2864 				    (cookie_cnt == 0))
2865 					return (DDI_SUCCESS);
2866 
2867 				hermon_index(index1, index2, rindx, icm_table,
2868 				    i);
2869 				start = i * sizeof (hermon_hw_mtt_t);
2870 				dma_info = icm_table->icm_dma[index1] + index2;
2871 				mtt_table =
2872 				    (uint64_t *)(uintptr_t)dma_info->vaddr;
2873 
2874 				sync_needed = 0;
2875 			} else {
2876 				sync_needed = 1;
2877 			}
2878 
2879 			addr += pagesize;
2880 			if (addr == 0) {
2881 				static int do_once = 1;
2882 				_NOTE(SCHEME_PROTECTS_DATA("safe sharing",
2883 				    do_once))
2884 				if (do_once) {
2885 					do_once = 0;
2886 					cmn_err(CE_NOTE, "probable error in "
2887 					    "dma_cookie address from caller\n");
2888 				}
2889 				break;
2890 			}
2891 		}
2892 
2893 		/*
2894 		 * When we've reached the end of the current DMA cookie,
2895 		 * jump to the next cookie (if there are more)
2896 		 */
2897 		if (cookie_cnt != 0) {
2898 			ddi_dma_nextcookie(bind->bi_dmahdl, &dmacookie);
2899 		}
2900 	}
2901 
2902 	/* done all the cookies, now sync the memory for the device */
2903 	if (sync_needed)
2904 		(void) ddi_dma_sync(dma_info->dma_hdl,
2905 		    start * sizeof (hermon_hw_mtt_t),
2906 		    (i - start) * sizeof (hermon_hw_mtt_t),
2907 		    DDI_DMA_SYNC_FORDEV);
2908 
2909 	return (DDI_SUCCESS);
2910 }
2911 
2912 /*
2913  * hermon_mr_fast_mtt_write_fmr()
2914  *    Context: Can be called from interrupt or base context.
2915  */
2916 static int
2917 hermon_mr_fast_mtt_write_fmr(hermon_rsrc_t *mtt, ibt_pmr_attr_t *mem_pattr,
2918     uint32_t mtt_pgsize_bits)
2919 {
2920 	uint64_t		*mtt_table;
2921 	ibt_phys_addr_t		*buf;
2922 	uint64_t		mtt_entry;
2923 	uint64_t		addr, first_addr, endaddr;
2924 	uint64_t		pagesize;
2925 	int			i;
2926 
2927 	/* Calculate page size from the suggested value passed in */
2928 	pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2929 
2930 	/*
2931 	 * Walk the "addr list" and fill in the MTT table entries
2932 	 */
2933 	mtt_table  = (uint64_t *)mtt->hr_addr;
2934 	for (i = 0; i < mem_pattr->pmr_num_buf; i++) {
2935 		buf = &mem_pattr->pmr_addr_list[i];
2936 
2937 		/*
2938 		 * For first cookie, use the offset field to determine where
2939 		 * the buffer starts.  The end addr is then calculated with the
2940 		 * offset in mind.
2941 		 */
2942 		if (i == 0) {
2943 			first_addr = addr = buf->p_laddr +
2944 			    mem_pattr->pmr_offset;
2945 			endaddr = addr + (mem_pattr->pmr_buf_sz - 1) -
2946 			    mem_pattr->pmr_offset;
2947 		/*
2948 		 * For last cookie, determine end addr based on starting
2949 		 * address and size of the total buffer
2950 		 */
2951 		} else if (i == mem_pattr->pmr_num_buf - 1) {
2952 			addr = buf->p_laddr;
2953 			endaddr = addr + (first_addr + mem_pattr->pmr_len &
2954 			    (mem_pattr->pmr_buf_sz - 1));
2955 		/*
2956 		 * For the middle cookies case, start and end addr are
2957 		 * straightforward.  Just use the laddr, and the size, as all
2958 		 * middle cookies are a set size.
2959 		 */
2960 		} else {
2961 			addr = buf->p_laddr;
2962 			endaddr = addr + (mem_pattr->pmr_buf_sz - 1);
2963 		}
2964 
2965 		addr	= addr & ~((uint64_t)pagesize - 1);
2966 		while (addr <= endaddr) {
2967 			/*
2968 			 * Fill in the mapped addresses (calculated above) and
2969 			 * set HERMON_MTT_ENTRY_PRESENT flag for each MTT entry.
2970 			 */
2971 			mtt_entry = addr | HERMON_MTT_ENTRY_PRESENT;
2972 			mtt_table[i] = htonll(mtt_entry);
2973 			addr += pagesize;
2974 		}
2975 	}
2976 
2977 	return (DDI_SUCCESS);
2978 }
2979 
2980 
2981 /*
2982  * hermon_mtt_refcnt_inc()
2983  *    Context: Can be called from interrupt or base context.
2984  */
2985 static uint_t
2986 hermon_mtt_refcnt_inc(hermon_rsrc_t *rsrc)
2987 {
2988 	hermon_sw_refcnt_t *rc;
2989 
2990 	rc = (hermon_sw_refcnt_t *)rsrc->hr_addr;
2991 	return (atomic_inc_uint_nv(&rc->swrc_refcnt));
2992 }
2993 
2994 
2995 /*
2996  * hermon_mtt_refcnt_dec()
2997  *    Context: Can be called from interrupt or base context.
2998  */
2999 static uint_t
3000 hermon_mtt_refcnt_dec(hermon_rsrc_t *rsrc)
3001 {
3002 	hermon_sw_refcnt_t *rc;
3003 
3004 	rc = (hermon_sw_refcnt_t *)rsrc->hr_addr;
3005 	return (atomic_dec_uint_nv(&rc->swrc_refcnt));
3006 }
3007