1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * hermon_mr.c
29  *    Hermon Memory Region/Window Routines
30  *
31  *    Implements all the routines necessary to provide the requisite memory
32  *    registration verbs.  These include operations like RegisterMemRegion(),
 *    DeregisterMemRegion(), ReregisterMemRegion(), RegisterSharedMemRegion(),
34  *    etc., that affect Memory Regions.  It also includes the verbs that
35  *    affect Memory Windows, including AllocMemWindow(), FreeMemWindow(),
36  *    and QueryMemWindow().
37  */
38 
39 #include <sys/types.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/modctl.h>
44 #include <sys/esunddi.h>
45 
46 #include <sys/ib/adapters/hermon/hermon.h>
47 
48 extern uint32_t hermon_kernel_data_ro;
49 extern uint32_t hermon_user_data_ro;
50 
51 /*
52  * Used by hermon_mr_keycalc() below to fill in the "unconstrained" portion
53  * of Hermon memory keys (LKeys and RKeys)
54  */
55 static	uint_t hermon_memkey_cnt = 0x00;
56 #define	HERMON_MEMKEY_SHIFT	 24
57 #define	HERMON_MPT_SW_OWNERSHIP	 0xF
58 
59 static int hermon_mr_common_reg(hermon_state_t *state, hermon_pdhdl_t pd,
60     hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl, hermon_mr_options_t *op,
61     hermon_mpt_rsrc_type_t mpt_type);
62 static int hermon_mr_common_rereg(hermon_state_t *state, hermon_mrhdl_t mr,
63     hermon_pdhdl_t pd, hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl_new,
64     hermon_mr_options_t *op);
65 static int hermon_mr_rereg_xlat_helper(hermon_state_t *state, hermon_mrhdl_t mr,
66     hermon_bind_info_t *bind, hermon_mr_options_t *op, uint64_t *mtt_addr,
67     uint_t sleep, uint_t *dereg_level);
68 static uint64_t hermon_mr_nummtt_needed(hermon_state_t *state,
69     hermon_bind_info_t *bind, uint_t *mtt_pgsize);
70 static int hermon_mr_mem_bind(hermon_state_t *state, hermon_bind_info_t *bind,
71     ddi_dma_handle_t dmahdl, uint_t sleep, uint_t is_buffer);
72 static void hermon_mr_mem_unbind(hermon_state_t *state,
73     hermon_bind_info_t *bind);
74 static int hermon_mr_fast_mtt_write(hermon_state_t *state, hermon_rsrc_t *mtt,
75     hermon_bind_info_t *bind, uint32_t mtt_pgsize_bits);
76 static int hermon_mr_fast_mtt_write_fmr(hermon_rsrc_t *mtt,
77     ibt_pmr_attr_t *mem_pattr, uint32_t mtt_pgsize_bits);
78 static uint_t hermon_mtt_refcnt_inc(hermon_rsrc_t *rsrc);
79 static uint_t hermon_mtt_refcnt_dec(hermon_rsrc_t *rsrc);
80 
81 
82 /*
83  * The Hermon umem_lockmemory() callback ops.  When userland memory is
84  * registered, these callback ops are specified.  The hermon_umap_umemlock_cb()
85  * callback will be called whenever the memory for the corresponding
86  * ddi_umem_cookie_t is being freed.
87  */
88 static struct umem_callback_ops hermon_umem_cbops = {
89 	UMEM_CALLBACK_VERSION,
90 	hermon_umap_umemlock_cb,
91 };
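
/*
 * Illustrative usage sketch (not part of the driver proper): a caller in
 * base context might register a kernel virtual range by filling in an
 * ibt_mr_attr_t and calling hermon_mr_register() below.  The variable
 * names here ("kaddr", "len") are placeholders, not a prescribed API.
 *
 *	ibt_mr_attr_t	attr;
 *	hermon_mrhdl_t	mrhdl;
 *	int		status;
 *
 *	attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)kaddr;
 *	attr.mr_len   = len;
 *	attr.mr_as    = NULL;		(NULL/kas implies kernel memory)
 *	attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
 *	status = hermon_mr_register(state, pd, &attr, &mrhdl, NULL,
 *	    HERMON_MPT_DMPT);
 */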
92 
93 
94 
95 /*
96  * hermon_mr_register()
97  *    Context: Can be called from interrupt or base context.
98  */
99 int
100 hermon_mr_register(hermon_state_t *state, hermon_pdhdl_t pd,
101     ibt_mr_attr_t *mr_attr, hermon_mrhdl_t *mrhdl, hermon_mr_options_t *op,
102     hermon_mpt_rsrc_type_t mpt_type)
103 {
104 	hermon_bind_info_t	bind;
105 	int			status;
106 
107 	/*
108 	 * Fill in the "bind" struct.  This struct provides the majority
109 	 * of the information that will be used to distinguish between an
110 	 * "addr" binding (as is the case here) and a "buf" binding (see
111 	 * below).  The "bind" struct is later passed to hermon_mr_mem_bind()
112 	 * which does most of the "heavy lifting" for the Hermon memory
113 	 * registration routines.
114 	 */
115 	bind.bi_type  = HERMON_BINDHDL_VADDR;
116 	bind.bi_addr  = mr_attr->mr_vaddr;
117 	bind.bi_len   = mr_attr->mr_len;
118 	bind.bi_as    = mr_attr->mr_as;
119 	bind.bi_flags = mr_attr->mr_flags;
120 	status = hermon_mr_common_reg(state, pd, &bind, mrhdl, op,
121 	    mpt_type);
122 	return (status);
123 }
124 
125 
126 /*
127  * hermon_mr_register_buf()
128  *    Context: Can be called from interrupt or base context.
129  */
130 int
131 hermon_mr_register_buf(hermon_state_t *state, hermon_pdhdl_t pd,
132     ibt_smr_attr_t *mr_attr, struct buf *buf, hermon_mrhdl_t *mrhdl,
133     hermon_mr_options_t *op, hermon_mpt_rsrc_type_t mpt_type)
134 {
135 	hermon_bind_info_t	bind;
136 	int			status;
137 
138 	/*
139 	 * Fill in the "bind" struct.  This struct provides the majority
140 	 * of the information that will be used to distinguish between an
141 	 * "addr" binding (see above) and a "buf" binding (as is the case
142 	 * here).  The "bind" struct is later passed to hermon_mr_mem_bind()
143 	 * which does most of the "heavy lifting" for the Hermon memory
144 	 * registration routines.  Note: We have chosen to provide
145 	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
146 	 * not set).  It is not critical what value we choose here as it need
147 	 * only be unique for the given RKey (which will happen by default),
148 	 * so the choice here is somewhat arbitrary.
149 	 */
150 	bind.bi_type  = HERMON_BINDHDL_BUF;
151 	bind.bi_buf   = buf;
152 	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
153 		bind.bi_addr  = mr_attr->mr_vaddr;
154 	} else {
155 		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
156 	}
157 	bind.bi_as    = NULL;
158 	bind.bi_len   = (uint64_t)buf->b_bcount;
159 	bind.bi_flags = mr_attr->mr_flags;
160 	status = hermon_mr_common_reg(state, pd, &bind, mrhdl, op, mpt_type);
161 	return (status);
162 }
163 
164 
165 /*
166  * hermon_mr_register_shared()
167  *    Context: Can be called from interrupt or base context.
168  */
169 int
170 hermon_mr_register_shared(hermon_state_t *state, hermon_mrhdl_t mrhdl,
171     hermon_pdhdl_t pd, ibt_smr_attr_t *mr_attr, hermon_mrhdl_t *mrhdl_new)
172 {
173 	hermon_rsrc_t		*mpt, *mtt, *rsrc;
174 	hermon_umap_db_entry_t	*umapdb;
175 	hermon_hw_dmpt_t	mpt_entry;
176 	hermon_mrhdl_t		mr;
177 	hermon_bind_info_t	*bind;
178 	ddi_umem_cookie_t	umem_cookie;
179 	size_t			umem_len;
180 	caddr_t			umem_addr;
181 	uint64_t		mtt_addr, pgsize_msk;
182 	uint_t			sleep, mr_is_umem;
183 	int			status, umem_flags;
184 
185 	/*
186 	 * Check the sleep flag.  Ensure that it is consistent with the
187 	 * current thread context (i.e. if we are currently in the interrupt
188 	 * context, then we shouldn't be attempting to sleep).
189 	 */
190 	sleep = (mr_attr->mr_flags & IBT_MR_NOSLEEP) ? HERMON_NOSLEEP :
191 	    HERMON_SLEEP;
192 	if ((sleep == HERMON_SLEEP) &&
193 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
194 		status = IBT_INVALID_PARAM;
195 		goto mrshared_fail;
196 	}
197 
198 	/* Increment the reference count on the protection domain (PD) */
199 	hermon_pd_refcnt_inc(pd);
200 
201 	/*
202 	 * Allocate an MPT entry.  This will be filled in with all the
203 	 * necessary parameters to define the shared memory region.
204 	 * Specifically, it will be made to reference the currently existing
205 	 * MTT entries and ownership of the MPT will be passed to the hardware
206 	 * in the last step below.  If we fail here, we must undo the
207 	 * protection domain reference count.
208 	 */
209 	status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
210 	if (status != DDI_SUCCESS) {
211 		status = IBT_INSUFF_RESOURCE;
212 		goto mrshared_fail1;
213 	}
214 
215 	/*
216 	 * Allocate the software structure for tracking the shared memory
217 	 * region (i.e. the Hermon Memory Region handle).  If we fail here, we
218 	 * must undo the protection domain reference count and the previous
219 	 * resource allocation.
220 	 */
221 	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
222 	if (status != DDI_SUCCESS) {
223 		status = IBT_INSUFF_RESOURCE;
224 		goto mrshared_fail2;
225 	}
226 	mr = (hermon_mrhdl_t)rsrc->hr_addr;
227 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
228 
229 	/*
230 	 * Setup and validate the memory region access flags.  This means
231 	 * translating the IBTF's enable flags into the access flags that
232 	 * will be used in later operations.
233 	 */
234 	mr->mr_accflag = 0;
235 	if (mr_attr->mr_flags & IBT_MR_ENABLE_WINDOW_BIND)
236 		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
237 	if (mr_attr->mr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
238 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
239 	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_READ)
240 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
241 	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
242 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
243 	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
244 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
245 
246 	/*
247 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
248 	 * from a certain number of "constrained" bits (the least significant
249 	 * bits) and some number of "unconstrained" bits.  The constrained
250 	 * bits must be set to the index of the entry in the MPT table, but
251 	 * the unconstrained bits can be set to any value we wish.  Note:
252 	 * if no remote access is required, then the RKey value is not filled
253 	 * in.  Otherwise both Rkey and LKey are given the same value.
254 	 */
255 	mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
256 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
257 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
258 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
259 		mr->mr_rkey = mr->mr_lkey;
260 	}
261 
262 	/* Grab the MR lock for the current memory region */
263 	mutex_enter(&mrhdl->mr_lock);
264 
265 	/*
266 	 * Check here to see if the memory region has already been partially
267 	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
268 	 * If so, this is an error, return failure.
269 	 */
270 	if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
271 		mutex_exit(&mrhdl->mr_lock);
272 		status = IBT_MR_HDL_INVALID;
273 		goto mrshared_fail3;
274 	}
275 
276 	/*
277 	 * Determine if the original memory was from userland and, if so, pin
278 	 * the pages (again) with umem_lockmemory().  This will guarantee a
279 	 * separate callback for each of this shared region's MR handles.
280 	 * If this is userland memory, then allocate an entry in the
281 	 * "userland resources database".  This will later be added to
282 	 * the database (after all further memory registration operations are
283 	 * successful).  If we fail here, we must undo all the above setup.
284 	 */
285 	mr_is_umem = mrhdl->mr_is_umem;
286 	if (mr_is_umem) {
		umem_len   = ptob(btopr(mrhdl->mr_bindinfo.bi_len +
		    ((uintptr_t)mrhdl->mr_bindinfo.bi_addr & PAGEOFFSET)));
		umem_addr  = (caddr_t)((uintptr_t)mrhdl->mr_bindinfo.bi_addr &
		    ~PAGEOFFSET);
290 		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
291 		    DDI_UMEMLOCK_LONGTERM);
292 		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
293 		    &umem_cookie, &hermon_umem_cbops, NULL);
294 		if (status != 0) {
295 			mutex_exit(&mrhdl->mr_lock);
296 			status = IBT_INSUFF_RESOURCE;
297 			goto mrshared_fail3;
298 		}
299 
300 		umapdb = hermon_umap_db_alloc(state->hs_instance,
301 		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
302 		    (uint64_t)(uintptr_t)rsrc);
303 		if (umapdb == NULL) {
304 			mutex_exit(&mrhdl->mr_lock);
305 			status = IBT_INSUFF_RESOURCE;
306 			goto mrshared_fail4;
307 		}
308 	}
309 
310 	/*
311 	 * Copy the MTT resource pointer (and additional parameters) from
312 	 * the original Hermon Memory Region handle.  Note: this is normally
313 	 * where the hermon_mr_mem_bind() routine would be called, but because
314 	 * we already have bound and filled-in MTT entries it is simply a
315 	 * matter here of managing the MTT reference count and grabbing the
316 	 * address of the MTT table entries (for filling in the shared region's
317 	 * MPT entry).
318 	 */
319 	mr->mr_mttrsrcp	  = mrhdl->mr_mttrsrcp;
320 	mr->mr_logmttpgsz = mrhdl->mr_logmttpgsz;
321 	mr->mr_bindinfo	  = mrhdl->mr_bindinfo;
322 	mr->mr_mttrefcntp = mrhdl->mr_mttrefcntp;
323 	mutex_exit(&mrhdl->mr_lock);
324 	bind = &mr->mr_bindinfo;
325 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
326 	mtt = mr->mr_mttrsrcp;
327 
328 	/*
329 	 * Increment the MTT reference count (to reflect the fact that
330 	 * the MTT is now shared)
331 	 */
332 	(void) hermon_mtt_refcnt_inc(mr->mr_mttrefcntp);
333 
334 	/*
335 	 * Update the new "bind" virtual address.  Do some extra work here
336 	 * to ensure proper alignment.  That is, make sure that the page
337 	 * offset for the beginning of the old range is the same as the
338 	 * offset for this new mapping
339 	 */
340 	pgsize_msk = (((uint64_t)1 << mr->mr_logmttpgsz) - 1);
341 	bind->bi_addr = ((mr_attr->mr_vaddr & ~pgsize_msk) |
342 	    (mr->mr_bindinfo.bi_addr & pgsize_msk));
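
	/*
	 * Worked example (illustrative values only, assuming a 4KB MTT
	 * page size, i.e. mr_logmttpgsz == 12, so pgsize_msk == 0xFFF):
	 * if the original binding began at 0x10234 and the new mr_vaddr
	 * is 0x2F000, then the new bi_addr is
	 * (0x2F000 & ~0xFFF) | (0x10234 & 0xFFF) = 0x2F234, which
	 * preserves the original page offset of 0x234.
	 */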
343 
344 	/*
345 	 * Fill in the MPT entry.  This is the final step before passing
346 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
347 	 * the information collected/calculated above to fill in the
348 	 * requisite portions of the MPT.
349 	 */
350 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
351 	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
352 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
353 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
354 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
355 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
356 	mpt_entry.lr	  = 1;
357 	mpt_entry.reg_win = HERMON_MPT_IS_REGION;
358 	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
359 	mpt_entry.mem_key	= mr->mr_lkey;
360 	mpt_entry.pd		= pd->pd_pdnum;
361 	mpt_entry.start_addr	= bind->bi_addr;
362 	mpt_entry.reg_win_len	= bind->bi_len;
363 	mtt_addr = (mtt->hr_indx << HERMON_MTT_SIZE_SHIFT);
364 	mpt_entry.mtt_addr_h = mtt_addr >> 32;
365 	mpt_entry.mtt_addr_l = mtt_addr >> 3;
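
	/*
	 * Worked example (illustrative values only, assuming 8-byte MTT
	 * entries, i.e. HERMON_MTT_SIZE_SHIFT == 3): an MTT index of
	 * 0x12345 gives mtt_addr = 0x12345 << 3 = 0x91A28.  The upper 32
	 * bits (zero here) go to mtt_addr_h, and mtt_addr >> 3 restores
	 * the index for the 8-byte-aligned mtt_addr_l field.
	 */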
366 
367 	/*
368 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
369 	 * the entry to the hardware.  Note: in general, this operation
370 	 * shouldn't fail.  But if it does, we have to undo everything we've
371 	 * done above before returning error.
372 	 */
373 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
374 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
375 	if (status != HERMON_CMD_SUCCESS) {
376 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
377 		    status);
378 		if (status == HERMON_CMD_INVALID_STATUS) {
379 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
380 		}
381 		status = ibc_get_ci_failure(0);
382 		goto mrshared_fail5;
383 	}
384 
385 	/*
386 	 * Fill in the rest of the Hermon Memory Region handle.  Having
387 	 * successfully transferred ownership of the MPT, we can update the
388 	 * following fields for use in further operations on the MR.
389 	 */
390 	mr->mr_mptrsrcp	  = mpt;
391 	mr->mr_mttrsrcp	  = mtt;
392 	mr->mr_mpt_type	  = HERMON_MPT_DMPT;
393 	mr->mr_pdhdl	  = pd;
394 	mr->mr_rsrcp	  = rsrc;
395 	mr->mr_is_umem	  = mr_is_umem;
396 	mr->mr_is_fmr	  = 0;
397 	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
398 	mr->mr_umem_cbfunc = NULL;
399 	mr->mr_umem_cbarg1 = NULL;
400 	mr->mr_umem_cbarg2 = NULL;
401 	mr->mr_lkey	   = hermon_mr_key_swap(mr->mr_lkey);
402 	mr->mr_rkey	   = hermon_mr_key_swap(mr->mr_rkey);
403 
404 	/*
405 	 * If this is userland memory, then we need to insert the previously
406 	 * allocated entry into the "userland resources database".  This will
407 	 * allow for later coordination between the hermon_umap_umemlock_cb()
408 	 * callback and hermon_mr_deregister().
409 	 */
410 	if (mr_is_umem) {
411 		hermon_umap_db_add(umapdb);
412 	}
413 
414 	*mrhdl_new = mr;
415 
416 	return (DDI_SUCCESS);
417 
418 /*
419  * The following is cleanup for all possible failure cases in this routine
420  */
421 mrshared_fail5:
422 	(void) hermon_mtt_refcnt_dec(mr->mr_mttrefcntp);
423 	if (mr_is_umem) {
424 		hermon_umap_db_free(umapdb);
425 	}
426 mrshared_fail4:
427 	if (mr_is_umem) {
428 		ddi_umem_unlock(umem_cookie);
429 	}
430 mrshared_fail3:
431 	hermon_rsrc_free(state, &rsrc);
432 mrshared_fail2:
433 	hermon_rsrc_free(state, &mpt);
434 mrshared_fail1:
435 	hermon_pd_refcnt_dec(pd);
436 mrshared_fail:
437 	return (status);
438 }
439 
440 /*
441  * hermon_mr_alloc_fmr()
442  *    Context: Can be called from interrupt or base context.
443  */
444 int
445 hermon_mr_alloc_fmr(hermon_state_t *state, hermon_pdhdl_t pd,
446     hermon_fmrhdl_t fmr_pool, hermon_mrhdl_t *mrhdl)
447 {
448 	hermon_rsrc_t		*mpt, *mtt, *rsrc;
449 	hermon_hw_dmpt_t		mpt_entry;
450 	hermon_mrhdl_t		mr;
451 	hermon_bind_info_t	bind;
452 	uint64_t		mtt_addr;
453 	uint64_t		nummtt;
454 	uint_t			sleep, mtt_pgsize_bits;
455 	int			status;
456 
457 	/*
458 	 * Check the sleep flag.  Ensure that it is consistent with the
459 	 * current thread context (i.e. if we are currently in the interrupt
460 	 * context, then we shouldn't be attempting to sleep).
461 	 */
462 	sleep = (fmr_pool->fmr_flags & IBT_MR_SLEEP) ? HERMON_SLEEP :
463 	    HERMON_NOSLEEP;
464 	if ((sleep == HERMON_SLEEP) &&
465 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
466 		return (IBT_INVALID_PARAM);
467 	}
468 
469 	/* Increment the reference count on the protection domain (PD) */
470 	hermon_pd_refcnt_inc(pd);
471 
472 	/*
473 	 * Allocate an MPT entry.  This will be filled in with all the
474 	 * necessary parameters to define the FMR.  Specifically, it will be
475 	 * made to reference the currently existing MTT entries and ownership
476 	 * of the MPT will be passed to the hardware in the last step below.
477 	 * If we fail here, we must undo the protection domain reference count.
478 	 */
479 
480 	status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
481 	if (status != DDI_SUCCESS) {
482 		status = IBT_INSUFF_RESOURCE;
483 		goto fmralloc_fail1;
484 	}
485 
486 	/*
487 	 * Allocate the software structure for tracking the fmr memory
488 	 * region (i.e. the Hermon Memory Region handle).  If we fail here, we
489 	 * must undo the protection domain reference count and the previous
490 	 * resource allocation.
491 	 */
492 	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
493 	if (status != DDI_SUCCESS) {
494 		status = IBT_INSUFF_RESOURCE;
495 		goto fmralloc_fail2;
496 	}
497 	mr = (hermon_mrhdl_t)rsrc->hr_addr;
498 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
499 
500 	/*
501 	 * Setup and validate the memory region access flags.  This means
502 	 * translating the IBTF's enable flags into the access flags that
503 	 * will be used in later operations.
504 	 */
505 	mr->mr_accflag = 0;
506 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
507 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
508 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_READ)
509 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
510 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
511 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
512 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
513 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
514 
515 	/*
516 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
517 	 * from a certain number of "constrained" bits (the least significant
518 	 * bits) and some number of "unconstrained" bits.  The constrained
519 	 * bits must be set to the index of the entry in the MPT table, but
520 	 * the unconstrained bits can be set to any value we wish.  Note:
521 	 * if no remote access is required, then the RKey value is not filled
522 	 * in.  Otherwise both Rkey and LKey are given the same value.
523 	 */
524 	mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
525 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
526 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
527 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
528 		mr->mr_rkey = mr->mr_lkey;
529 	}
530 
531 	/*
532 	 * Determine number of pages spanned.  This routine uses the
533 	 * information in the "bind" struct to determine the required
534 	 * number of MTT entries needed (and returns the suggested page size -
535 	 * as a "power-of-2" - for each MTT entry).
536 	 */
537 	/* Assume address will be page aligned later */
538 	bind.bi_addr = 0;
539 	/* Calculate size based on given max pages */
540 	bind.bi_len = fmr_pool->fmr_max_pages << PAGESHIFT;
541 	nummtt = hermon_mr_nummtt_needed(state, &bind, &mtt_pgsize_bits);
542 
543 	/*
544 	 * Allocate the MTT entries.  Use the calculations performed above to
545 	 * allocate the required number of MTT entries.  If we fail here, we
546 	 * must not only undo all the previous resource allocation (and PD
547 	 * reference count), but we must also unbind the memory.
548 	 */
549 	status = hermon_rsrc_alloc(state, HERMON_MTT, nummtt, sleep, &mtt);
550 	if (status != DDI_SUCCESS) {
551 		status = IBT_INSUFF_RESOURCE;
552 		goto fmralloc_fail3;
553 	}
554 	mr->mr_logmttpgsz = mtt_pgsize_bits;
555 
556 	/*
557 	 * Fill in the MPT entry.  This is the final step before passing
558 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
559 	 * the information collected/calculated above to fill in the
560 	 * requisite portions of the MPT.
561 	 */
562 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
563 	mpt_entry.en_bind = 0;
564 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
565 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
566 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
567 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
568 	mpt_entry.lr	  = 1;
569 	mpt_entry.reg_win = HERMON_MPT_IS_REGION;
570 	mpt_entry.pd		= pd->pd_pdnum;
571 
572 	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
573 	mtt_addr = (mtt->hr_indx << HERMON_MTT_SIZE_SHIFT);
574 	mpt_entry.mtt_addr_h = mtt_addr >> 32;
575 	mpt_entry.mtt_addr_l = mtt_addr >> 3;
576 	mpt_entry.mem_key = mr->mr_lkey;
577 
578 	/*
579 	 * FMR sets these to 0 for now.  Later during actual fmr registration
580 	 * these values are filled in.
581 	 */
582 	mpt_entry.start_addr	= 0;
583 	mpt_entry.reg_win_len	= 0;
584 
585 	/*
586 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
587 	 * the entry to the hardware.  Note: in general, this operation
588 	 * shouldn't fail.  But if it does, we have to undo everything we've
589 	 * done above before returning error.
590 	 */
591 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
592 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
593 	if (status != HERMON_CMD_SUCCESS) {
594 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
595 		    status);
596 		if (status == HERMON_CMD_INVALID_STATUS) {
597 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
598 		}
599 		status = ibc_get_ci_failure(0);
600 		goto fmralloc_fail4;
601 	}
602 
603 	/*
604 	 * Fill in the rest of the Hermon Memory Region handle.  Having
605 	 * successfully transferred ownership of the MPT, we can update the
606 	 * following fields for use in further operations on the MR.  Also, set
607 	 * that this is an FMR region.
608 	 */
609 	mr->mr_mptrsrcp	  = mpt;
610 	mr->mr_mttrsrcp	  = mtt;
611 	mr->mr_mpt_type   = HERMON_MPT_DMPT;
612 	mr->mr_pdhdl	  = pd;
613 	mr->mr_rsrcp	  = rsrc;
614 	mr->mr_is_fmr	  = 1;
615 	mr->mr_lkey	   = hermon_mr_key_swap(mr->mr_lkey);
616 	mr->mr_rkey	   = hermon_mr_key_swap(mr->mr_rkey);
617 	(void) memcpy(&mr->mr_bindinfo, &bind, sizeof (hermon_bind_info_t));
618 
619 	*mrhdl = mr;
620 
621 	return (DDI_SUCCESS);
622 
623 /*
624  * The following is cleanup for all possible failure cases in this routine
625  */
626 fmralloc_fail4:
	hermon_rsrc_free(state, &mtt);
628 fmralloc_fail3:
629 	hermon_rsrc_free(state, &rsrc);
630 fmralloc_fail2:
631 	hermon_rsrc_free(state, &mpt);
632 fmralloc_fail1:
633 	hermon_pd_refcnt_dec(pd);
635 	return (status);
636 }
637 
638 /*
639  * hermon_mr_register_physical_fmr()
640  *    Context: Can be called from interrupt or base context.
641  */
642 /*ARGSUSED*/
643 int
644 hermon_mr_register_physical_fmr(hermon_state_t *state,
645     ibt_pmr_attr_t *mem_pattr_p, hermon_mrhdl_t mr, ibt_pmr_desc_t *mem_desc_p)
646 {
647 	hermon_rsrc_t		*mpt;
648 	uint64_t		*mpt_table;
649 	int			status;
650 
651 	mutex_enter(&mr->mr_lock);
652 	mpt = mr->mr_mptrsrcp;
653 	mpt_table = (uint64_t *)mpt->hr_addr;
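
	/*
	 * The dMPT is treated below as an array of 64-bit words: word 0
	 * carries the ownership/status byte (0xF returns the entry to
	 * software, 0x0 hands it back to hardware), word 1 holds the
	 * memory key, word 2 the start address, word 3 the region length,
	 * and word 4 the lkey.  This summary simply mirrors the
	 * ddi_put*() calls that follow.
	 */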
654 
655 	/* Write MPT status to SW bit */
656 	ddi_put8(mpt->hr_acchdl, (uint8_t *)&mpt_table[0], 0xF);
657 
658 	/*
659 	 * Write the mapped addresses into the MTT entries.  FMR needs to do
660 	 * this a little differently, so we call the fmr specific fast mtt
661 	 * write here.
662 	 */
663 	status = hermon_mr_fast_mtt_write_fmr(mr->mr_mttrsrcp, mem_pattr_p,
664 	    mr->mr_logmttpgsz);
665 	if (status != DDI_SUCCESS) {
666 		mutex_exit(&mr->mr_lock);
667 		status = ibc_get_ci_failure(0);
668 		goto fmr_reg_fail1;
669 	}
670 
671 	/*
672 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
673 	 * from a certain number of "constrained" bits (the least significant
674 	 * bits) and some number of "unconstrained" bits.  The constrained
675 	 * bits must be set to the index of the entry in the MPT table, but
676 	 * the unconstrained bits can be set to any value we wish.  Note:
677 	 * if no remote access is required, then the RKey value is not filled
678 	 * in.  Otherwise both Rkey and LKey are given the same value.
679 	 */
680 	mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
681 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
682 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
683 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
684 		mr->mr_rkey = mr->mr_lkey;
685 	}
686 
687 	/* write mem key value */
688 	ddi_put32(mpt->hr_acchdl, (uint32_t *)&mpt_table[1], mr->mr_lkey);
689 
690 	/* write length value */
691 	ddi_put64(mpt->hr_acchdl, &mpt_table[3], mem_pattr_p->pmr_len);
692 
693 	/* write start addr value */
694 	ddi_put64(mpt->hr_acchdl, &mpt_table[2], mem_pattr_p->pmr_iova);
695 
696 	/* write lkey value */
697 	ddi_put32(mpt->hr_acchdl, (uint32_t *)&mpt_table[4], mr->mr_lkey);
698 
699 	/* Write MPT status to HW bit */
700 	ddi_put8(mpt->hr_acchdl, (uint8_t *)&mpt_table[0], 0x0);
701 
702 	/* Fill in return parameters */
703 	mem_desc_p->pmd_lkey = mr->mr_lkey;
704 	mem_desc_p->pmd_rkey = mr->mr_rkey;
705 	mem_desc_p->pmd_iova = mem_pattr_p->pmr_iova;
706 	mem_desc_p->pmd_phys_buf_list_sz = mem_pattr_p->pmr_len;
707 
708 	/* Fill in MR bindinfo struct for later sync or query operations */
709 	mr->mr_bindinfo.bi_addr = mem_pattr_p->pmr_iova;
710 	mr->mr_bindinfo.bi_flags = mem_pattr_p->pmr_flags & IBT_MR_NONCOHERENT;
711 
712 	mutex_exit(&mr->mr_lock);
713 
714 	return (DDI_SUCCESS);
715 
716 fmr_reg_fail1:
717 	/*
718 	 * Note, we fail here, and purposely leave the memory ownership in
719 	 * software.  The memory tables may be corrupt, so we leave the region
720 	 * unregistered.
721 	 */
722 	return (DDI_FAILURE);
723 }
724 
725 
726 /*
727  * hermon_mr_deregister()
728  *    Context: Can be called from interrupt or base context.
729  */
730 /* ARGSUSED */
731 int
732 hermon_mr_deregister(hermon_state_t *state, hermon_mrhdl_t *mrhdl, uint_t level,
733     uint_t sleep)
734 {
735 	hermon_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
736 	hermon_umap_db_entry_t	*umapdb;
737 	hermon_pdhdl_t		pd;
738 	hermon_mrhdl_t		mr;
739 	hermon_bind_info_t	*bind;
740 	uint64_t		value;
741 	int			status;
742 	uint_t			shared_mtt;
743 
744 	/*
745 	 * Check the sleep flag.  Ensure that it is consistent with the
746 	 * current thread context (i.e. if we are currently in the interrupt
747 	 * context, then we shouldn't be attempting to sleep).
748 	 */
749 	if ((sleep == HERMON_SLEEP) &&
750 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
751 		status = IBT_INVALID_PARAM;
752 		return (status);
753 	}
754 
755 	/*
756 	 * Pull all the necessary information from the Hermon Memory Region
757 	 * handle.  This is necessary here because the resource for the
	 * MR handle is going to be freed up as part of this
	 * deregistration.
760 	 */
761 	mr	= *mrhdl;
762 	mutex_enter(&mr->mr_lock);
763 	mpt	= mr->mr_mptrsrcp;
764 	mtt	= mr->mr_mttrsrcp;
765 	mtt_refcnt = mr->mr_mttrefcntp;
766 	rsrc	= mr->mr_rsrcp;
767 	pd	= mr->mr_pdhdl;
768 	bind	= &mr->mr_bindinfo;
769 
770 	/*
771 	 * Check here if the memory region is really an FMR.  If so, this is a
772 	 * bad thing and we shouldn't be here.  Return failure.
773 	 */
774 	if (mr->mr_is_fmr) {
775 		mutex_exit(&mr->mr_lock);
776 		return (IBT_INVALID_PARAM);
777 	}
778 
779 	/*
780 	 * Check here to see if the memory region has already been partially
781 	 * deregistered as a result of the hermon_umap_umemlock_cb() callback.
782 	 * If so, then jump to the end and free the remaining resources.
783 	 */
784 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
785 		goto mrdereg_finish_cleanup;
786 	}
787 
788 	/*
789 	 * We must drop the "mr_lock" here to ensure that both SLEEP and
790 	 * NOSLEEP calls into the firmware work as expected.  Also, if two
	 * threads are attempting to access this MR (via de-register,
792 	 * re-register, or otherwise), then we allow the firmware to enforce
793 	 * the checking, that only one deregister is valid.
794 	 */
795 	mutex_exit(&mr->mr_lock);
796 
797 	/*
798 	 * Reclaim MPT entry from hardware (if necessary).  Since the
799 	 * hermon_mr_deregister() routine is used in the memory region
800 	 * reregistration process as well, it is possible that we will
801 	 * not always wish to reclaim ownership of the MPT.  Check the
802 	 * "level" arg and, if necessary, attempt to reclaim it.  If
803 	 * the ownership transfer fails for any reason, we check to see
804 	 * what command status was returned from the hardware.  The only
805 	 * "expected" error status is the one that indicates an attempt to
806 	 * deregister a memory region that has memory windows bound to it
807 	 */
808 	if (level >= HERMON_MR_DEREG_ALL) {
809 		if (mr->mr_mpt_type >= HERMON_MPT_DMPT) {
810 			status = hermon_cmn_ownership_cmd_post(state, HW2SW_MPT,
811 			    NULL, 0, mpt->hr_indx, sleep);
812 			if (status != HERMON_CMD_SUCCESS) {
813 				if (status == HERMON_CMD_REG_BOUND) {
814 					return (IBT_MR_IN_USE);
815 				} else {
816 					cmn_err(CE_CONT, "Hermon: HW2SW_MPT "
817 					    "command failed: %08x\n", status);
818 					if (status ==
819 					    HERMON_CMD_INVALID_STATUS) {
820 						hermon_fm_ereport(state,
821 						    HCA_SYS_ERR,
						    HCA_ERR_SRV_LOST);
823 					}
824 					return (IBT_INVALID_PARAM);
825 				}
826 			}
827 		}
828 	}
829 
830 	/*
831 	 * Re-grab the mr_lock here.  Since further access to the protected
832 	 * 'mr' structure is needed, and we would have returned previously for
833 	 * the multiple deregistration case, we can safely grab the lock here.
834 	 */
835 	mutex_enter(&mr->mr_lock);
836 
837 	/*
838 	 * If the memory had come from userland, then we do a lookup in the
839 	 * "userland resources database".  On success, we free the entry, call
840 	 * ddi_umem_unlock(), and continue the cleanup.  On failure (which is
841 	 * an indication that the umem_lockmemory() callback has called
842 	 * hermon_mr_deregister()), we call ddi_umem_unlock() and invalidate
843 	 * the "mr_umemcookie" field in the MR handle (this will be used
	 * later to detect that only partial cleanup still remains to be done
845 	 * on the MR handle).
846 	 */
847 	if (mr->mr_is_umem) {
848 		status = hermon_umap_db_find(state->hs_instance,
849 		    (uint64_t)(uintptr_t)mr->mr_umemcookie,
850 		    MLNX_UMAP_MRMEM_RSRC, &value, HERMON_UMAP_DB_REMOVE,
851 		    &umapdb);
852 		if (status == DDI_SUCCESS) {
853 			hermon_umap_db_free(umapdb);
854 			ddi_umem_unlock(mr->mr_umemcookie);
855 		} else {
856 			ddi_umem_unlock(mr->mr_umemcookie);
857 			mr->mr_umemcookie = NULL;
858 		}
859 	}
860 
861 	/*
862 	 * Decrement the MTT reference count.  Since the MTT resource
863 	 * may be shared between multiple memory regions (as a result
864 	 * of a "RegisterSharedMR" verb) it is important that we not
865 	 * free up or unbind resources prematurely.  If it's not shared (as
866 	 * indicated by the return status), then free the resource.
867 	 */
868 	shared_mtt = hermon_mtt_refcnt_dec(mtt_refcnt);
869 	if (!shared_mtt) {
870 		hermon_rsrc_free(state, &mtt_refcnt);
871 	}
872 
873 	/*
874 	 * Free up the MTT entries and unbind the memory.  Here, as above, we
875 	 * attempt to free these resources only if it is appropriate to do so.
876 	 */
877 	if (!shared_mtt) {
878 		if (level >= HERMON_MR_DEREG_NO_HW2SW_MPT) {
879 			hermon_mr_mem_unbind(state, bind);
880 		}
881 		hermon_rsrc_free(state, &mtt);
882 	}
883 
884 	/*
885 	 * If the MR handle has been invalidated, then drop the
886 	 * lock and return success.  Note: This only happens because
887 	 * the umem_lockmemory() callback has been triggered.  The
888 	 * cleanup here is partial, and further cleanup (in a
889 	 * subsequent hermon_mr_deregister() call) will be necessary.
890 	 */
891 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
892 		mutex_exit(&mr->mr_lock);
893 		return (DDI_SUCCESS);
894 	}
895 
896 mrdereg_finish_cleanup:
897 	mutex_exit(&mr->mr_lock);
898 
899 	/* Free the Hermon Memory Region handle */
900 	hermon_rsrc_free(state, &rsrc);
901 
902 	/* Free up the MPT entry resource */
903 	if (mpt != NULL)
904 		hermon_rsrc_free(state, &mpt);
905 
906 	/* Decrement the reference count on the protection domain (PD) */
907 	hermon_pd_refcnt_dec(pd);
908 
909 	/* Set the mrhdl pointer to NULL and return success */
910 	*mrhdl = NULL;
911 
912 	return (DDI_SUCCESS);
913 }
914 
915 /*
916  * hermon_mr_dealloc_fmr()
917  *    Context: Can be called from interrupt or base context.
918  */
919 /* ARGSUSED */
920 int
921 hermon_mr_dealloc_fmr(hermon_state_t *state, hermon_mrhdl_t *mrhdl)
922 {
923 	hermon_rsrc_t		*mpt, *mtt, *rsrc;
924 	hermon_pdhdl_t		pd;
925 	hermon_mrhdl_t		mr;
926 
927 	/*
928 	 * Pull all the necessary information from the Hermon Memory Region
929 	 * handle.  This is necessary here because the resource for the
	 * MR handle is going to be freed up as part of this
	 * deregistration.
932 	 */
933 	mr	= *mrhdl;
934 	mutex_enter(&mr->mr_lock);
935 	mpt	= mr->mr_mptrsrcp;
936 	mtt	= mr->mr_mttrsrcp;
937 	rsrc	= mr->mr_rsrcp;
938 	pd	= mr->mr_pdhdl;
939 	mutex_exit(&mr->mr_lock);
940 
941 	/* Free the MTT entries */
942 	hermon_rsrc_free(state, &mtt);
943 
944 	/* Free the Hermon Memory Region handle */
945 	hermon_rsrc_free(state, &rsrc);
946 
947 	/* Free up the MPT entry resource */
948 	hermon_rsrc_free(state, &mpt);
949 
950 	/* Decrement the reference count on the protection domain (PD) */
951 	hermon_pd_refcnt_dec(pd);
952 
953 	/* Set the mrhdl pointer to NULL and return success */
954 	*mrhdl = NULL;
955 
956 	return (DDI_SUCCESS);
957 }
958 
959 /*
960  * hermon_mr_invalidate_fmr()
961  *    Context: Can be called from interrupt or base context.
962  */
963 /* ARGSUSED */
964 int
965 hermon_mr_invalidate_fmr(hermon_state_t *state, hermon_mrhdl_t mr)
966 {
967 	hermon_rsrc_t		*mpt;
968 	uint64_t		*mpt_table;
969 
970 	mutex_enter(&mr->mr_lock);
971 	mpt = mr->mr_mptrsrcp;
972 	mpt_table = (uint64_t *)mpt->hr_addr;
973 
974 	/* Write MPT status to SW bit */
975 	ddi_put8(mpt->hr_acchdl, (uint8_t *)&mpt_table[0], 0xF);
976 
977 	/* invalidate mem key value */
978 	ddi_put32(mpt->hr_acchdl, (uint32_t *)&mpt_table[1], 0);
979 
980 	/* invalidate lkey value */
981 	ddi_put32(mpt->hr_acchdl, (uint32_t *)&mpt_table[4], 0);
982 
983 	/* Write MPT status to HW bit */
984 	ddi_put8(mpt->hr_acchdl, (uint8_t *)&mpt_table[0], 0x0);
985 
986 	mutex_exit(&mr->mr_lock);
987 
988 	return (DDI_SUCCESS);
989 }
990 
991 /*
992  * hermon_mr_deregister_fmr()
993  *    Context: Can be called from interrupt or base context.
994  */
995 /* ARGSUSED */
996 int
997 hermon_mr_deregister_fmr(hermon_state_t *state, hermon_mrhdl_t mr)
998 {
999 	hermon_rsrc_t		*mpt;
1000 	uint64_t		*mpt_table;
1001 
1002 	mutex_enter(&mr->mr_lock);
1003 	mpt = mr->mr_mptrsrcp;
1004 	mpt_table = (uint64_t *)mpt->hr_addr;
1005 
1006 	/* Write MPT status to SW bit */
1007 	ddi_put8(mpt->hr_acchdl, (uint8_t *)&mpt_table[0], 0xF);
1008 	mutex_exit(&mr->mr_lock);
1009 
1010 	return (DDI_SUCCESS);
1011 }
1012 
1013 
1014 /*
1015  * hermon_mr_query()
1016  *    Context: Can be called from interrupt or base context.
1017  */
1018 /* ARGSUSED */
1019 int
1020 hermon_mr_query(hermon_state_t *state, hermon_mrhdl_t mr,
1021     ibt_mr_query_attr_t *attr)
1022 {
1023 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr))
1024 
1025 	mutex_enter(&mr->mr_lock);
1026 
1027 	/*
1028 	 * Check here to see if the memory region has already been partially
1029 	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
1030 	 * If so, this is an error, return failure.
1031 	 */
1032 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
1033 		mutex_exit(&mr->mr_lock);
1034 		return (IBT_MR_HDL_INVALID);
1035 	}
1036 
1037 	/* Fill in the queried attributes */
1038 	attr->mr_attr_flags = mr->mr_accflag;
1039 	attr->mr_pd	= (ibt_pd_hdl_t)mr->mr_pdhdl;
1040 
1041 	/* Fill in the "local" attributes */
1042 	attr->mr_lkey = (ibt_lkey_t)mr->mr_lkey;
1043 	attr->mr_lbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
1044 	attr->mr_lbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
1045 
1046 	/*
1047 	 * Fill in the "remote" attributes (if necessary).  Note: the
1048 	 * remote attributes are only valid if the memory region has one
1049 	 * or more of the remote access flags set.
1050 	 */
1051 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1052 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1053 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1054 		attr->mr_rkey = (ibt_rkey_t)mr->mr_rkey;
1055 		attr->mr_rbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
1056 		attr->mr_rbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
1057 	}
1058 
1059 	/*
1060 	 * If region is mapped for streaming (i.e. noncoherent), then set sync
1061 	 * is required
1062 	 */
1063 	attr->mr_sync_required = (mr->mr_bindinfo.bi_flags &
1064 	    IBT_MR_NONCOHERENT) ? B_TRUE : B_FALSE;
1065 
1066 	mutex_exit(&mr->mr_lock);
1067 	return (DDI_SUCCESS);
1068 }
1069 
1070 
1071 /*
1072  * hermon_mr_reregister()
1073  *    Context: Can be called from interrupt or base context.
1074  */
1075 int
1076 hermon_mr_reregister(hermon_state_t *state, hermon_mrhdl_t mr,
1077     hermon_pdhdl_t pd, ibt_mr_attr_t *mr_attr, hermon_mrhdl_t *mrhdl_new,
1078     hermon_mr_options_t *op)
1079 {
1080 	hermon_bind_info_t	bind;
1081 	int			status;
1082 
1083 	/*
1084 	 * Fill in the "bind" struct.  This struct provides the majority
1085 	 * of the information that will be used to distinguish between an
1086 	 * "addr" binding (as is the case here) and a "buf" binding (see
1087 	 * below).  The "bind" struct is later passed to hermon_mr_mem_bind()
1088 	 * which does most of the "heavy lifting" for the Hermon memory
1089 	 * registration (and reregistration) routines.
1090 	 */
1091 	bind.bi_type  = HERMON_BINDHDL_VADDR;
1092 	bind.bi_addr  = mr_attr->mr_vaddr;
1093 	bind.bi_len   = mr_attr->mr_len;
1094 	bind.bi_as    = mr_attr->mr_as;
1095 	bind.bi_flags = mr_attr->mr_flags;
1096 	status = hermon_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
1097 	return (status);
1098 }
1099 
1100 
1101 /*
1102  * hermon_mr_reregister_buf()
1103  *    Context: Can be called from interrupt or base context.
1104  */
1105 int
1106 hermon_mr_reregister_buf(hermon_state_t *state, hermon_mrhdl_t mr,
1107     hermon_pdhdl_t pd, ibt_smr_attr_t *mr_attr, struct buf *buf,
1108     hermon_mrhdl_t *mrhdl_new, hermon_mr_options_t *op)
1109 {
1110 	hermon_bind_info_t	bind;
1111 	int			status;
1112 
1113 	/*
1114 	 * Fill in the "bind" struct.  This struct provides the majority
1115 	 * of the information that will be used to distinguish between an
1116 	 * "addr" binding (see above) and a "buf" binding (as is the case
1117 	 * here).  The "bind" struct is later passed to hermon_mr_mem_bind()
1118 	 * which does most of the "heavy lifting" for the Hermon memory
1119 	 * registration routines.  Note: We have chosen to provide
1120 	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
1121 	 * not set).  It is not critical what value we choose here as it need
1122 	 * only be unique for the given RKey (which will happen by default),
1123 	 * so the choice here is somewhat arbitrary.
1124 	 */
1125 	bind.bi_type  = HERMON_BINDHDL_BUF;
1126 	bind.bi_buf   = buf;
1127 	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
1128 		bind.bi_addr  = mr_attr->mr_vaddr;
1129 	} else {
1130 		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
1131 	}
1132 	bind.bi_len   = (uint64_t)buf->b_bcount;
1133 	bind.bi_flags = mr_attr->mr_flags;
1134 	bind.bi_as    = NULL;
1135 	status = hermon_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
1136 	return (status);
1137 }
1138 
1139 
1140 /*
1141  * hermon_mr_sync()
1142  *    Context: Can be called from interrupt or base context.
1143  */
1144 /* ARGSUSED */
1145 int
1146 hermon_mr_sync(hermon_state_t *state, ibt_mr_sync_t *mr_segs, size_t num_segs)
1147 {
1148 	hermon_mrhdl_t		mrhdl;
1149 	uint64_t		seg_vaddr, seg_len, seg_end;
1150 	uint64_t		mr_start, mr_end;
1151 	uint_t			type;
1152 	int			status, i;
1153 
1154 	/* Process each of the ibt_mr_sync_t's */
1155 	for (i = 0; i < num_segs; i++) {
1156 		mrhdl = (hermon_mrhdl_t)mr_segs[i].ms_handle;
1157 
1158 		/* Check for valid memory region handle */
1159 		if (mrhdl == NULL) {
1160 			status = IBT_MR_HDL_INVALID;
1161 			goto mrsync_fail;
1162 		}
1163 
1164 		mutex_enter(&mrhdl->mr_lock);
1165 
1166 		/*
1167 		 * Check here to see if the memory region has already been
1168 		 * partially deregistered as a result of a
1169 		 * hermon_umap_umemlock_cb() callback.  If so, this is an
1170 		 * error, return failure.
1171 		 */
1172 		if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
1173 			mutex_exit(&mrhdl->mr_lock);
1174 			status = IBT_MR_HDL_INVALID;
1175 			goto mrsync_fail;
1176 		}
1177 
1178 		/* Check for valid bounds on sync request */
1179 		seg_vaddr = mr_segs[i].ms_vaddr;
1180 		seg_len	  = mr_segs[i].ms_len;
1181 		seg_end	  = seg_vaddr + seg_len - 1;
1182 		mr_start  = mrhdl->mr_bindinfo.bi_addr;
1183 		mr_end	  = mr_start + mrhdl->mr_bindinfo.bi_len - 1;
1184 		if ((seg_vaddr < mr_start) || (seg_vaddr > mr_end)) {
1185 			mutex_exit(&mrhdl->mr_lock);
1186 			status = IBT_MR_VA_INVALID;
1187 			goto mrsync_fail;
1188 		}
1189 		if ((seg_end < mr_start) || (seg_end > mr_end)) {
1190 			mutex_exit(&mrhdl->mr_lock);
1191 			status = IBT_MR_LEN_INVALID;
1192 			goto mrsync_fail;
1193 		}
1194 
1195 		/* Determine what type (i.e. direction) for sync */
1196 		if (mr_segs[i].ms_flags & IBT_SYNC_READ) {
1197 			type = DDI_DMA_SYNC_FORDEV;
1198 		} else if (mr_segs[i].ms_flags & IBT_SYNC_WRITE) {
1199 			type = DDI_DMA_SYNC_FORCPU;
1200 		} else {
1201 			mutex_exit(&mrhdl->mr_lock);
1202 			status = IBT_INVALID_PARAM;
1203 			goto mrsync_fail;
1204 		}
1205 
1206 		(void) ddi_dma_sync(mrhdl->mr_bindinfo.bi_dmahdl,
1207 		    (off_t)(seg_vaddr - mr_start), (size_t)seg_len, type);
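
		/*
		 * Worked example (illustrative values only): for a region
		 * bound at mr_start 0x10000, a sync request with seg_vaddr
		 * 0x10800 and seg_len 0x200 syncs 0x200 bytes at DMA
		 * handle offset 0x800.
		 */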
1208 
1209 		mutex_exit(&mrhdl->mr_lock);
1210 	}
1211 
1212 	return (DDI_SUCCESS);
1213 
1214 mrsync_fail:
1215 	return (status);
1216 }
1217 
1218 
1219 /*
1220  * hermon_mw_alloc()
1221  *    Context: Can be called from interrupt or base context.
1222  */
1223 int
1224 hermon_mw_alloc(hermon_state_t *state, hermon_pdhdl_t pd, ibt_mw_flags_t flags,
1225     hermon_mwhdl_t *mwhdl)
1226 {
1227 	hermon_rsrc_t		*mpt, *rsrc;
1228 	hermon_hw_dmpt_t		mpt_entry;
1229 	hermon_mwhdl_t		mw;
1230 	uint_t			sleep;
1231 	int			status;
1232 
	/*
	 * XXX - this test is always TRUE, so memory window allocation
	 * always fails here; it intentionally disables the code below
	 * until memory window support is enabled in this driver.
	 */
	if (state != NULL)
		return (IBT_INSUFF_RESOURCE);
1235 
1236 	/*
1237 	 * Check the sleep flag.  Ensure that it is consistent with the
1238 	 * current thread context (i.e. if we are currently in the interrupt
1239 	 * context, then we shouldn't be attempting to sleep).
1240 	 */
1241 	sleep = (flags & IBT_MW_NOSLEEP) ? HERMON_NOSLEEP : HERMON_SLEEP;
1242 	if ((sleep == HERMON_SLEEP) &&
1243 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1244 		status = IBT_INVALID_PARAM;
1245 		goto mwalloc_fail;
1246 	}
1247 
1248 	/* Increment the reference count on the protection domain (PD) */
1249 	hermon_pd_refcnt_inc(pd);
1250 
1251 	/*
1252 	 * Allocate an MPT entry (for use as a memory window).  Since the
1253 	 * Hermon hardware uses the MPT entry for memory regions and for
1254 	 * memory windows, we will fill in this MPT with all the necessary
1255 	 * parameters for the memory window.  And then (just as we do for
1256 	 * memory regions) ownership will be passed to the hardware in the
1257 	 * final step below.  If we fail here, we must undo the protection
1258 	 * domain reference count.
1259 	 */
1260 	status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
1261 	if (status != DDI_SUCCESS) {
1262 		status = IBT_INSUFF_RESOURCE;
1263 		goto mwalloc_fail1;
1264 	}
1265 
1266 	/*
1267 	 * Allocate the software structure for tracking the memory window (i.e.
1268 	 * the Hermon Memory Window handle).  Note: This is actually the same
1269 	 * software structure used for tracking memory regions, but since many
1270 	 * of the same properties are needed, only a single structure is
1271 	 * necessary.  If we fail here, we must undo the protection domain
1272 	 * reference count and the previous resource allocation.
1273 	 */
1274 	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
1275 	if (status != DDI_SUCCESS) {
1276 		status = IBT_INSUFF_RESOURCE;
1277 		goto mwalloc_fail2;
1278 	}
1279 	mw = (hermon_mwhdl_t)rsrc->hr_addr;
1280 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))
1281 
1282 	/*
1283 	 * Calculate an "unbound" RKey from MPT index.  In much the same way
1284 	 * as we do for memory regions (above), this key is constructed from
	 * a "constrained" portion (which depends on the MPT index) and an
	 * "unconstrained" portion (which may be arbitrarily chosen).
1287 	 */
1288 	mw->mr_rkey = hermon_mr_keycalc(mpt->hr_indx);
1289 
1290 	/*
1291 	 * Fill in the MPT entry.  This is the final step before passing
1292 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
1293 	 * the information collected/calculated above to fill in the
1294 	 * requisite portions of the MPT.  Note: fewer entries in the MPT
1295 	 * entry are necessary to allocate a memory window.
1296 	 */
1297 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
1298 	mpt_entry.reg_win	= HERMON_MPT_IS_WINDOW;
1299 	mpt_entry.mem_key	= mw->mr_rkey;
1300 	mpt_entry.pd		= pd->pd_pdnum;
1301 	mpt_entry.lr		= 1;
1302 
1303 	/*
1304 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
1305 	 * the entry to the hardware.  Note: in general, this operation
1306 	 * shouldn't fail.  But if it does, we have to undo everything we've
1307 	 * done above before returning error.
1308 	 */
1309 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1310 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
1311 	if (status != HERMON_CMD_SUCCESS) {
1312 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
1313 		    status);
1314 		if (status == HERMON_CMD_INVALID_STATUS) {
1315 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1316 		}
1317 		status = ibc_get_ci_failure(0);
1318 		goto mwalloc_fail3;
1319 	}
1320 
1321 	/*
1322 	 * Fill in the rest of the Hermon Memory Window handle.  Having
1323 	 * successfully transferred ownership of the MPT, we can update the
1324 	 * following fields for use in further operations on the MW.
1325 	 */
1326 	mw->mr_mptrsrcp	= mpt;
1327 	mw->mr_pdhdl	= pd;
1328 	mw->mr_rsrcp	= rsrc;
1329 	mw->mr_rkey	= hermon_mr_key_swap(mw->mr_rkey);
1330 	*mwhdl = mw;
1331 
1332 	return (DDI_SUCCESS);
1333 
1334 mwalloc_fail3:
1335 	hermon_rsrc_free(state, &rsrc);
1336 mwalloc_fail2:
1337 	hermon_rsrc_free(state, &mpt);
1338 mwalloc_fail1:
1339 	hermon_pd_refcnt_dec(pd);
1340 mwalloc_fail:
1341 	return (status);
1342 }
1343 
1344 
1345 /*
1346  * hermon_mw_free()
1347  *    Context: Can be called from interrupt or base context.
1348  */
1349 int
1350 hermon_mw_free(hermon_state_t *state, hermon_mwhdl_t *mwhdl, uint_t sleep)
1351 {
1352 	hermon_rsrc_t		*mpt, *rsrc;
1353 	hermon_mwhdl_t		mw;
1354 	int			status;
1355 	hermon_pdhdl_t		pd;
1356 
1357 	/*
1358 	 * Check the sleep flag.  Ensure that it is consistent with the
1359 	 * current thread context (i.e. if we are currently in the interrupt
1360 	 * context, then we shouldn't be attempting to sleep).
1361 	 */
1362 	if ((sleep == HERMON_SLEEP) &&
1363 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1364 		status = IBT_INVALID_PARAM;
1365 		return (status);
1366 	}
1367 
1368 	/*
1369 	 * Pull all the necessary information from the Hermon Memory Window
1370 	 * handle.  This is necessary here because the resource for the
	 * MW handle is going to be freed up as part of this operation.
1372 	 */
1373 	mw	= *mwhdl;
1374 	mutex_enter(&mw->mr_lock);
1375 	mpt	= mw->mr_mptrsrcp;
1376 	rsrc	= mw->mr_rsrcp;
1377 	pd	= mw->mr_pdhdl;
1378 	mutex_exit(&mw->mr_lock);
1379 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))
1380 
1381 	/*
1382 	 * Reclaim the MPT entry from hardware.  Note: in general, it is
1383 	 * unexpected for this operation to return an error.
1384 	 */
1385 	status = hermon_cmn_ownership_cmd_post(state, HW2SW_MPT, NULL,
1386 	    0, mpt->hr_indx, sleep);
1387 	if (status != HERMON_CMD_SUCCESS) {
1388 		cmn_err(CE_CONT, "Hermon: HW2SW_MPT command failed: %08x\n",
1389 		    status);
1390 		if (status == HERMON_CMD_INVALID_STATUS) {
1391 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1392 		}
1393 		return (ibc_get_ci_failure(0));
1394 	}
1395 
1396 	/* Free the Hermon Memory Window handle */
1397 	hermon_rsrc_free(state, &rsrc);
1398 
1399 	/* Free up the MPT entry resource */
1400 	hermon_rsrc_free(state, &mpt);
1401 
1402 	/* Decrement the reference count on the protection domain (PD) */
1403 	hermon_pd_refcnt_dec(pd);
1404 
1405 	/* Set the mwhdl pointer to NULL and return success */
1406 	*mwhdl = NULL;
1407 
1408 	return (DDI_SUCCESS);
1409 }
1410 
1411 
1412 /*
1413  * hermon_mr_keycalc()
1414  *    Context: Can be called from interrupt or base context.
1415  *    NOTE:  Produces a key in the form of
 *		KKKKKKKK IIIIIIII IIIIIIII IIIIIIII
1417  *    where K == the arbitrary bits and I == the index
1418  */
1419 uint32_t
1420 hermon_mr_keycalc(uint32_t indx)
1421 {
1422 	uint32_t tmp_key, tmp_indx;
1423 
1424 	/*
1425 	 * Generate a simple key from counter.  Note:  We increment this
1426 	 * static variable _intentionally_ without any kind of mutex around
1427 	 * it.  First, single-threading all operations through a single lock
1428 	 * would be a bad idea (from a performance point-of-view).  Second,
1429 	 * the upper "unconstrained" bits don't really have to be unique
1430 	 * because the lower bits are guaranteed to be (although we do make a
1431 	 * best effort to ensure that they are).  Third, the window for the
1432 	 * race (where both threads read and update the counter at the same
1433 	 * time) is incredibly small.
	 * And, lastly, we would eventually like to make the
	 * unconstrained bits of the key more random anyway.
1435 	 */
1436 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(hermon_memkey_cnt))
1437 	tmp_key = (hermon_memkey_cnt++) << HERMON_MEMKEY_SHIFT;
1438 	tmp_indx = indx & 0xffffff;
1439 	return (tmp_key | tmp_indx);
1440 }
1441 
1442 
1443 /*
1444  * hermon_mr_key_swap()
1445  *    Context: Can be called from interrupt or base context.
1446  *    NOTE:  Produces a key in the form of
 *		IIIIIIII IIIIIIII IIIIIIII KKKKKKKK
1448  *    where K == the arbitrary bits and I == the index
1449  */
1450 uint32_t
1451 hermon_mr_key_swap(uint32_t indx)
1452 {
1453 	/*
1454 	 * The memory key format to pass down to the hardware is
1455 	 * (key[7:0],index[23:0]), which defines the index to the
1456 	 * hardware resource. When the driver passes this as a memory
1457 	 * key, (i.e. to retrieve a resource) the format is
1458 	 * (index[23:0],key[7:0]).
1459 	 */
1460 	return (((indx >> 24) & 0x000000ff) | ((indx << 8) & 0xffffff00));
1461 }
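
/*
 * Worked example for the two routines above (illustrative values only):
 * for MPT index 0x000012 with the key counter at 0x5, hermon_mr_keycalc()
 * returns (0x5 << 24) | 0x000012 = 0x05000012, i.e. key[7:0] in the top
 * byte.  hermon_mr_key_swap() then rotates this to 0x00001205, the
 * (index[23:0],key[7:0]) form in which the key is handed to consumers.
 */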
1462 
1463 /*
1464  * hermon_mr_common_reg()
1465  *    Context: Can be called from interrupt or base context.
1466  */
1467 static int
1468 hermon_mr_common_reg(hermon_state_t *state, hermon_pdhdl_t pd,
1469     hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl, hermon_mr_options_t *op,
1470     hermon_mpt_rsrc_type_t mpt_type)
1471 {
1472 	hermon_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
1473 	hermon_umap_db_entry_t	*umapdb;
1474 	hermon_sw_refcnt_t	*swrc_tmp;
1475 	hermon_hw_dmpt_t	mpt_entry;
1476 	hermon_mrhdl_t		mr;
1477 	ibt_mr_flags_t		flags;
1478 	hermon_bind_info_t	*bh;
1479 	ddi_dma_handle_t	bind_dmahdl;
1480 	ddi_umem_cookie_t	umem_cookie;
1481 	size_t			umem_len;
1482 	caddr_t			umem_addr;
1483 	uint64_t		mtt_addr, max_sz;
1484 	uint_t			sleep, mtt_pgsize_bits, bind_type, mr_is_umem;
1485 	int			status, umem_flags, bind_override_addr;
1486 
1487 	/*
1488 	 * Check the "options" flag.  Currently this flag tells the driver
1489 	 * whether or not the region should be bound normally (i.e. with
1490 	 * entries written into the PCI IOMMU), whether it should be
1491 	 * registered to bypass the IOMMU, and whether or not the resulting
1492 	 * address should be "zero-based" (to aid the alignment restrictions
1493 	 * for QPs).
1494 	 */
1495 	if (op == NULL) {
1496 		bind_type   = HERMON_BINDMEM_NORMAL;
1497 		bind_dmahdl = NULL;
1498 		bind_override_addr = 0;
1499 	} else {
1500 		bind_type	   = op->mro_bind_type;
1501 		bind_dmahdl	   = op->mro_bind_dmahdl;
1502 		bind_override_addr = op->mro_bind_override_addr;
1503 	}
1504 
	/* Note: the "mpt_type" arg determines whether a dMPT is allocated */
1506 
1507 	/* Extract the flags field from the hermon_bind_info_t */
1508 	flags = bind->bi_flags;
1509 
1510 	/*
1511 	 * Check for invalid length.  Check is the length is zero or if the
1512 	 * length is larger than the maximum configured value.  Return error
1513 	 * if it is.
1514 	 */
1515 	max_sz = ((uint64_t)1 << state->hs_cfg_profile->cp_log_max_mrw_sz);
1516 	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
1517 		status = IBT_MR_LEN_INVALID;
1518 		goto mrcommon_fail;
1519 	}
1520 
1521 	/*
1522 	 * Check the sleep flag.  Ensure that it is consistent with the
1523 	 * current thread context (i.e. if we are currently in the interrupt
1524 	 * context, then we shouldn't be attempting to sleep).
1525 	 */
	sleep = (flags & IBT_MR_NOSLEEP) ? HERMON_NOSLEEP : HERMON_SLEEP;
1527 	if ((sleep == HERMON_SLEEP) &&
1528 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1529 		status = IBT_INVALID_PARAM;
1530 		goto mrcommon_fail;
1531 	}
1532 
1533 	/* Increment the reference count on the protection domain (PD) */
1534 	hermon_pd_refcnt_inc(pd);
1535 
1536 	/*
1537 	 * Allocate an MPT entry.  This will be filled in with all the
1538 	 * necessary parameters to define the memory region.  And then
1539 	 * ownership will be passed to the hardware in the final step
1540 	 * below.  If we fail here, we must undo the protection domain
1541 	 * reference count.
1542 	 */
1543 	if (mpt_type == HERMON_MPT_DMPT) {
1544 		status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
1545 		if (status != DDI_SUCCESS) {
1546 			status = IBT_INSUFF_RESOURCE;
1547 			goto mrcommon_fail1;
1548 		}
1549 	} else {
1550 		mpt = NULL;
1551 	}
1552 
1553 	/*
1554 	 * Allocate the software structure for tracking the memory region (i.e.
1555 	 * the Hermon Memory Region handle).  If we fail here, we must undo
1556 	 * the protection domain reference count and the previous resource
1557 	 * allocation.
1558 	 */
1559 	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
1560 	if (status != DDI_SUCCESS) {
1561 		status = IBT_INSUFF_RESOURCE;
1562 		goto mrcommon_fail2;
1563 	}
1564 	mr = (hermon_mrhdl_t)rsrc->hr_addr;
1565 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
1566 
1567 	/*
1568 	 * Setup and validate the memory region access flags.  This means
1569 	 * translating the IBTF's enable flags into the access flags that
1570 	 * will be used in later operations.
1571 	 */
1572 	mr->mr_accflag = 0;
1573 	if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1574 		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
1575 	if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1576 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
1577 	if (flags & IBT_MR_ENABLE_REMOTE_READ)
1578 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
1579 	if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1580 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
1581 	if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1582 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
1583 
1584 	/*
1585 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
1586 	 * from a certain number of "constrained" bits (the least significant
1587 	 * bits) and some number of "unconstrained" bits.  The constrained
1588 	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not
	 * filled in.  Otherwise both the RKey and LKey are given the same
	 * value.
1592 	 */
	if (mpt != NULL)
1594 		mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
1595 
1596 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1597 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1598 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1599 		mr->mr_rkey = mr->mr_lkey;
1600 	}
1601 
1602 	/*
1603 	 * Determine if the memory is from userland and pin the pages
1604 	 * with umem_lockmemory() if necessary.
1605 	 * Then, if this is userland memory, allocate an entry in the
1606 	 * "userland resources database".  This will later be added to
1607 	 * the database (after all further memory registration operations are
1608 	 * successful).  If we fail here, we must undo the reference counts
1609 	 * and the previous resource allocations.
1610 	 */
1611 	mr_is_umem = (((bind->bi_as != NULL) && (bind->bi_as != &kas)) ? 1 : 0);
1612 	if (mr_is_umem) {
1613 		umem_len   = ptob(btopr(bind->bi_len +
1614 		    ((uintptr_t)bind->bi_addr & PAGEOFFSET)));
1615 		umem_addr  = (caddr_t)((uintptr_t)bind->bi_addr & ~PAGEOFFSET);
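		/*
		 * Worked example of the rounding above, assuming 4KB pages:
		 * bi_addr = 0x12234 and bi_len = 0x2000 give
		 * umem_addr = 0x12000 and umem_len = ptob(btopr(0x2234)) =
		 * 0x3000, i.e. the three pages the region touches.
		 */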
1616 		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
1617 		    DDI_UMEMLOCK_LONGTERM);
1618 		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
1619 		    &umem_cookie, &hermon_umem_cbops, NULL);
1620 		if (status != 0) {
1621 			status = IBT_INSUFF_RESOURCE;
1622 			goto mrcommon_fail3;
1623 		}
1624 
1625 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1626 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1627 
1628 		bind->bi_buf = ddi_umem_iosetup(umem_cookie, 0, umem_len,
1629 		    B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
1630 		if (bind->bi_buf == NULL) {
1631 			status = IBT_INSUFF_RESOURCE;
1632 			goto mrcommon_fail3;
1633 		}
1634 		bind->bi_type = HERMON_BINDHDL_UBUF;
1635 		bind->bi_buf->b_flags |= B_READ;
1636 
1637 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1638 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1639 
1640 		umapdb = hermon_umap_db_alloc(state->hs_instance,
1641 		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
1642 		    (uint64_t)(uintptr_t)rsrc);
1643 		if (umapdb == NULL) {
1644 			status = IBT_INSUFF_RESOURCE;
1645 			goto mrcommon_fail4;
1646 		}
1647 	}
1648 
1649 	/*
1650 	 * Setup the bindinfo for the mtt bind call
1651 	 */
1652 	bh = &mr->mr_bindinfo;
1653 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bh))
1654 	bcopy(bind, bh, sizeof (hermon_bind_info_t));
1655 	bh->bi_bypass = bind_type;
1656 	status = hermon_mr_mtt_bind(state, bh, bind_dmahdl, &mtt,
1657 	    &mtt_pgsize_bits, mpt != NULL);
1658 	if (status != DDI_SUCCESS) {
1659 		/*
1660 		 * When mtt_bind fails, freerbuf has already been done,
1661 		 * so make sure not to call it again.
1662 		 */
1663 		bind->bi_type = bh->bi_type;
1664 		goto mrcommon_fail5;
1665 	}
1666 	mr->mr_logmttpgsz = mtt_pgsize_bits;
1667 
1668 	/*
1669 	 * Allocate MTT reference count (to track shared memory regions).
1670 	 * This reference count resource may never be used on the given
1671 	 * memory region, but if it is ever later registered as "shared"
1672 	 * memory region then this resource will be necessary.  If we fail
1673 	 * here, we do pretty much the same as above to clean up.
1674 	 */
1675 	status = hermon_rsrc_alloc(state, HERMON_REFCNT, 1, sleep,
1676 	    &mtt_refcnt);
1677 	if (status != DDI_SUCCESS) {
1678 		status = IBT_INSUFF_RESOURCE;
1679 		goto mrcommon_fail6;
1680 	}
1681 	mr->mr_mttrefcntp = mtt_refcnt;
1682 	swrc_tmp = (hermon_sw_refcnt_t *)mtt_refcnt->hr_addr;
1683 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_tmp))
1684 	HERMON_MTT_REFCNT_INIT(swrc_tmp);
1685 
	mtt_addr = ((uint64_t)mtt->hr_indx << HERMON_MTT_SIZE_SHIFT);
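	/*
	 * (The MTT address above is just the entry index scaled by the
	 * MTT entry size - 8 bytes, assuming HERMON_MTT_SIZE_SHIFT is 3 -
	 * so, e.g., an hr_indx of 0x100 yields an mtt_addr of 0x800.)
	 */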
1687 
1688 	/*
1689 	 * Fill in the MPT entry.  This is the final step before passing
1690 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
1691 	 * the information collected/calculated above to fill in the
1692 	 * requisite portions of the MPT.  Do this ONLY for DMPTs.
1693 	 */
1694 	if (mpt == NULL)
1695 		goto no_passown;
1696 
1697 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
1698 
1699 	mpt_entry.status  = HERMON_MPT_SW_OWNERSHIP;
1700 	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1701 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1702 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1703 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
1704 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1705 	mpt_entry.lr	  = 1;
1706 	mpt_entry.phys_addr = 0;
1707 	mpt_entry.reg_win = HERMON_MPT_IS_REGION;
1708 
1709 	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
1710 	mpt_entry.mem_key	= mr->mr_lkey;
1711 	mpt_entry.pd		= pd->pd_pdnum;
1712 	mpt_entry.rem_acc_en = 0;
1713 	mpt_entry.fast_reg_en = 0;
1714 	mpt_entry.en_inval = 0;
1715 	mpt_entry.lkey = 0;
1716 	mpt_entry.win_cnt = 0;
1717 
1718 	if (bind_override_addr == 0) {
1719 		mpt_entry.start_addr = bh->bi_addr;
1720 	} else {
		bh->bi_addr = bh->bi_addr &
		    (((uint64_t)1 << mr->mr_logmttpgsz) - 1);
1722 		mpt_entry.start_addr = bh->bi_addr;
1723 	}
1724 	mpt_entry.reg_win_len	= bh->bi_len;
1725 
1726 	mpt_entry.mtt_addr_h = mtt_addr >> 32;  /* only 8 more bits */
1727 	mpt_entry.mtt_addr_l = mtt_addr >> 3;	/* only 29 bits */
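
	/*
	 * Because mtt_addr is 8-byte aligned, its low three bits are
	 * always zero: the 29-bit mtt_addr_l field carries bits 31:3 of
	 * the address, and mtt_addr_h carries the bits above bit 31.
	 */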
1728 
1729 	/*
1730 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
1731 	 * the entry to the hardware if needed.  Note: in general, this
1732 	 * operation shouldn't fail.  But if it does, we have to undo
1733 	 * everything we've done above before returning error.
1734 	 *
	 * For Hermon, this routine (which is common to all MPT contexts)
	 * only sets the ownership if needed; the process of passing the
	 * context itself to the hardware takes care of setting up the MPT
	 * (based on type and index).
1739 	 */
1740 
1741 	mpt_entry.bnd_qp = 0;	/* dMPT for a qp, check for window */
1742 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1743 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
1744 	if (status != HERMON_CMD_SUCCESS) {
1745 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
1746 		    status);
1747 		if (status == HERMON_CMD_INVALID_STATUS) {
1748 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1749 		}
1750 		status = ibc_get_ci_failure(0);
1751 		goto mrcommon_fail7;
1752 	}
1753 no_passown:
1754 
1755 	/*
	 * Fill in the rest of the Hermon Memory Region handle.  Having
	 * successfully transferred ownership of the MPT (where one was
	 * required), we can update the following fields for use in further
	 * operations on the MR.
1759 	 */
1760 	mr->mr_mttaddr	   = mtt_addr;
1761 
1762 	mr->mr_log2_pgsz   = (mr->mr_logmttpgsz - HERMON_PAGESHIFT);
1763 	mr->mr_mptrsrcp	   = mpt;
1764 	mr->mr_mttrsrcp	   = mtt;
1765 	mr->mr_pdhdl	   = pd;
1766 	mr->mr_rsrcp	   = rsrc;
1767 	mr->mr_is_umem	   = mr_is_umem;
1768 	mr->mr_is_fmr	   = 0;
1769 	mr->mr_umemcookie  = (mr_is_umem != 0) ? umem_cookie : NULL;
1770 	mr->mr_umem_cbfunc = NULL;
1771 	mr->mr_umem_cbarg1 = NULL;
1772 	mr->mr_umem_cbarg2 = NULL;
1773 	mr->mr_lkey	   = hermon_mr_key_swap(mr->mr_lkey);
1774 	mr->mr_rkey	   = hermon_mr_key_swap(mr->mr_rkey);
1775 	mr->mr_mpt_type	   = mpt_type;
1776 
1777 	/*
1778 	 * If this is userland memory, then we need to insert the previously
1779 	 * allocated entry into the "userland resources database".  This will
1780 	 * allow for later coordination between the hermon_umap_umemlock_cb()
1781 	 * callback and hermon_mr_deregister().
1782 	 */
1783 	if (mr_is_umem) {
1784 		hermon_umap_db_add(umapdb);
1785 	}
1786 
1787 	*mrhdl = mr;
1788 
1789 	return (DDI_SUCCESS);
1790 
1791 /*
1792  * The following is cleanup for all possible failure cases in this routine
1793  */
1794 mrcommon_fail7:
1795 	hermon_rsrc_free(state, &mtt_refcnt);
1796 mrcommon_fail6:
1797 	hermon_mr_mem_unbind(state, bh);
1798 	bind->bi_type = bh->bi_type;
1799 mrcommon_fail5:
1800 	if (mr_is_umem) {
1801 		hermon_umap_db_free(umapdb);
1802 	}
1803 mrcommon_fail4:
1804 	if (mr_is_umem) {
1805 		/*
1806 		 * Free up the memory ddi_umem_iosetup() allocates
1807 		 * internally.
1808 		 */
1809 		if (bind->bi_type == HERMON_BINDHDL_UBUF) {
1810 			freerbuf(bind->bi_buf);
1811 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1812 			bind->bi_type = HERMON_BINDHDL_NONE;
1813 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1814 		}
1815 		ddi_umem_unlock(umem_cookie);
1816 	}
1817 mrcommon_fail3:
1818 	hermon_rsrc_free(state, &rsrc);
1819 mrcommon_fail2:
1820 	if (mpt != NULL)
1821 		hermon_rsrc_free(state, &mpt);
1822 mrcommon_fail1:
1823 	hermon_pd_refcnt_dec(pd);
1824 mrcommon_fail:
1825 	return (status);
1826 }
1827 
1828 /*
1829  * hermon_mr_mtt_bind()
1830  *    Context: Can be called from interrupt or base context.
1831  */
1832 int
1833 hermon_mr_mtt_bind(hermon_state_t *state, hermon_bind_info_t *bind,
1834     ddi_dma_handle_t bind_dmahdl, hermon_rsrc_t **mtt, uint_t *mtt_pgsize_bits,
1835     uint_t is_buffer)
1836 {
1837 	uint64_t		nummtt;
1838 	uint_t			sleep;
1839 	int			status;
1840 
1841 	/*
1842 	 * Check the sleep flag.  Ensure that it is consistent with the
1843 	 * current thread context (i.e. if we are currently in the interrupt
1844 	 * context, then we shouldn't be attempting to sleep).
1845 	 */
1846 	sleep = (bind->bi_flags & IBT_MR_NOSLEEP) ?
1847 	    HERMON_NOSLEEP : HERMON_SLEEP;
1848 	if ((sleep == HERMON_SLEEP) &&
1849 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1850 		status = IBT_INVALID_PARAM;
1851 		goto mrmttbind_fail;
1852 	}
1853 
1854 	/*
1855 	 * Bind the memory and determine the mapped addresses.  This is
1856 	 * the first of two routines that do all the "heavy lifting" for
1857 	 * the Hermon memory registration routines.  The hermon_mr_mem_bind()
1858 	 * routine takes the "bind" struct with all its fields filled
1859 	 * in and returns a list of DMA cookies (for the PCI mapped addresses
1860 	 * corresponding to the specified address region) which are used by
1861 	 * the hermon_mr_fast_mtt_write() routine below.  If we fail here, we
1862 	 * must undo all the previous resource allocation (and PD reference
1863 	 * count).
1864 	 */
1865 	status = hermon_mr_mem_bind(state, bind, bind_dmahdl, sleep, is_buffer);
1866 	if (status != DDI_SUCCESS) {
1867 		status = IBT_INSUFF_RESOURCE;
1868 		goto mrmttbind_fail;
1869 	}
1870 
1871 	/*
1872 	 * Determine number of pages spanned.  This routine uses the
1873 	 * information in the "bind" struct to determine the required
1874 	 * number of MTT entries needed (and returns the suggested page size -
1875 	 * as a "power-of-2" - for each MTT entry).
1876 	 */
1877 	nummtt = hermon_mr_nummtt_needed(state, bind, mtt_pgsize_bits);
1878 
1879 	/*
1880 	 * Allocate the MTT entries.  Use the calculations performed above to
1881 	 * allocate the required number of MTT entries. If we fail here, we
1882 	 * must not only undo all the previous resource allocation (and PD
1883 	 * reference count), but we must also unbind the memory.
1884 	 */
1885 	status = hermon_rsrc_alloc(state, HERMON_MTT, nummtt, sleep, mtt);
1886 	if (status != DDI_SUCCESS) {
1887 		status = IBT_INSUFF_RESOURCE;
1888 		goto mrmttbind_fail2;
1889 	}
1890 
1891 	/*
1892 	 * Write the mapped addresses into the MTT entries.  This is part two
1893 	 * of the "heavy lifting" routines that we talked about above.  Note:
1894 	 * we pass the suggested page size from the earlier operation here.
1895 	 * And if we fail here, we again do pretty much the same huge clean up.
1896 	 */
1897 	status = hermon_mr_fast_mtt_write(state, *mtt, bind, *mtt_pgsize_bits);
1898 	if (status != DDI_SUCCESS) {
1899 		/*
1900 		 * hermon_mr_fast_mtt_write() returns DDI_FAILURE
1901 		 * only if it detects a HW error during DMA.
1902 		 */
1903 		hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1904 		status = ibc_get_ci_failure(0);
1905 		goto mrmttbind_fail3;
1906 	}
1907 	return (DDI_SUCCESS);
1908 
1909 /*
1910  * The following is cleanup for all possible failure cases in this routine
1911  */
1912 mrmttbind_fail3:
1913 	hermon_rsrc_free(state, mtt);
1914 mrmttbind_fail2:
1915 	hermon_mr_mem_unbind(state, bind);
1916 mrmttbind_fail:
1917 	return (status);
1918 }
1919 
1920 
1921 /*
1922  * hermon_mr_mtt_unbind()
1923  *    Context: Can be called from interrupt or base context.
1924  */
1925 int
1926 hermon_mr_mtt_unbind(hermon_state_t *state, hermon_bind_info_t *bind,
1927     hermon_rsrc_t *mtt)
1928 {
1929 	/*
1930 	 * Free up the MTT entries and unbind the memory.  Here, as above, we
1931 	 * attempt to free these resources only if it is appropriate to do so.
1932 	 */
1933 	hermon_mr_mem_unbind(state, bind);
1934 	hermon_rsrc_free(state, &mtt);
1935 
1936 	return (DDI_SUCCESS);
1937 }
1938 
1939 
1940 /*
1941  * hermon_mr_common_rereg()
1942  *    Context: Can be called from interrupt or base context.
1943  */
1944 static int
1945 hermon_mr_common_rereg(hermon_state_t *state, hermon_mrhdl_t mr,
1946     hermon_pdhdl_t pd, hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl_new,
1947     hermon_mr_options_t *op)
1948 {
1949 	hermon_rsrc_t		*mpt;
1950 	ibt_mr_attr_flags_t	acc_flags_to_use;
1951 	ibt_mr_flags_t		flags;
1952 	hermon_pdhdl_t		pd_to_use;
1953 	hermon_hw_dmpt_t	mpt_entry;
1954 	uint64_t		mtt_addr_to_use, vaddr_to_use, len_to_use;
1955 	uint_t			sleep, dereg_level;
1956 	int			status;
1957 
1958 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1959 
1960 	/*
1961 	 * Check here to see if the memory region corresponds to a userland
1962 	 * mapping.  Reregistration of userland memory regions is not
1963 	 * currently supported.  Return failure.
1964 	 */
1965 	if (mr->mr_is_umem) {
1966 		status = IBT_MR_HDL_INVALID;
1967 		goto mrrereg_fail;
1968 	}
1969 
1970 	mutex_enter(&mr->mr_lock);
1971 
1972 	/* Pull MPT resource pointer from the Hermon Memory Region handle */
1973 	mpt = mr->mr_mptrsrcp;
1974 
1975 	/* Extract the flags field from the hermon_bind_info_t */
1976 	flags = bind->bi_flags;
1977 
1978 	/*
1979 	 * Check the sleep flag.  Ensure that it is consistent with the
1980 	 * current thread context (i.e. if we are currently in the interrupt
1981 	 * context, then we shouldn't be attempting to sleep).
1982 	 */
	sleep = (flags & IBT_MR_NOSLEEP) ? HERMON_NOSLEEP : HERMON_SLEEP;
1984 	if ((sleep == HERMON_SLEEP) &&
1985 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1986 		mutex_exit(&mr->mr_lock);
1987 		status = IBT_INVALID_PARAM;
1988 		goto mrrereg_fail;
1989 	}
1990 
1991 	/*
1992 	 * First step is to temporarily invalidate the MPT entry.  This
1993 	 * regains ownership from the hardware, and gives us the opportunity
1994 	 * to modify the entry.  Note: The HW2SW_MPT command returns the
1995 	 * current MPT entry contents.  These are saved away here because
	 * they will be reused in a later step below.  If the region has
	 * bound memory windows, then we fail, returning an "in use" error
	 * code.  Otherwise, any other failure is unexpected and we
	 * deregister the memory region and return error.
2000 	 *
	 * We always use HERMON_CMD_NOSLEEP_SPIN here because the mr_lock is
	 * held across this command and we must not sleep in any context.
2003 	 */
2004 	status = hermon_cmn_ownership_cmd_post(state, HW2SW_MPT, &mpt_entry,
2005 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, HERMON_CMD_NOSLEEP_SPIN);
2006 	if (status != HERMON_CMD_SUCCESS) {
2007 		mutex_exit(&mr->mr_lock);
2008 		if (status == HERMON_CMD_REG_BOUND) {
2009 			return (IBT_MR_IN_USE);
2010 		} else {
2011 			cmn_err(CE_CONT, "Hermon: HW2SW_MPT command failed: "
2012 			    "%08x\n", status);
2013 			if (status == HERMON_CMD_INVALID_STATUS) {
2014 				hermon_fm_ereport(state, HCA_SYS_ERR,
2015 				    HCA_ERR_SRV_LOST);
2016 			}
2017 			/*
2018 			 * Call deregister and ensure that all current
2019 			 * resources get freed up
2020 			 */
2021 			if (hermon_mr_deregister(state, &mr,
2022 			    HERMON_MR_DEREG_ALL, sleep) != DDI_SUCCESS) {
2023 				HERMON_WARNING(state, "failed to deregister "
2024 				    "memory region");
2025 			}
2026 			return (ibc_get_ci_failure(0));
2027 		}
2028 	}
2029 
2030 	/*
2031 	 * If we're changing the protection domain, then validate the new one
2032 	 */
2033 	if (flags & IBT_MR_CHANGE_PD) {
2034 
2035 		/* Check for valid PD handle pointer */
2036 		if (pd == NULL) {
2037 			mutex_exit(&mr->mr_lock);
2038 			/*
2039 			 * Call deregister and ensure that all current
2040 			 * resources get properly freed up. Unnecessary
2041 			 * here to attempt to regain software ownership
2042 			 * of the MPT entry as that has already been
2043 			 * done above.
2044 			 */
2045 			if (hermon_mr_deregister(state, &mr,
2046 			    HERMON_MR_DEREG_NO_HW2SW_MPT, sleep) !=
2047 			    DDI_SUCCESS) {
2048 				HERMON_WARNING(state, "failed to deregister "
2049 				    "memory region");
2050 			}
2051 			status = IBT_PD_HDL_INVALID;
2052 			goto mrrereg_fail;
2053 		}
2054 
2055 		/* Use the new PD handle in all operations below */
2056 		pd_to_use = pd;
2057 
2058 	} else {
2059 		/* Use the current PD handle in all operations below */
2060 		pd_to_use = mr->mr_pdhdl;
2061 	}
2062 
2063 	/*
2064 	 * If we're changing access permissions, then validate the new ones
2065 	 */
2066 	if (flags & IBT_MR_CHANGE_ACCESS) {
2067 		/*
2068 		 * Validate the access flags.  Both remote write and remote
2069 		 * atomic require the local write flag to be set
2070 		 */
2071 		if (((flags & IBT_MR_ENABLE_REMOTE_WRITE) ||
2072 		    (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)) &&
2073 		    !(flags & IBT_MR_ENABLE_LOCAL_WRITE)) {
2074 			mutex_exit(&mr->mr_lock);
2075 			/*
2076 			 * Call deregister and ensure that all current
2077 			 * resources get properly freed up. Unnecessary
2078 			 * here to attempt to regain software ownership
2079 			 * of the MPT entry as that has already been
2080 			 * done above.
2081 			 */
2082 			if (hermon_mr_deregister(state, &mr,
2083 			    HERMON_MR_DEREG_NO_HW2SW_MPT, sleep) !=
2084 			    DDI_SUCCESS) {
2085 				HERMON_WARNING(state, "failed to deregister "
2086 				    "memory region");
2087 			}
2088 			status = IBT_MR_ACCESS_REQ_INVALID;
2089 			goto mrrereg_fail;
2090 		}
2091 
2092 		/*
2093 		 * Setup and validate the memory region access flags.  This
2094 		 * means translating the IBTF's enable flags into the access
2095 		 * flags that will be used in later operations.
2096 		 */
2097 		acc_flags_to_use = 0;
2098 		if (flags & IBT_MR_ENABLE_WINDOW_BIND)
2099 			acc_flags_to_use |= IBT_MR_WINDOW_BIND;
2100 		if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
2101 			acc_flags_to_use |= IBT_MR_LOCAL_WRITE;
2102 		if (flags & IBT_MR_ENABLE_REMOTE_READ)
2103 			acc_flags_to_use |= IBT_MR_REMOTE_READ;
2104 		if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
2105 			acc_flags_to_use |= IBT_MR_REMOTE_WRITE;
2106 		if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
2107 			acc_flags_to_use |= IBT_MR_REMOTE_ATOMIC;
2108 
2109 	} else {
2110 		acc_flags_to_use = mr->mr_accflag;
2111 	}
2112 
2113 	/*
2114 	 * If we're modifying the translation, then figure out whether
2115 	 * we can reuse the current MTT resources.  This means calling
2116 	 * hermon_mr_rereg_xlat_helper() which does most of the heavy lifting
2117 	 * for the reregistration.  If the current memory region contains
2118 	 * sufficient MTT entries for the new regions, then it will be
2119 	 * reused and filled in.  Otherwise, new entries will be allocated,
2120 	 * the old ones will be freed, and the new entries will be filled
2121 	 * in.  Note:  If we're not modifying the translation, then we
2122 	 * should already have all the information we need to update the MPT.
2123 	 * Also note: If hermon_mr_rereg_xlat_helper() fails, it will return
2124 	 * a "dereg_level" which is the level of cleanup that needs to be
2125 	 * passed to hermon_mr_deregister() to finish the cleanup.
2126 	 */
2127 	if (flags & IBT_MR_CHANGE_TRANSLATION) {
2128 		status = hermon_mr_rereg_xlat_helper(state, mr, bind, op,
2129 		    &mtt_addr_to_use, sleep, &dereg_level);
2130 		if (status != DDI_SUCCESS) {
2131 			mutex_exit(&mr->mr_lock);
2132 			/*
2133 			 * Call deregister and ensure that all resources get
2134 			 * properly freed up.
2135 			 */
2136 			if (hermon_mr_deregister(state, &mr, dereg_level,
2137 			    sleep) != DDI_SUCCESS) {
2138 				HERMON_WARNING(state, "failed to deregister "
2139 				    "memory region");
2140 			}
2141 			goto mrrereg_fail;
2142 		}
2143 		vaddr_to_use = mr->mr_bindinfo.bi_addr;
2144 		len_to_use   = mr->mr_bindinfo.bi_len;
2145 	} else {
2146 		mtt_addr_to_use = mr->mr_mttaddr;
2147 		vaddr_to_use = mr->mr_bindinfo.bi_addr;
2148 		len_to_use   = mr->mr_bindinfo.bi_len;
2149 	}
2150 
2151 	/*
2152 	 * Calculate new keys (Lkey, Rkey) from MPT index.  Just like they were
2153 	 * when the region was first registered, each key is formed from
2154 	 * "constrained" bits and "unconstrained" bits.  Note:  If no remote
2155 	 * access is required, then the RKey value is not filled in.  Otherwise
2156 	 * both Rkey and LKey are given the same value.
2157 	 */
2158 	mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
2159 	if ((acc_flags_to_use & IBT_MR_REMOTE_READ) ||
2160 	    (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ||
2161 	    (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC)) {
2162 		mr->mr_rkey = mr->mr_lkey;
	} else {
		mr->mr_rkey = 0;
	}
2165 
2166 	/*
2167 	 * Fill in the MPT entry.  This is the final step before passing
2168 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
2169 	 * the information collected/calculated above to fill in the
2170 	 * requisite portions of the MPT.
2171 	 */
2172 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
2173 
2174 	mpt_entry.status  = HERMON_MPT_SW_OWNERSHIP;
2175 	mpt_entry.en_bind = (acc_flags_to_use & IBT_MR_WINDOW_BIND)   ? 1 : 0;
2176 	mpt_entry.atomic  = (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
2177 	mpt_entry.rw	  = (acc_flags_to_use & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
2178 	mpt_entry.rr	  = (acc_flags_to_use & IBT_MR_REMOTE_READ)   ? 1 : 0;
2179 	mpt_entry.lw	  = (acc_flags_to_use & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
2180 	mpt_entry.lr	  = 1;
2181 	mpt_entry.phys_addr = 0;
2182 	mpt_entry.reg_win = HERMON_MPT_IS_REGION;
2183 
2184 	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
2185 	mpt_entry.mem_key	= mr->mr_lkey;
2186 	mpt_entry.pd		= pd_to_use->pd_pdnum;
2187 
2188 	mpt_entry.start_addr	= vaddr_to_use;
2189 	mpt_entry.reg_win_len	= len_to_use;
2190 	mpt_entry.mtt_addr_h = mtt_addr_to_use >> 32;
2191 	mpt_entry.mtt_addr_l = mtt_addr_to_use >> 3;
2192 
2193 	/*
2194 	 * Write the updated MPT entry to hardware
2195 	 *
	 * We always use HERMON_CMD_NOSLEEP_SPIN here because the mr_lock is
	 * held across this command and we must not sleep in any context.
2198 	 */
2199 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
2200 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, HERMON_CMD_NOSLEEP_SPIN);
2201 	if (status != HERMON_CMD_SUCCESS) {
2202 		mutex_exit(&mr->mr_lock);
2203 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
2204 		    status);
2205 		if (status == HERMON_CMD_INVALID_STATUS) {
2206 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
2207 		}
2208 		/*
2209 		 * Call deregister and ensure that all current resources get
2210 		 * properly freed up. Unnecessary here to attempt to regain
2211 		 * software ownership of the MPT entry as that has already
2212 		 * been done above.
2213 		 */
2214 		if (hermon_mr_deregister(state, &mr,
2215 		    HERMON_MR_DEREG_NO_HW2SW_MPT, sleep) != DDI_SUCCESS) {
2216 			HERMON_WARNING(state, "failed to deregister memory "
2217 			    "region");
2218 		}
2219 		return (ibc_get_ci_failure(0));
2220 	}
2221 
2222 	/*
	 * If we're changing the PD, then update the reference counts now.
2224 	 * This means decrementing the reference count on the old PD and
2225 	 * incrementing the reference count on the new PD.
2226 	 */
2227 	if (flags & IBT_MR_CHANGE_PD) {
2228 		hermon_pd_refcnt_dec(mr->mr_pdhdl);
2229 		hermon_pd_refcnt_inc(pd);
2230 	}
2231 
2232 	/*
2233 	 * Update the contents of the Hermon Memory Region handle to reflect
2234 	 * what has been changed.
2235 	 */
2236 	mr->mr_pdhdl	  = pd_to_use;
2237 	mr->mr_accflag	  = acc_flags_to_use;
2238 	mr->mr_is_umem	  = 0;
2239 	mr->mr_is_fmr	  = 0;
2240 	mr->mr_umemcookie = NULL;
2241 	mr->mr_lkey	  = hermon_mr_key_swap(mr->mr_lkey);
2242 	mr->mr_rkey	  = hermon_mr_key_swap(mr->mr_rkey);
2243 
2244 	/* New MR handle is same as the old */
2245 	*mrhdl_new = mr;
2246 	mutex_exit(&mr->mr_lock);
2247 
2248 	return (DDI_SUCCESS);
2249 
2250 mrrereg_fail:
2251 	return (status);
2252 }
2253 
2254 
2255 /*
2256  * hermon_mr_rereg_xlat_helper
2257  *    Context: Can be called from interrupt or base context.
2258  *    Note: This routine expects the "mr_lock" to be held when it
2259  *    is called.  Upon returning failure, this routine passes information
2260  *    about what "dereg_level" should be passed to hermon_mr_deregister().
2261  */
2262 static int
2263 hermon_mr_rereg_xlat_helper(hermon_state_t *state, hermon_mrhdl_t mr,
2264     hermon_bind_info_t *bind, hermon_mr_options_t *op, uint64_t *mtt_addr,
2265     uint_t sleep, uint_t *dereg_level)
2266 {
2267 	hermon_rsrc_t		*mtt, *mtt_refcnt;
2268 	hermon_sw_refcnt_t	*swrc_old, *swrc_new;
2269 	ddi_dma_handle_t	dmahdl;
2270 	uint64_t		nummtt_needed, nummtt_in_currrsrc, max_sz;
2271 	uint_t			mtt_pgsize_bits, bind_type, reuse_dmahdl;
2272 	int			status;
2273 
2274 	ASSERT(MUTEX_HELD(&mr->mr_lock));
2275 
2276 	/*
2277 	 * Check the "options" flag.  Currently this flag tells the driver
2278 	 * whether or not the region should be bound normally (i.e. with
2279 	 * entries written into the PCI IOMMU) or whether it should be
2280 	 * registered to bypass the IOMMU.
2281 	 */
2282 	if (op == NULL) {
2283 		bind_type = HERMON_BINDMEM_NORMAL;
2284 	} else {
2285 		bind_type = op->mro_bind_type;
2286 	}
2287 
2288 	/*
	 * Check for invalid length.  Check if the length is zero or if the
2290 	 * length is larger than the maximum configured value.  Return error
2291 	 * if it is.
2292 	 */
2293 	max_sz = ((uint64_t)1 << state->hs_cfg_profile->cp_log_max_mrw_sz);
2294 	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
2295 		/*
2296 		 * Deregister will be called upon returning failure from this
2297 		 * routine. This will ensure that all current resources get
2298 		 * properly freed up. Unnecessary to attempt to regain
2299 		 * software ownership of the MPT entry as that has already
2300 		 * been done above (in hermon_mr_reregister())
2301 		 */
2302 		*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT;
2303 
2304 		status = IBT_MR_LEN_INVALID;
2305 		goto mrrereghelp_fail;
2306 	}
2307 
2308 	/*
	 * Determine the number of pages necessary for the new region and the
2310 	 * number of pages supported by the current MTT resources
2311 	 */
2312 	nummtt_needed = hermon_mr_nummtt_needed(state, bind, &mtt_pgsize_bits);
2313 	nummtt_in_currrsrc = mr->mr_mttrsrcp->hr_len >> HERMON_MTT_SIZE_SHIFT;
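
	/*
	 * (hr_len above is the byte length of the current MTT resource, so
	 * shifting by HERMON_MTT_SIZE_SHIFT converts bytes into 8-byte MTT
	 * entries; e.g. an hr_len of 0x200 supports 64 entries.)
	 */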
2314 
2315 	/*
2316 	 * Depending on whether we have enough pages or not, the next step is
2317 	 * to fill in a set of MTT entries that reflect the new mapping.  In
2318 	 * the first case below, we already have enough entries.  This means
2319 	 * we need to unbind the memory from the previous mapping, bind the
2320 	 * memory for the new mapping, write the new MTT entries, and update
2321 	 * the mr to reflect the changes.
2322 	 * In the second case below, we do not have enough entries in the
2323 	 * current mapping.  So, in this case, we need not only to unbind the
2324 	 * current mapping, but we need to free up the MTT resources associated
2325 	 * with that mapping.  After we've successfully done that, we continue
2326 	 * by binding the new memory, allocating new MTT entries, writing the
2327 	 * new MTT entries, and updating the mr to reflect the changes.
2328 	 */
2329 
2330 	/*
2331 	 * If this region is being shared (i.e. MTT refcount != 1), then we
2332 	 * can't reuse the current MTT resources regardless of their size.
2333 	 * Instead we'll need to alloc new ones (below) just as if there
2334 	 * hadn't been enough room in the current entries.
2335 	 */
2336 	swrc_old = (hermon_sw_refcnt_t *)mr->mr_mttrefcntp->hr_addr;
2337 	if (HERMON_MTT_IS_NOT_SHARED(swrc_old) &&
2338 	    (nummtt_needed <= nummtt_in_currrsrc)) {
2339 
2340 		/*
2341 		 * Unbind the old mapping for this memory region, but retain
2342 		 * the ddi_dma_handle_t (if possible) for reuse in the bind
2343 		 * operation below.  Note:  If original memory region was
2344 		 * bound for IOMMU bypass and the new region can not use
2345 		 * bypass, then a new DMA handle will be necessary.
2346 		 */
2347 		if (HERMON_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2348 			mr->mr_bindinfo.bi_free_dmahdl = 0;
2349 			hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
2350 			dmahdl = mr->mr_bindinfo.bi_dmahdl;
2351 			reuse_dmahdl = 1;
2352 		} else {
2353 			hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
2354 			dmahdl = NULL;
2355 			reuse_dmahdl = 0;
2356 		}
2357 
2358 		/*
2359 		 * Bind the new memory and determine the mapped addresses.
2360 		 * As described, this routine and hermon_mr_fast_mtt_write()
2361 		 * do the majority of the work for the memory registration
2362 		 * operations.  Note:  When we successfully finish the binding,
2363 		 * we will set the "bi_free_dmahdl" flag to indicate that
2364 		 * even though we may have reused the ddi_dma_handle_t we do
2365 		 * wish it to be freed up at some later time.  Note also that
2366 		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2367 		 */
2368 		bind->bi_bypass	= bind_type;
2369 		status = hermon_mr_mem_bind(state, bind, dmahdl, sleep, 1);
2370 		if (status != DDI_SUCCESS) {
2371 			if (reuse_dmahdl) {
2372 				ddi_dma_free_handle(&dmahdl);
2373 			}
2374 
2375 			/*
2376 			 * Deregister will be called upon returning failure
2377 			 * from this routine. This will ensure that all
2378 			 * current resources get properly freed up.
2379 			 * Unnecessary to attempt to regain software ownership
2380 			 * of the MPT entry as that has already been done
2381 			 * above (in hermon_mr_reregister()).  Also unnecessary
2382 			 * to attempt to unbind the memory.
2383 			 */
2384 			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2385 
2386 			status = IBT_INSUFF_RESOURCE;
2387 			goto mrrereghelp_fail;
2388 		}
2389 		if (reuse_dmahdl) {
2390 			bind->bi_free_dmahdl = 1;
2391 		}
2392 
2393 		/*
2394 		 * Using the new mapping, but reusing the current MTT
2395 		 * resources, write the updated entries to MTT
2396 		 */
2397 		mtt    = mr->mr_mttrsrcp;
2398 		status = hermon_mr_fast_mtt_write(state, mtt, bind,
2399 		    mtt_pgsize_bits);
2400 		if (status != DDI_SUCCESS) {
2401 			/*
2402 			 * Deregister will be called upon returning failure
2403 			 * from this routine. This will ensure that all
2404 			 * current resources get properly freed up.
2405 			 * Unnecessary to attempt to regain software ownership
2406 			 * of the MPT entry as that has already been done
2407 			 * above (in hermon_mr_reregister()).  Also unnecessary
2408 			 * to attempt to unbind the memory.
2409 			 *
2410 			 * But we do need to unbind the newly bound memory
2411 			 * before returning.
2412 			 */
2413 			hermon_mr_mem_unbind(state, bind);
2414 			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2415 
2416 			/*
2417 			 * hermon_mr_fast_mtt_write() returns DDI_FAILURE
2418 			 * only if it detects a HW error during DMA.
2419 			 */
2420 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
2421 			status = ibc_get_ci_failure(0);
2422 			goto mrrereghelp_fail;
2423 		}
2424 
2425 		/* Put the updated information into the Mem Region handle */
2426 		mr->mr_bindinfo	  = *bind;
2427 		mr->mr_logmttpgsz = mtt_pgsize_bits;
2428 
2429 	} else {
2430 		/*
2431 		 * Check if the memory region MTT is shared by any other MRs.
2432 		 * Since the resource may be shared between multiple memory
2433 		 * regions (as a result of a "RegisterSharedMR()" verb) it is
2434 		 * important that we not unbind any resources prematurely.
2435 		 */
2436 		if (!HERMON_MTT_IS_SHARED(swrc_old)) {
2437 			/*
2438 			 * Unbind the old mapping for this memory region, but
2439 			 * retain the ddi_dma_handle_t for reuse in the bind
2440 			 * operation below. Note: This can only be done here
2441 			 * because the region being reregistered is not
2442 			 * currently shared.  Also if original memory region
2443 			 * was bound for IOMMU bypass and the new region can
2444 			 * not use bypass, then a new DMA handle will be
2445 			 * necessary.
2446 			 */
2447 			if (HERMON_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2448 				mr->mr_bindinfo.bi_free_dmahdl = 0;
2449 				hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
2450 				dmahdl = mr->mr_bindinfo.bi_dmahdl;
2451 				reuse_dmahdl = 1;
2452 			} else {
2453 				hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
2454 				dmahdl = NULL;
2455 				reuse_dmahdl = 0;
2456 			}
2457 		} else {
2458 			dmahdl = NULL;
2459 			reuse_dmahdl = 0;
2460 		}
2461 
2462 		/*
2463 		 * Bind the new memory and determine the mapped addresses.
2464 		 * As described, this routine and hermon_mr_fast_mtt_write()
2465 		 * do the majority of the work for the memory registration
2466 		 * operations.  Note:  When we successfully finish the binding,
2467 		 * we will set the "bi_free_dmahdl" flag to indicate that
2468 		 * even though we may have reused the ddi_dma_handle_t we do
2469 		 * wish it to be freed up at some later time.  Note also that
2470 		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2471 		 */
2472 		bind->bi_bypass	= bind_type;
2473 		status = hermon_mr_mem_bind(state, bind, dmahdl, sleep, 1);
2474 		if (status != DDI_SUCCESS) {
2475 			if (reuse_dmahdl) {
2476 				ddi_dma_free_handle(&dmahdl);
2477 			}
2478 
2479 			/*
2480 			 * Deregister will be called upon returning failure
2481 			 * from this routine. This will ensure that all
2482 			 * current resources get properly freed up.
2483 			 * Unnecessary to attempt to regain software ownership
2484 			 * of the MPT entry as that has already been done
2485 			 * above (in hermon_mr_reregister()).  Also unnecessary
2486 			 * to attempt to unbind the memory.
2487 			 */
2488 			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2489 
2490 			status = IBT_INSUFF_RESOURCE;
2491 			goto mrrereghelp_fail;
2492 		}
2493 		if (reuse_dmahdl) {
2494 			bind->bi_free_dmahdl = 1;
2495 		}
2496 
2497 		/*
2498 		 * Allocate the new MTT entries resource
2499 		 */
2500 		status = hermon_rsrc_alloc(state, HERMON_MTT, nummtt_needed,
2501 		    sleep, &mtt);
2502 		if (status != DDI_SUCCESS) {
2503 			/*
2504 			 * Deregister will be called upon returning failure
2505 			 * from this routine. This will ensure that all
2506 			 * current resources get properly freed up.
2507 			 * Unnecessary to attempt to regain software ownership
2508 			 * of the MPT entry as that has already been done
2509 			 * above (in hermon_mr_reregister()).  Also unnecessary
2510 			 * to attempt to unbind the memory.
2511 			 *
2512 			 * But we do need to unbind the newly bound memory
2513 			 * before returning.
2514 			 */
2515 			hermon_mr_mem_unbind(state, bind);
2516 			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2517 
2518 			status = IBT_INSUFF_RESOURCE;
2519 			goto mrrereghelp_fail;
2520 		}
2521 
2522 		/*
2523 		 * Allocate MTT reference count (to track shared memory
2524 		 * regions).  As mentioned elsewhere above, this reference
2525 		 * count resource may never be used on the given memory region,
2526 		 * but if it is ever later registered as a "shared" memory
2527 		 * region then this resource will be necessary.  Note:  This
2528 		 * is only necessary here if the existing memory region is
2529 		 * already being shared (because otherwise we already have
2530 		 * a useable reference count resource).
2531 		 */
2532 		if (HERMON_MTT_IS_SHARED(swrc_old)) {
2533 			status = hermon_rsrc_alloc(state, HERMON_REFCNT, 1,
2534 			    sleep, &mtt_refcnt);
2535 			if (status != DDI_SUCCESS) {
2536 				/*
2537 				 * Deregister will be called upon returning
2538 				 * failure from this routine. This will ensure
2539 				 * that all current resources get properly
2540 				 * freed up.  Unnecessary to attempt to regain
2541 				 * software ownership of the MPT entry as that
2542 				 * has already been done above (in
2543 				 * hermon_mr_reregister()).  Also unnecessary
2544 				 * to attempt to unbind the memory.
2545 				 *
2546 				 * But we need to unbind the newly bound
2547 				 * memory and free up the newly allocated MTT
2548 				 * entries before returning.
2549 				 */
2550 				hermon_mr_mem_unbind(state, bind);
2551 				hermon_rsrc_free(state, &mtt);
2552 				*dereg_level =
2553 				    HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2554 
2555 				status = IBT_INSUFF_RESOURCE;
2556 				goto mrrereghelp_fail;
2557 			}
2558 			swrc_new = (hermon_sw_refcnt_t *)mtt_refcnt->hr_addr;
2559 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_new))
2560 			HERMON_MTT_REFCNT_INIT(swrc_new);
2561 		} else {
2562 			mtt_refcnt = mr->mr_mttrefcntp;
2563 		}
2564 
2565 		/*
2566 		 * Using the new mapping and the new MTT resources, write the
2567 		 * updated entries to MTT
2568 		 */
2569 		status = hermon_mr_fast_mtt_write(state, mtt, bind,
2570 		    mtt_pgsize_bits);
2571 		if (status != DDI_SUCCESS) {
2572 			/*
2573 			 * Deregister will be called upon returning failure
2574 			 * from this routine. This will ensure that all
2575 			 * current resources get properly freed up.
2576 			 * Unnecessary to attempt to regain software ownership
2577 			 * of the MPT entry as that has already been done
2578 			 * above (in hermon_mr_reregister()).  Also unnecessary
2579 			 * to attempt to unbind the memory.
2580 			 *
2581 			 * But we need to unbind the newly bound memory,
2582 			 * free up the newly allocated MTT entries, and
2583 			 * (possibly) free the new MTT reference count
2584 			 * resource before returning.
2585 			 */
2586 			if (HERMON_MTT_IS_SHARED(swrc_old)) {
2587 				hermon_rsrc_free(state, &mtt_refcnt);
2588 			}
2589 			hermon_mr_mem_unbind(state, bind);
2590 			hermon_rsrc_free(state, &mtt);
2591 			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2592 
2593 			status = IBT_INSUFF_RESOURCE;
2594 			goto mrrereghelp_fail;
2595 		}
2596 
2597 		/*
2598 		 * Check if the memory region MTT is shared by any other MRs.
2599 		 * Since the resource may be shared between multiple memory
2600 		 * regions (as a result of a "RegisterSharedMR()" verb) it is
2601 		 * important that we not free up any resources prematurely.
2602 		 */
2603 		if (HERMON_MTT_IS_SHARED(swrc_old)) {
2604 			/* Decrement MTT reference count for "old" region */
2605 			(void) hermon_mtt_refcnt_dec(mr->mr_mttrefcntp);
2606 		} else {
2607 			/* Free up the old MTT entries resource */
2608 			hermon_rsrc_free(state, &mr->mr_mttrsrcp);
2609 		}
2610 
2611 		/* Put the updated information into the mrhdl */
2612 		mr->mr_bindinfo	  = *bind;
2613 		mr->mr_logmttpgsz = mtt_pgsize_bits;
2614 		mr->mr_mttrsrcp   = mtt;
2615 		mr->mr_mttrefcntp = mtt_refcnt;
2616 	}
2617 
2618 	/*
	 * Calculate and return the updated MTT address (an offset into the
	 * ICM-backed MTT table).  This will be used by the caller
	 * (hermon_mr_common_rereg) in the updated MPT entry.
2622 	 */
	*mtt_addr = (uint64_t)mtt->hr_indx << HERMON_MTT_SIZE_SHIFT;
2624 
2625 	return (DDI_SUCCESS);
2626 
2627 mrrereghelp_fail:
2628 	return (status);
2629 }
2630 
2631 
2632 /*
2633  * hermon_mr_nummtt_needed()
2634  *    Context: Can be called from interrupt or base context.
2635  */
2636 /* ARGSUSED */
2637 static uint64_t
2638 hermon_mr_nummtt_needed(hermon_state_t *state, hermon_bind_info_t *bind,
2639     uint_t *mtt_pgsize_bits)
2640 {
2641 	uint64_t	pg_offset_mask;
2642 	uint64_t	pg_offset, tmp_length;
2643 
2644 	/*
2645 	 * For now we specify the page size as 8Kb (the default page size for
2646 	 * the sun4u architecture), or 4Kb for x86.  Figure out optimal page
2647 	 * size by examining the dmacookies
2648 	 */
2649 	*mtt_pgsize_bits = PAGESHIFT;
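
	/*
	 * Worked example of the computation below, assuming 4KB pages:
	 * bi_addr = 0x10234 and bi_len = 0x3000 give pg_offset = 0x234 and
	 * tmp_length = 0x3233, so we return (0x3233 >> 12) + 1 = 4 MTT
	 * entries - one for each page the region touches.
	 */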
2650 
2651 	pg_offset_mask = ((uint64_t)1 << *mtt_pgsize_bits) - 1;
2652 	pg_offset = bind->bi_addr & pg_offset_mask;
2653 	tmp_length = pg_offset + (bind->bi_len - 1);
2654 	return ((tmp_length >> *mtt_pgsize_bits) + 1);
2655 }
2656 
2657 
2658 /*
2659  * hermon_mr_mem_bind()
2660  *    Context: Can be called from interrupt or base context.
2661  */
2662 static int
2663 hermon_mr_mem_bind(hermon_state_t *state, hermon_bind_info_t *bind,
2664     ddi_dma_handle_t dmahdl, uint_t sleep, uint_t is_buffer)
2665 {
2666 	ddi_dma_attr_t	dma_attr;
2667 	int		(*callback)(caddr_t);
2668 	int		status;
2669 
2670 	/* bi_type must be set to a meaningful value to get a bind handle */
2671 	ASSERT(bind->bi_type == HERMON_BINDHDL_VADDR ||
2672 	    bind->bi_type == HERMON_BINDHDL_BUF ||
2673 	    bind->bi_type == HERMON_BINDHDL_UBUF);
2674 
2675 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2676 
2677 	/* Set the callback flag appropriately */
2678 	callback = (sleep == HERMON_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT;
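
	/*
	 * (DDI_DMA_SLEEP allows the allocation/bind routines below to block
	 * waiting for resources; DDI_DMA_DONTWAIT makes them fail
	 * immediately instead, mirroring HERMON_SLEEP/HERMON_NOSLEEP.)
	 */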
2679 
2680 	/*
2681 	 * Initialize many of the default DMA attributes.  Then, if we're
2682 	 * bypassing the IOMMU, set the DDI_DMA_FORCE_PHYSICAL flag.
2683 	 */
2684 	if (dmahdl == NULL) {
2685 		hermon_dma_attr_init(state, &dma_attr);
2686 #ifdef	__sparc
2687 		if (bind->bi_bypass == HERMON_BINDMEM_BYPASS) {
2688 			dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2689 		}
2690 #endif
2691 
		/*
		 * Set relaxed ordering (RO) if needed, i.e. if the
		 * appropriate tunable is set and "is_buffer" is non-zero.
		 */
2693 		if (is_buffer) {
2694 			if (! (bind->bi_flags & IBT_MR_DISABLE_RO)) {
2695 				if ((bind->bi_type != HERMON_BINDHDL_UBUF) &&
2696 				    (hermon_kernel_data_ro ==
2697 				    HERMON_RO_ENABLED)) {
2698 					dma_attr.dma_attr_flags |=
2699 					    DDI_DMA_RELAXED_ORDERING;
2700 				}
2701 				if (((bind->bi_type == HERMON_BINDHDL_UBUF) &&
2702 				    (hermon_user_data_ro ==
2703 				    HERMON_RO_ENABLED))) {
2704 					dma_attr.dma_attr_flags |=
2705 					    DDI_DMA_RELAXED_ORDERING;
2706 				}
2707 			}
2708 		}
2709 
2710 		/* Allocate a DMA handle for the binding */
2711 		status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
2712 		    callback, NULL, &bind->bi_dmahdl);
2713 		if (status != DDI_SUCCESS) {
2714 			return (status);
2715 		}
2716 		bind->bi_free_dmahdl = 1;
2717 
2718 	} else  {
2719 		bind->bi_dmahdl = dmahdl;
2720 		bind->bi_free_dmahdl = 0;
2721 	}
2722 
2723 
2724 	/*
2725 	 * Bind the memory to get the PCI mapped addresses.  The decision
2726 	 * to call ddi_dma_addr_bind_handle() or ddi_dma_buf_bind_handle()
2727 	 * is determined by the "bi_type" flag.  Note: if the bind operation
2728 	 * fails then we have to free up the DMA handle and return error.
2729 	 */
2730 	if (bind->bi_type == HERMON_BINDHDL_VADDR) {
2731 		status = ddi_dma_addr_bind_handle(bind->bi_dmahdl, NULL,
2732 		    (caddr_t)(uintptr_t)bind->bi_addr, bind->bi_len,
2733 		    (DDI_DMA_RDWR | DDI_DMA_CONSISTENT), callback, NULL,
2734 		    &bind->bi_dmacookie, &bind->bi_cookiecnt);
2735 
2736 	} else {  /* HERMON_BINDHDL_BUF or HERMON_BINDHDL_UBUF */
2737 
2738 		status = ddi_dma_buf_bind_handle(bind->bi_dmahdl,
2739 		    bind->bi_buf, (DDI_DMA_RDWR | DDI_DMA_CONSISTENT), callback,
2740 		    NULL, &bind->bi_dmacookie, &bind->bi_cookiecnt);
2741 	}
2742 	if (status != DDI_DMA_MAPPED) {
2743 		if (bind->bi_free_dmahdl != 0) {
2744 			ddi_dma_free_handle(&bind->bi_dmahdl);
2745 		}
2746 		return (status);
2747 	}
2748 
2749 	return (DDI_SUCCESS);
2750 }
2751 
2752 
2753 /*
2754  * hermon_mr_mem_unbind()
2755  *    Context: Can be called from interrupt or base context.
2756  */
2757 static void
2758 hermon_mr_mem_unbind(hermon_state_t *state, hermon_bind_info_t *bind)
2759 {
2760 	int	status;
2761 
2762 	/*
	 * In the HERMON_BINDHDL_UBUF case, the memory that bi_buf points to
	 * was allocated internally by ddi_umem_iosetup(), so it must be
	 * freed here.  Reset bi_type to HERMON_BINDHDL_NONE so that the
	 * buffer is not freed again later.
2767 	 */
2768 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2769 	if (bind->bi_type == HERMON_BINDHDL_UBUF) {
2770 		freerbuf(bind->bi_buf);
2771 		bind->bi_type = HERMON_BINDHDL_NONE;
2772 	}
2773 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
2774 
2775 	/*
2776 	 * Unbind the DMA memory for the region
2777 	 *
2778 	 * Note: The only way ddi_dma_unbind_handle() currently
2779 	 * can return an error is if the handle passed in is invalid.
2780 	 * Since this should never happen, we choose to return void
2781 	 * from this function!  If this does return an error, however,
2782 	 * then we print a warning message to the console.
2783 	 */
2784 	status = ddi_dma_unbind_handle(bind->bi_dmahdl);
2785 	if (status != DDI_SUCCESS) {
2786 		HERMON_WARNING(state, "failed to unbind DMA mapping");
2787 		return;
2788 	}
2789 
2790 	/* Free up the DMA handle */
2791 	if (bind->bi_free_dmahdl != 0) {
2792 		ddi_dma_free_handle(&bind->bi_dmahdl);
2793 	}
2794 }
2795 
2796 
2797 /*
2798  * hermon_mr_fast_mtt_write()
2799  *    Context: Can be called from interrupt or base context.
2800  */
2801 static int
2802 hermon_mr_fast_mtt_write(hermon_state_t *state, hermon_rsrc_t *mtt,
2803     hermon_bind_info_t *bind, uint32_t mtt_pgsize_bits)
2804 {
2805 	hermon_icm_table_t	*icm_table;
2806 	hermon_dma_info_t	*dma_info;
2807 	uint32_t		index1, index2, rindx;
2808 	ddi_dma_cookie_t	dmacookie;
2809 	uint_t			cookie_cnt;
2810 	uint64_t		*mtt_table;
2811 	uint64_t		mtt_entry;
2812 	uint64_t		addr, endaddr;
2813 	uint64_t		pagesize;
2814 	offset_t		i, start;
2815 	uint_t			per_span;
2816 	int			sync_needed;
2817 
2818 	/*
2819 	 * XXX According to the PRM, we are to use the WRITE_MTT
2820 	 * command to write out MTTs. Tavor does not do this,
2821 	 * instead taking advantage of direct access to the MTTs,
2822 	 * and knowledge that Mellanox FMR relies on our ability
2823 	 * to write directly to the MTTs without any further
2824 	 * notification to the firmware. Likewise, we will choose
2825 	 * to not use the WRITE_MTT command, but to simply write
2826 	 * out the MTTs.
2827 	 */
2828 
2829 	/* Calculate page size from the suggested value passed in */
2830 	pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2831 
2832 	/* Walk the "cookie list" and fill in the MTT table entries */
2833 	dmacookie  = bind->bi_dmacookie;
2834 	cookie_cnt = bind->bi_cookiecnt;
2835 
2836 	icm_table = &state->hs_icm[HERMON_MTT];
2837 	rindx = mtt->hr_indx;
2838 	hermon_index(index1, index2, rindx, icm_table, i);
2839 	start = i;
2840 
2841 	per_span   = icm_table->span;
2842 	dma_info   = icm_table->icm_dma[index1] + index2;
2843 	mtt_table  = (uint64_t *)(uintptr_t)dma_info->vaddr;
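
	/*
	 * A sketch of the walk below: the MTT table lives in ICM chunks of
	 * "span" entries each.  hermon_index() turns the raw MTT index
	 * (rindx) into chunk coordinates (index1, index2) and an offset (i)
	 * within that chunk; whenever i reaches per_span we sync what was
	 * just written and hop to the next chunk's DMA mapping.
	 */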
2844 
2845 	sync_needed = 0;
2846 	while (cookie_cnt-- > 0) {
2847 		addr    = dmacookie.dmac_laddress;
2848 		endaddr = addr + (dmacookie.dmac_size - 1);
2849 		addr    = addr & ~((uint64_t)pagesize - 1);
2850 
2851 		while (addr <= endaddr) {
2852 
2853 			/*
2854 			 * Fill in the mapped addresses (calculated above) and
2855 			 * set HERMON_MTT_ENTRY_PRESENT flag for each MTT entry.
2856 			 */
2857 			mtt_entry = addr | HERMON_MTT_ENTRY_PRESENT;
2858 			mtt_table[i] = htonll(mtt_entry);
2859 			i++;
2860 			rindx++;
2861 
2862 			if (i == per_span) {
2863 
2864 				(void) ddi_dma_sync(dma_info->dma_hdl,
2865 				    start * sizeof (hermon_hw_mtt_t),
2866 				    (i - start) * sizeof (hermon_hw_mtt_t),
2867 				    DDI_DMA_SYNC_FORDEV);
2868 
2869 				if ((addr + pagesize > endaddr) &&
2870 				    (cookie_cnt == 0))
2871 					return (DDI_SUCCESS);
2872 
2873 				hermon_index(index1, index2, rindx, icm_table,
2874 				    i);
				start = i;	/* entry index, not bytes */
2876 				dma_info = icm_table->icm_dma[index1] + index2;
2877 				mtt_table =
2878 				    (uint64_t *)(uintptr_t)dma_info->vaddr;
2879 
2880 				sync_needed = 0;
2881 			} else {
2882 				sync_needed = 1;
2883 			}
2884 
2885 			addr += pagesize;
2886 			if (addr == 0) {
2887 				static int do_once = 1;
2888 				_NOTE(SCHEME_PROTECTS_DATA("safe sharing",
2889 				    do_once))
2890 				if (do_once) {
2891 					do_once = 0;
2892 					cmn_err(CE_NOTE, "probable error in "
2893 					    "dma_cookie address from caller\n");
2894 				}
2895 				break;
2896 			}
2897 		}
2898 
2899 		/*
2900 		 * When we've reached the end of the current DMA cookie,
2901 		 * jump to the next cookie (if there are more)
2902 		 */
2903 		if (cookie_cnt != 0) {
2904 			ddi_dma_nextcookie(bind->bi_dmahdl, &dmacookie);
2905 		}
2906 	}
2907 
	/* done with all the cookies, now sync the memory for the device */
2909 	if (sync_needed)
2910 		(void) ddi_dma_sync(dma_info->dma_hdl,
2911 		    start * sizeof (hermon_hw_mtt_t),
2912 		    (i - start) * sizeof (hermon_hw_mtt_t),
2913 		    DDI_DMA_SYNC_FORDEV);
2914 
2915 	return (DDI_SUCCESS);
2916 }
2917 
2918 /*
2919  * hermon_mr_fast_mtt_write_fmr()
2920  *    Context: Can be called from interrupt or base context.
2921  */
2922 static int
2923 hermon_mr_fast_mtt_write_fmr(hermon_rsrc_t *mtt, ibt_pmr_attr_t *mem_pattr,
2924     uint32_t mtt_pgsize_bits)
2925 {
2926 	uint64_t		*mtt_table;
2927 	ibt_phys_addr_t		*buf;
2928 	uint64_t		mtt_entry;
2929 	uint64_t		addr, first_addr, endaddr;
2930 	uint64_t		pagesize;
2931 	int			i;
2932 
2933 	/* Calculate page size from the suggested value passed in */
2934 	pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2935 
2936 	/*
2937 	 * Walk the "addr list" and fill in the MTT table entries
2938 	 */
2939 	mtt_table  = (uint64_t *)mtt->hr_addr;
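
	/*
	 * Note: the loop below indexes the MTT table with the buffer-list
	 * index "i", which is correct only while each buffer in the address
	 * list covers at most one MTT page (the usual FMR case); a buffer
	 * spanning several pages would rewrite the same entry.
	 */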
2940 	for (i = 0; i < mem_pattr->pmr_num_buf; i++) {
2941 		buf = &mem_pattr->pmr_addr_list[i];
2942 
2943 		/*
2944 		 * For first cookie, use the offset field to determine where
2945 		 * the buffer starts.  The end addr is then calculated with the
2946 		 * offset in mind.
2947 		 */
2948 		if (i == 0) {
2949 			first_addr = addr = buf->p_laddr +
2950 			    mem_pattr->pmr_offset;
2951 			endaddr = addr + (mem_pattr->pmr_buf_sz - 1) -
2952 			    mem_pattr->pmr_offset;
2953 		/*
2954 		 * For last cookie, determine end addr based on starting
2955 		 * address and size of the total buffer
2956 		 */
2957 		} else if (i == mem_pattr->pmr_num_buf - 1) {
2958 			addr = buf->p_laddr;
			endaddr = addr + ((first_addr + mem_pattr->pmr_len) &
			    (mem_pattr->pmr_buf_sz - 1));
2961 		/*
2962 		 * For the middle cookies case, start and end addr are
2963 		 * straightforward.  Just use the laddr, and the size, as all
2964 		 * middle cookies are a set size.
2965 		 */
2966 		} else {
2967 			addr = buf->p_laddr;
2968 			endaddr = addr + (mem_pattr->pmr_buf_sz - 1);
2969 		}
2970 
2971 		addr	= addr & ~((uint64_t)pagesize - 1);
2972 		while (addr <= endaddr) {
2973 			/*
2974 			 * Fill in the mapped addresses (calculated above) and
2975 			 * set HERMON_MTT_ENTRY_PRESENT flag for each MTT entry.
2976 			 */
2977 			mtt_entry = addr | HERMON_MTT_ENTRY_PRESENT;
2978 			mtt_table[i] = htonll(mtt_entry);
2979 			addr += pagesize;
2980 		}
2981 	}
2982 
2983 	return (DDI_SUCCESS);
2984 }
2985 
2986 
2987 /*
2988  * hermon_mtt_refcnt_inc()
2989  *    Context: Can be called from interrupt or base context.
2990  */
2991 static uint_t
2992 hermon_mtt_refcnt_inc(hermon_rsrc_t *rsrc)
2993 {
2994 	hermon_sw_refcnt_t *rc;
2995 
2996 	rc = (hermon_sw_refcnt_t *)rsrc->hr_addr;
2997 	return (atomic_inc_uint_nv(&rc->swrc_refcnt));
2998 }
2999 
3000 
3001 /*
3002  * hermon_mtt_refcnt_dec()
3003  *    Context: Can be called from interrupt or base context.
3004  */
3005 static uint_t
3006 hermon_mtt_refcnt_dec(hermon_rsrc_t *rsrc)
3007 {
3008 	hermon_sw_refcnt_t *rc;
3009 
3010 	rc = (hermon_sw_refcnt_t *)rsrc->hr_addr;
3011 	return (atomic_dec_uint_nv(&rc->swrc_refcnt));
3012 }
3013