1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * tavor_misc.c
29  *    Tavor Miscellaneous routines - Address Handle, Multicast, Protection
30  *    Domain, and port-related operations
31  *
32  *    Implements all the routines necessary for allocating, freeing, querying
33  *    and modifying Address Handles and Protection Domains.  Also implements
34  *    all the routines necessary for adding and removing Queue Pairs to/from
35  *    Multicast Groups.  Lastly, it implements the routines necessary for
36  *    port-related query and modify operations.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/modctl.h>
44 #include <sys/bitmap.h>
45 #include <sys/sysmacros.h>
46 
47 #include <sys/ib/adapters/tavor/tavor.h>
48 
/*
 * Prototypes for the file-local (static) helper routines, declared here so
 * that they can be called ahead of their definitions.
 */
49 static void tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav,
50     uint_t flag);
51 static int tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg,
52     tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp, uint_t *qp_found);
53 static int tavor_mcg_qplist_remove(tavor_mcghdl_t mcg,
54     tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp);
55 static void tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp);
56 static void tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp);
57 static uint_t tavor_mcg_walk_mgid_hash(tavor_state_t *state,
58     uint64_t start_indx, ib_gid_t mgid, uint_t *prev_indx);
59 static void tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg,
60     tavor_hw_mcg_t *mcg_hdr, ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc);
61 static int tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx,
62     uint_t prev_indx, tavor_hw_mcg_t *mcg_entry);
63 static int tavor_mcg_entry_invalidate(tavor_state_t *state,
64     tavor_hw_mcg_t *mcg_entry, uint_t indx);
65 static int tavor_mgid_is_valid(ib_gid_t gid);
66 static int tavor_mlid_is_valid(ib_lid_t lid);
67 
68 
69 /*
70  * tavor_ah_alloc()
71  *    Context: Can be called only from user or kernel context.
72  */
73 int
tavor_ah_alloc(tavor_state_t * state,tavor_pdhdl_t pd,ibt_adds_vect_t * attr_p,tavor_ahhdl_t * ahhdl,uint_t sleepflag)74 tavor_ah_alloc(tavor_state_t *state, tavor_pdhdl_t pd,
75     ibt_adds_vect_t *attr_p, tavor_ahhdl_t *ahhdl, uint_t sleepflag)
76 {
77 	tavor_rsrc_t		*udav, *rsrc;
78 	tavor_hw_udav_t		udav_entry;
79 	tavor_ahhdl_t		ah;
80 	ibt_mr_attr_t		mr_attr;
81 	tavor_mr_options_t	op;
82 	tavor_mrhdl_t		mr;
83 	uint64_t		data;
84 	uint32_t		size;
85 	int			status, i, flag;
86 
87 	/*
88 	 * Someday maybe the "ibt_adds_vect_t *attr_p" will be NULL to
89 	 * indicate that we wish to allocate an "invalid" (i.e. empty)
90 	 * address handle XXX
91 	 */
92 
93 	/* Validate that specified port number is legal */
94 	if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) {
95 		goto ahalloc_fail;
96 	}
97 
98 	/*
99 	 * Allocate a UDAV entry.  This will be filled in with all the
100 	 * necessary parameters to define the Address Handle.  Unlike the
101 	 * other hardware resources no ownership transfer takes place as
102 	 * these UDAV entries are always owned by hardware.
103 	 */
104 	status = tavor_rsrc_alloc(state, TAVOR_UDAV, 1, sleepflag, &udav);
105 	if (status != DDI_SUCCESS) {
106 		goto ahalloc_fail;
107 	}
108 
109 	/*
110 	 * Allocate the software structure for tracking the address handle
111 	 * (i.e. the Tavor Address Handle struct).  If we fail here, we must
112 	 * undo the previous resource allocation.
113 	 */
114 	status = tavor_rsrc_alloc(state, TAVOR_AHHDL, 1, sleepflag, &rsrc);
115 	if (status != DDI_SUCCESS) {
116 		goto ahalloc_fail1;
117 	}
118 	ah = (tavor_ahhdl_t)rsrc->tr_addr;
119 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
120 
121 	/* Increment the reference count on the protection domain (PD) */
122 	tavor_pd_refcnt_inc(pd);
123 
124 	/*
125 	 * Fill in the UDAV entry.  Note: We are only filling in a temporary
126 	 * copy here, which we will later copy into the actual entry in
127 	 * Tavor DDR memory.  This starts be zeroing out the temporary copy
128 	 * and then calling tavor_set_addr_path() to fill in the common
129 	 * portions that can be pulled from the "ibt_adds_vect_t" passed in
130 	 */
131 	bzero(&udav_entry, sizeof (tavor_hw_udav_t));
132 	status = tavor_set_addr_path(state, attr_p,
133 	    (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL);
134 	if (status != DDI_SUCCESS) {
135 		tavor_pd_refcnt_dec(pd);
136 		tavor_rsrc_free(state, &rsrc);
137 		tavor_rsrc_free(state, &udav);
138 		goto ahalloc_fail;
139 	}
140 	udav_entry.pd	  = pd->pd_pdnum;
141 	udav_entry.msg_sz = state->ts_cfg_profile->cp_max_mtu - 1;
142 
143 	/*
144 	 * Register the memory for the UDAV.  The memory for the UDAV must
145 	 * be registered in the Tavor TPT tables.  This gives us the LKey
146 	 * that we will need when we later post a UD work request that
147 	 * uses this address handle.
148 	 * We might be able to pre-register all the memory for the UDAV XXX
149 	 */
150 	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
151 	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)udav->tr_addr;
152 	mr_attr.mr_len	 = udav->tr_len;
153 	mr_attr.mr_as	 = NULL;
154 	mr_attr.mr_flags = flag;
155 	op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
156 	op.mro_bind_dmahdl = NULL;
157 	op.mro_bind_override_addr = 0;
158 	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
159 	if (status != DDI_SUCCESS) {
160 		goto ahalloc_fail2;
161 	}
162 
163 	/*
164 	 * Fill in the UDAV entry.  Here we copy all the information from
165 	 * the temporary UDAV into the DDR memory for the real UDAV entry.
166 	 * Note that we copy everything but the first 64-bit word.  This
167 	 * is where the PD number for the address handle resides.
168 	 * By filling everything except the PD and then writing the PD in
169 	 * a separate step below, we can ensure that the UDAV is not
170 	 * accessed while there are partially written values in it (something
171 	 * which really should not happen anyway).  This is guaranteed
172 	 * because we take measures to ensure that the PD number is zero for
173 	 * all unused UDAV (and because PD#0 is reserved for Tavor).
174 	 */
175 	size = sizeof (tavor_hw_udav_t) >> 3;
176 	for (i = 1; i < size; i++) {
177 		data = ((uint64_t *)&udav_entry)[i];
178 		ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i),
179 		    data);
180 	}
181 	data = ((uint64_t *)&udav_entry)[0];
182 	ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, data);
183 
184 	/*
185 	 * Fill in the rest of the Tavor Address Handle struct.  Having
186 	 * successfully copied the UDAV into the hardware, we update the
187 	 * following fields for use in further operations on the AH.
188 	 *
189 	 * NOTE: We are saving away a copy of the "av_dgid.gid_guid" field
190 	 * here because we may need to return it later to the IBTF (as a
191 	 * result of a subsequent query operation).  Unlike the other UDAV
192 	 * parameters, the value of "av_dgid.gid_guid" is not always preserved
193 	 * by being written to hardware.  The reason for this is described in
194 	 * tavor_set_addr_path().
195 	 */
196 	ah->ah_udavrsrcp = udav;
197 	ah->ah_rsrcp	 = rsrc;
198 	ah->ah_pdhdl	 = pd;
199 	ah->ah_mrhdl	 = mr;
200 	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
201 	ah->ah_save_srate = attr_p->av_srate;
202 	*ahhdl = ah;
203 
204 	/* Determine if later ddi_dma_sync will be necessary */
205 	ah->ah_sync = TAVOR_UDAV_IS_SYNC_REQ(state);
206 
207 	/* Sync the UDAV for use by the hardware */
208 	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
209 
210 	return (DDI_SUCCESS);
211 
212 ahalloc_fail2:
213 	tavor_pd_refcnt_dec(pd);
214 	tavor_rsrc_free(state, &rsrc);
215 ahalloc_fail1:
216 	tavor_rsrc_free(state, &udav);
217 ahalloc_fail:
218 	return (status);
219 }
220 
221 
222 /*
223  * tavor_ah_free()
224  *    Context: Can be called only from user or kernel context.
225  */
226 /* ARGSUSED */
227 int
tavor_ah_free(tavor_state_t * state,tavor_ahhdl_t * ahhdl,uint_t sleepflag)228 tavor_ah_free(tavor_state_t *state, tavor_ahhdl_t *ahhdl, uint_t sleepflag)
229 {
230 	tavor_rsrc_t		*udav, *rsrc;
231 	tavor_pdhdl_t		pd;
232 	tavor_mrhdl_t		mr;
233 	tavor_ahhdl_t		ah;
234 	int			status;
235 
236 	/*
237 	 * Pull all the necessary information from the Tavor Address Handle
238 	 * struct.  This is necessary here because the resource for the
239 	 * AH is going to be freed up as part of this operation.
240 	 */
241 	ah    = *ahhdl;
242 	mutex_enter(&ah->ah_lock);
243 	udav  = ah->ah_udavrsrcp;
244 	rsrc  = ah->ah_rsrcp;
245 	pd    = ah->ah_pdhdl;
246 	mr    = ah->ah_mrhdl;
247 	mutex_exit(&ah->ah_lock);
248 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
249 
250 	/*
251 	 * Deregister the memory for the UDAV.  If this fails for any reason,
252 	 * then it is an indication that something (either in HW or SW) has
253 	 * gone seriously wrong.  So we print a warning message and return
254 	 * failure.
255 	 */
256 	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
257 	    sleepflag);
258 	if (status != DDI_SUCCESS) {
259 		return (ibc_get_ci_failure(0));
260 	}
261 
262 	/*
263 	 * Write zero to the first 64-bit word in the UDAV entry.  As
264 	 * described above (in tavor_ah_alloc), the PD number is stored in
265 	 * the first 64-bits of each UDAV and setting this to zero is
266 	 * guaranteed to invalidate the entry.
267 	 */
268 	ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, 0);
269 
270 	/* Sync the UDAV for use by the hardware */
271 	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
272 
273 	/* Decrement the reference count on the protection domain (PD) */
274 	tavor_pd_refcnt_dec(pd);
275 
276 	/* Free the Tavor Address Handle structure */
277 	tavor_rsrc_free(state, &rsrc);
278 
279 	/* Free up the UDAV entry resource */
280 	tavor_rsrc_free(state, &udav);
281 
282 	/* Set the ahhdl pointer to NULL and return success */
283 	*ahhdl = NULL;
284 
285 	return (DDI_SUCCESS);
286 }
287 
288 
289 /*
290  * tavor_ah_query()
291  *    Context: Can be called from interrupt or base context.
292  */
293 /* ARGSUSED */
294 int
tavor_ah_query(tavor_state_t * state,tavor_ahhdl_t ah,tavor_pdhdl_t * pd,ibt_adds_vect_t * attr_p)295 tavor_ah_query(tavor_state_t *state, tavor_ahhdl_t ah, tavor_pdhdl_t *pd,
296     ibt_adds_vect_t *attr_p)
297 {
298 	tavor_hw_udav_t		udav_entry;
299 	tavor_rsrc_t		*udav;
300 	uint64_t		data;
301 	uint32_t		size;
302 	int			i;
303 
304 	mutex_enter(&ah->ah_lock);
305 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p))
306 
307 	/*
308 	 * Pull all the necessary information from the Tavor Address Handle
309 	 * structure
310 	 */
311 	udav	= ah->ah_udavrsrcp;
312 	*pd	= ah->ah_pdhdl;
313 
314 	/*
315 	 * Copy the UDAV entry into the temporary copy.  Here we copy all
316 	 * the information from the UDAV entry in DDR memory into the
317 	 * temporary UDAV.  Note:  We don't need to sync the UDAV for
318 	 * reading by software because Tavor HW never modifies the entry.
319 	 */
320 	size = sizeof (tavor_hw_udav_t) >> 3;
321 	for (i = 0; i < size; i++) {
322 		data = ddi_get64(udav->tr_acchdl,
323 		    ((uint64_t *)udav->tr_addr + i));
324 		((uint64_t *)&udav_entry)[i] = data;
325 	}
326 
327 	/*
328 	 * Fill in "ibt_adds_vect_t".  We call tavor_get_addr_path() to fill
329 	 * the common portions that can be pulled from the UDAV we pass in.
330 	 *
331 	 * NOTE: We will also fill the "av_dgid.gid_guid" field from the
332 	 * "ah_save_guid" field we have previously saved away.  The reason
333 	 * for this is described in tavor_ah_alloc() and tavor_ah_modify().
334 	 */
335 	tavor_get_addr_path(state, (tavor_hw_addr_path_t *)&udav_entry,
336 	    attr_p, TAVOR_ADDRPATH_UDAV, NULL);
337 
338 	attr_p->av_dgid.gid_guid = ah->ah_save_guid;
339 	attr_p->av_srate = ah->ah_save_srate;
340 
341 	mutex_exit(&ah->ah_lock);
342 	return (DDI_SUCCESS);
343 }
344 
345 
346 /*
347  * tavor_ah_modify()
348  *    Context: Can be called from interrupt or base context.
349  */
350 /* ARGSUSED */
351 int
tavor_ah_modify(tavor_state_t * state,tavor_ahhdl_t ah,ibt_adds_vect_t * attr_p)352 tavor_ah_modify(tavor_state_t *state, tavor_ahhdl_t ah,
353     ibt_adds_vect_t *attr_p)
354 {
355 	tavor_hw_udav_t		udav_entry;
356 	tavor_rsrc_t		*udav;
357 	uint64_t		data_new, data_old;
358 	uint32_t		udav_pd, size, portnum_new;
359 	int			i, status;
360 
361 	/* Validate that specified port number is legal */
362 	if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) {
363 		return (IBT_HCA_PORT_INVALID);
364 	}
365 
366 	mutex_enter(&ah->ah_lock);
367 
368 	/*
369 	 * Pull all the necessary information from the Tavor Address Handle
370 	 * structure
371 	 */
372 	udav = ah->ah_udavrsrcp;
373 
374 	/*
375 	 * Fill in the UDAV entry.  Note: we are only filling in a temporary
376 	 * copy here, which we will later copy into the actual entry in
377 	 * Tavor DDR memory.  This starts be zeroing out the temporary copy
378 	 * and then calling tavor_set_addr_path() to fill in the common
379 	 * portions that can be pulled from the "ibt_adds_vect_t" passed in
380 	 *
381 	 * NOTE: We also need to save away a copy of the "av_dgid.gid_guid"
382 	 * field here (just as we did during tavor_ah_alloc()) because we
383 	 * may need to return it later to the IBTF (as a result of a
384 	 * subsequent query operation).  As explained in tavor_ah_alloc(),
385 	 * unlike the other UDAV parameters, the value of "av_dgid.gid_guid"
386 	 * is not always preserved by being written to hardware.  The reason
387 	 * for this is described in tavor_set_addr_path().
388 	 */
389 	bzero(&udav_entry, sizeof (tavor_hw_udav_t));
390 	status = tavor_set_addr_path(state, attr_p,
391 	    (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL);
392 	if (status != DDI_SUCCESS) {
393 		mutex_exit(&ah->ah_lock);
394 		return (status);
395 	}
396 	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
397 	ah->ah_save_srate = attr_p->av_srate;
398 
399 	/*
400 	 * Save away the current PD number for this UDAV.  Then temporarily
401 	 * invalidate the entry (by setting the PD to zero).  Note:  Since
402 	 * the first 32 bits of the UDAV actually contain the current port
403 	 * number _and_ current PD number, we need to mask off some bits.
404 	 */
405 	udav_pd = ddi_get32(udav->tr_acchdl, (uint32_t *)udav->tr_addr);
406 	udav_pd = udav_pd & 0xFFFFFF;
407 	ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, 0);
408 
409 	/* Sync the UDAV for use by the hardware */
410 	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
411 
412 	/*
413 	 * Copy UDAV structure to the entry
414 	 *    Note:  We copy in 64-bit chunks.  For the first two of these
415 	 *    chunks it is necessary to read the current contents of the
416 	 *    UDAV, mask off the modifiable portions (maintaining any
417 	 *    of the "reserved" portions), and then mask on the new data.
418 	 */
419 	size = sizeof (tavor_hw_udav_t) >> 3;
420 	for (i = 0; i < size; i++) {
421 		data_new = ((uint64_t *)&udav_entry)[i];
422 		data_old = ddi_get64(udav->tr_acchdl,
423 		    ((uint64_t *)udav->tr_addr + i));
424 
425 		/*
426 		 * Apply mask to change only the relevant values.  Note: We
427 		 * extract the new portnum from the address handle here
428 		 * because the "PD" and "portnum" fields are in the same
429 		 * 32-bit word in the UDAV.  We will use the (new) port
430 		 * number extracted here when we write the valid PD number
431 		 * in the last step below.
432 		 */
433 		if (i == 0) {
434 			data_old = data_old & TAVOR_UDAV_MODIFY_MASK0;
435 			portnum_new = data_new >> 56;
436 		} else if (i == 1) {
437 			data_old = data_old & TAVOR_UDAV_MODIFY_MASK1;
438 		} else {
439 			data_old = 0;
440 		}
441 
442 		/* Write the updated values to the UDAV (in DDR) */
443 		data_new = data_old | data_new;
444 		ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i),
445 		    data_new);
446 	}
447 
448 	/*
449 	 * Sync the body of the UDAV for use by the hardware.  After we
450 	 * have updated the PD number (to make the UDAV valid), we sync
451 	 * again to push the entire entry out for hardware access.
452 	 */
453 	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
454 
455 	/*
456 	 * Put the valid PD number back into UDAV entry.  Note: Because port
457 	 * number and PD number are in the same word, we must mask the
458 	 * new port number with the old PD number before writing it back
459 	 * to the UDAV entry
460 	 */
461 	udav_pd = ((portnum_new << 24) | udav_pd);
462 	ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, udav_pd);
463 
464 	/* Sync the rest of the UDAV for use by the hardware */
465 	tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV);
466 
467 	mutex_exit(&ah->ah_lock);
468 	return (DDI_SUCCESS);
469 }
470 
471 
472 /*
473  * tavor_udav_sync()
474  *    Context: Can be called from interrupt or base context.
475  */
476 /* ARGSUSED */
477 static void
tavor_udav_sync(tavor_ahhdl_t ah,tavor_hw_udav_t * udav,uint_t flag)478 tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav, uint_t flag)
479 {
480 	ddi_dma_handle_t	dmahdl;
481 	off_t			offset;
482 	int			status;
483 
484 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
485 
486 	/* Determine if AH needs to be synced or not */
487 	if (ah->ah_sync == 0) {
488 		return;
489 	}
490 
491 	/* Get the DMA handle from AH handle */
492 	dmahdl = ah->ah_mrhdl->mr_bindinfo.bi_dmahdl;
493 
494 	/* Calculate offset into address handle */
495 	offset = (off_t)0;
496 	status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_udav_t), flag);
497 	if (status != DDI_SUCCESS) {
498 		return;
499 	}
500 }
501 
502 
503 /*
504  * tavor_mcg_attach()
505  *    Context: Can be called only from user or kernel context.
506  */
507 int
tavor_mcg_attach(tavor_state_t * state,tavor_qphdl_t qp,ib_gid_t gid,ib_lid_t lid)508 tavor_mcg_attach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid,
509     ib_lid_t lid)
510 {
511 	tavor_rsrc_t		*rsrc;
512 	tavor_hw_mcg_t		*mcg_entry;
513 	tavor_hw_mcg_qp_list_t	*mcg_entry_qplist;
514 	tavor_mcghdl_t		mcg, newmcg;
515 	uint64_t		mgid_hash;
516 	uint32_t		end_indx;
517 	int			status;
518 	uint_t			qp_found;
519 
520 	/*
521 	 * It is only allowed to attach MCG to UD queue pairs.  Verify
522 	 * that the intended QP is of the appropriate transport type
523 	 */
524 	if (qp->qp_serv_type != TAVOR_QP_UD) {
525 		goto mcgattach_fail;
526 	}
527 
528 	/*
529 	 * Check for invalid Multicast DLID.  Specifically, all Multicast
530 	 * LIDs should be within a well defined range.  If the specified LID
531 	 * is outside of that range, then return an error.
532 	 */
533 	if (tavor_mlid_is_valid(lid) == 0) {
534 		goto mcgattach_fail;
535 	}
536 	/*
537 	 * Check for invalid Multicast GID.  All Multicast GIDs should have
538 	 * a well-defined pattern of bits and flags that are allowable.  If
539 	 * the specified GID does not meet the criteria, then return an error.
540 	 */
541 	if (tavor_mgid_is_valid(gid) == 0) {
542 		goto mcgattach_fail;
543 	}
544 
545 	/*
546 	 * Compute the MGID hash value.  Since the MCG table is arranged as
547 	 * a number of separate hash chains, this operation converts the
548 	 * specified MGID into the starting index of an entry in the hash
549 	 * table (i.e. the index for the start of the appropriate hash chain).
550 	 * Subsequent operations below will walk the chain searching for the
551 	 * right place to add this new QP.
552 	 */
553 	status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
554 	    &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT());
555 	if (status != TAVOR_CMD_SUCCESS) {
556 		cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n",
557 		    status);
558 		return (ibc_get_ci_failure(0));
559 	}
560 
561 	/*
562 	 * Grab the multicast group mutex.  Then grab the pre-allocated
563 	 * temporary buffer used for holding and/or modifying MCG entries.
564 	 * Zero out the temporary MCG entry before we begin.
565 	 */
566 	mutex_enter(&state->ts_mcglock);
567 	mcg_entry = state->ts_mcgtmp;
568 	mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry);
569 	bzero(mcg_entry, TAVOR_MCGMEM_SZ(state));
570 
571 	/*
572 	 * Walk through the array of MCG entries starting at "mgid_hash".
573 	 * Try to find the appropriate place for this new QP to be added.
574 	 * This could happen when the first entry of the chain has MGID == 0
575 	 * (which means that the hash chain is empty), or because we find
576 	 * an entry with the same MGID (in which case we'll add the QP to
577 	 * that MCG), or because we come to the end of the chain (in which
578 	 * case this is the first QP being added to the multicast group that
579 	 * corresponds to the MGID.  The tavor_mcg_walk_mgid_hash() routine
580 	 * walks the list and returns an index into the MCG table.  The entry
581 	 * at this index is then checked to determine which case we have
582 	 * fallen into (see below).  Note:  We are using the "shadow" MCG
583 	 * list (of tavor_mcg_t structs) for this lookup because the real
584 	 * MCG entries are in hardware (and the lookup process would be much
585 	 * more time consuming).
586 	 */
587 	end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, NULL);
588 	mcg	 = &state->ts_mcghdl[end_indx];
589 
590 	/*
591 	 * If MGID == 0, then the hash chain is empty.  Just fill in the
592 	 * current entry.  Note:  No need to allocate an MCG table entry
593 	 * as all the hash chain "heads" are already preallocated.
594 	 */
595 	if ((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) {
596 
597 		/* Fill in the current entry in the "shadow" MCG list */
598 		tavor_mcg_setup_new_hdr(mcg, mcg_entry, gid, NULL);
599 
600 		/*
601 		 * Try to add the new QP number to the list.  This (and the
602 		 * above) routine fills in a temporary MCG.  The "mcg_entry"
603 		 * and "mcg_entry_qplist" pointers simply point to different
604 		 * offsets within the same temporary copy of the MCG (for
605 		 * convenience).  Note:  If this fails, we need to invalidate
606 		 * the entries we've already put into the "shadow" list entry
607 		 * above.
608 		 */
609 		status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
610 		    &qp_found);
611 		if (status != DDI_SUCCESS) {
612 			bzero(mcg, sizeof (struct tavor_sw_mcg_list_s));
613 			mutex_exit(&state->ts_mcglock);
614 			goto mcgattach_fail;
615 		}
616 
617 		/*
618 		 * Once the temporary MCG has been filled in, write the entry
619 		 * into the appropriate location in the Tavor MCG entry table.
620 		 * If it's successful, then drop the lock and return success.
621 		 * Note: In general, this operation shouldn't fail.  If it
622 		 * does, then it is an indication that something (probably in
623 		 * HW, but maybe in SW) has gone seriously wrong.  We still
624 		 * want to zero out the entries that we've filled in above
625 		 * (in the tavor_mcg_setup_new_hdr() routine).
626 		 */
627 		status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
628 		    TAVOR_CMD_NOSLEEP_SPIN);
629 		if (status != TAVOR_CMD_SUCCESS) {
630 			bzero(mcg, sizeof (struct tavor_sw_mcg_list_s));
631 			mutex_exit(&state->ts_mcglock);
632 			TAVOR_WARNING(state, "failed to write MCG entry");
633 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
634 			    "%08x\n", status);
635 			return (ibc_get_ci_failure(0));
636 		}
637 
638 		/*
639 		 * Now that we know all the Tavor firmware accesses have been
640 		 * successful, we update the "shadow" MCG entry by incrementing
641 		 * the "number of attached QPs" count.
642 		 *
643 		 * We increment only if the QP is not already part of the
644 		 * MCG by checking the 'qp_found' flag returned from the
645 		 * qplist_add above.
646 		 */
647 		if (!qp_found) {
648 			mcg->mcg_num_qps++;
649 
650 			/*
651 			 * Increment the refcnt for this QP.  Because the QP
652 			 * was added to this MCG, the refcnt must be
653 			 * incremented.
654 			 */
655 			tavor_qp_mcg_refcnt_inc(qp);
656 		}
657 
658 		/*
659 		 * We drop the lock and return success.
660 		 */
661 		mutex_exit(&state->ts_mcglock);
662 		return (DDI_SUCCESS);
663 	}
664 
665 	/*
666 	 * If the specified MGID matches the MGID in the current entry, then
667 	 * we need to try to add the QP to the current MCG entry.  In this
668 	 * case, it means that we need to read the existing MCG entry (into
669 	 * the temporary MCG), add the new QP number to the temporary entry
670 	 * (using the same method we used above), and write the entry back
671 	 * to the hardware (same as above).
672 	 */
673 	if ((mcg->mcg_mgid_h == gid.gid_prefix) &&
674 	    (mcg->mcg_mgid_l == gid.gid_guid)) {
675 
676 		/*
677 		 * Read the current MCG entry into the temporary MCG.  Note:
678 		 * In general, this operation shouldn't fail.  If it does,
679 		 * then it is an indication that something (probably in HW,
680 		 * but maybe in SW) has gone seriously wrong.
681 		 */
682 		status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
683 		    TAVOR_CMD_NOSLEEP_SPIN);
684 		if (status != TAVOR_CMD_SUCCESS) {
685 			mutex_exit(&state->ts_mcglock);
686 			TAVOR_WARNING(state, "failed to read MCG entry");
687 			cmn_err(CE_CONT, "Tavor: READ_MGM command failed: "
688 			    "%08x\n", status);
689 			return (ibc_get_ci_failure(0));
690 		}
691 
692 		/*
693 		 * Try to add the new QP number to the list.  This routine
694 		 * fills in the necessary pieces of the temporary MCG.  The
695 		 * "mcg_entry_qplist" pointer is used to point to the portion
696 		 * of the temporary MCG that holds the QP numbers.
697 		 *
698 		 * Note: tavor_mcg_qplist_add() returns SUCCESS if it
699 		 * already found the QP in the list.  In this case, the QP is
700 		 * not added on to the list again.  Check the flag 'qp_found'
701 		 * if this value is needed to be known.
702 		 *
703 		 */
704 		status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
705 		    &qp_found);
706 		if (status != DDI_SUCCESS) {
707 			mutex_exit(&state->ts_mcglock);
708 			/* Set "status" and "errormsg" and goto failure */
709 			goto mcgattach_fail;
710 		}
711 
712 		/*
713 		 * Once the temporary MCG has been updated, write the entry
714 		 * into the appropriate location in the Tavor MCG entry table.
715 		 * If it's successful, then drop the lock and return success.
716 		 * Note: In general, this operation shouldn't fail.  If it
717 		 * does, then it is an indication that something (probably in
718 		 * HW, but maybe in SW) has gone seriously wrong.
719 		 */
720 		status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
721 		    TAVOR_CMD_NOSLEEP_SPIN);
722 		if (status != TAVOR_CMD_SUCCESS) {
723 			mutex_exit(&state->ts_mcglock);
724 			TAVOR_WARNING(state, "failed to write MCG entry");
725 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
726 			    "%08x\n", status);
727 			return (ibc_get_ci_failure(0));
728 		}
729 
730 		/*
731 		 * Now that we know all the Tavor firmware accesses have been
732 		 * successful, we update the current "shadow" MCG entry by
733 		 * incrementing the "number of attached QPs" count.
734 		 *
735 		 * We increment only if the QP is not already part of the
736 		 * MCG by checking the 'qp_found' flag returned from the
737 		 * qplist_add above.
738 		 */
739 		if (!qp_found) {
740 			mcg->mcg_num_qps++;
741 
742 			/*
743 			 * Increment the refcnt for this QP.  Because the QP
744 			 * was added to this MCG, the refcnt must be
745 			 * incremented.
746 			 */
747 			tavor_qp_mcg_refcnt_inc(qp);
748 		}
749 
750 		/*
751 		 * We drop the lock and return success.
752 		 */
753 		mutex_exit(&state->ts_mcglock);
754 		return (DDI_SUCCESS);
755 	}
756 
757 	/*
758 	 * If we've reached here, then we're at the end of the hash chain.
759 	 * We need to allocate a new MCG entry, fill it in, write it to Tavor,
760 	 * and update the previous entry to link the new one to the end of the
761 	 * chain.
762 	 */
763 
764 	/*
765 	 * Allocate an MCG table entry.  This will be filled in with all
766 	 * the necessary parameters to define the multicast group.  Then it
767 	 * will be written to the hardware in the next-to-last step below.
768 	 */
769 	status = tavor_rsrc_alloc(state, TAVOR_MCG, 1, TAVOR_NOSLEEP, &rsrc);
770 	if (status != DDI_SUCCESS) {
771 		mutex_exit(&state->ts_mcglock);
772 		goto mcgattach_fail;
773 	}
774 
775 	/*
776 	 * Fill in the new entry in the "shadow" MCG list.  Note:  Just as
777 	 * it does above, tavor_mcg_setup_new_hdr() also fills in a portion
778 	 * of the temporary MCG entry (the rest of which will be filled in by
779 	 * tavor_mcg_qplist_add() below)
780 	 */
781 	newmcg = &state->ts_mcghdl[rsrc->tr_indx];
782 	tavor_mcg_setup_new_hdr(newmcg, mcg_entry, gid, rsrc);
783 
784 	/*
785 	 * Try to add the new QP number to the list.  This routine fills in
786 	 * the final necessary pieces of the temporary MCG.  The
787 	 * "mcg_entry_qplist" pointer is used to point to the portion of the
788 	 * temporary MCG that holds the QP numbers.  If we fail here, we
789 	 * must undo the previous resource allocation.
790 	 *
791 	 * Note: tavor_mcg_qplist_add() returns SUCCESS if it already
792 	 * found the QP in the list.  In this case, the QP is not added on to
793 	 * the list again.  Check the flag 'qp_found' if this value is needed
794 	 * to be known.
795 	 */
796 	status = tavor_mcg_qplist_add(state, newmcg, mcg_entry_qplist, qp,
797 	    &qp_found);
798 	if (status != DDI_SUCCESS) {
799 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
800 		tavor_rsrc_free(state, &rsrc);
801 		mutex_exit(&state->ts_mcglock);
802 		goto mcgattach_fail;
803 	}
804 
805 	/*
806 	 * Once the temporary MCG has been updated, write the entry into the
807 	 * appropriate location in the Tavor MCG entry table.  If this is
808 	 * successful, then we need to chain the previous entry to this one.
809 	 * Note: In general, this operation shouldn't fail.  If it does, then
810 	 * it is an indication that something (probably in HW, but maybe in
811 	 * SW) has gone seriously wrong.
812 	 */
813 	status = tavor_write_mgm_cmd_post(state, mcg_entry, rsrc->tr_indx,
814 	    TAVOR_CMD_NOSLEEP_SPIN);
815 	if (status != TAVOR_CMD_SUCCESS) {
816 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
817 		tavor_rsrc_free(state, &rsrc);
818 		mutex_exit(&state->ts_mcglock);
819 		TAVOR_WARNING(state, "failed to write MCG entry");
820 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
821 		    status);
822 		return (ibc_get_ci_failure(0));
823 	}
824 
825 	/*
826 	 * Now read the current MCG entry (the one previously at the end of
827 	 * hash chain) into the temporary MCG.  We are going to update its
828 	 * "next_gid_indx" now and write the entry back to the MCG table.
829 	 * Note:  In general, this operation shouldn't fail.  If it does, then
830 	 * it is an indication that something (probably in HW, but maybe in SW)
831 	 * has gone seriously wrong.  We will free up the MCG entry resource,
832 	 * but we will not undo the previously written MCG entry in the HW.
833 	 * This is OK, though, because the MCG entry is not currently attached
834 	 * to any hash chain.
835 	 */
836 	status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
837 	    TAVOR_CMD_NOSLEEP_SPIN);
838 	if (status != TAVOR_CMD_SUCCESS) {
839 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
840 		tavor_rsrc_free(state, &rsrc);
841 		mutex_exit(&state->ts_mcglock);
842 		TAVOR_WARNING(state, "failed to read MCG entry");
843 		cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
844 		    status);
845 		return (ibc_get_ci_failure(0));
846 	}
847 
848 	/*
849 	 * Finally, we update the "next_gid_indx" field in the temporary MCG
850 	 * and attempt to write the entry back into the Tavor MCG table.  If
851 	 * this succeeds, then we update the "shadow" list to reflect the
852 	 * change, drop the lock, and return success.  Note:  In general, this
853 	 * operation shouldn't fail.  If it does, then it is an indication
854 	 * that something (probably in HW, but maybe in SW) has gone seriously
855 	 * wrong.  Just as we do above, we will free up the MCG entry resource,
856 	 * but we will not try to undo the previously written MCG entry.  This
857 	 * is OK, though, because (since we failed here to update the end of
858 	 * the chain) that other entry is not currently attached to any chain.
859 	 */
860 	mcg_entry->next_gid_indx = rsrc->tr_indx;
861 	status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
862 	    TAVOR_CMD_NOSLEEP_SPIN);
863 	if (status != TAVOR_CMD_SUCCESS) {
864 		bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s));
865 		tavor_rsrc_free(state, &rsrc);
866 		mutex_exit(&state->ts_mcglock);
867 		TAVOR_WARNING(state, "failed to write MCG entry");
868 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
869 		    status);
870 		return (ibc_get_ci_failure(0));
871 	}
872 	mcg = &state->ts_mcghdl[end_indx];
873 	mcg->mcg_next_indx = rsrc->tr_indx;
874 
875 	/*
876 	 * Now that we know all the Tavor firmware accesses have been
877 	 * successful, we update the new "shadow" MCG entry by incrementing
878 	 * the "number of attached QPs" count.  Then we drop the lock and
879 	 * return success.
880 	 */
881 	newmcg->mcg_num_qps++;
882 
883 	/*
884 	 * Increment the refcnt for this QP.  Because the QP
885 	 * was added to this MCG, the refcnt must be
886 	 * incremented.
887 	 */
888 	tavor_qp_mcg_refcnt_inc(qp);
889 
890 	mutex_exit(&state->ts_mcglock);
891 	return (DDI_SUCCESS);
892 
893 mcgattach_fail:
894 	return (status);
895 }
896 
897 
898 /*
899  * tavor_mcg_detach()
900  *    Context: Can be called only from user or kernel context.
901  */
902 int
tavor_mcg_detach(tavor_state_t * state,tavor_qphdl_t qp,ib_gid_t gid,ib_lid_t lid)903 tavor_mcg_detach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid,
904     ib_lid_t lid)
905 {
906 	tavor_hw_mcg_t		*mcg_entry;
907 	tavor_hw_mcg_qp_list_t	*mcg_entry_qplist;
908 	tavor_mcghdl_t		mcg;
909 	uint64_t		mgid_hash;
910 	uint32_t		end_indx, prev_indx;
911 	int			status;
912 
913 	/*
914 	 * Check for invalid Multicast DLID.  Specifically, all Multicast
915 	 * LIDs should be within a well defined range.  If the specified LID
916 	 * is outside of that range, then return an error.
917 	 */
918 	if (tavor_mlid_is_valid(lid) == 0) {
919 		return (IBT_MC_MLID_INVALID);
920 	}
921 
922 	/*
923 	 * Compute the MGID hash value.  As described above, the MCG table is
924 	 * arranged as a number of separate hash chains.  This operation
925 	 * converts the specified MGID into the starting index of an entry in
926 	 * the hash table (i.e. the index for the start of the appropriate
927 	 * hash chain).  Subsequent operations below will walk the chain
928 	 * searching for a matching entry from which to attempt to remove
929 	 * the specified QP.
930 	 */
931 	status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
932 	    &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT());
933 	if (status != TAVOR_CMD_SUCCESS) {
934 		cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n",
935 		    status);
936 		return (ibc_get_ci_failure(0));
937 	}
938 
939 	/*
940 	 * Grab the multicast group mutex.  Then grab the pre-allocated
941 	 * temporary buffer used for holding and/or modifying MCG entries.
942 	 */
943 	mutex_enter(&state->ts_mcglock);
944 	mcg_entry = state->ts_mcgtmp;
945 	mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry);
946 
947 	/*
948 	 * Walk through the array of MCG entries starting at "mgid_hash".
949 	 * Try to find an MCG entry with a matching MGID.  The
950 	 * tavor_mcg_walk_mgid_hash() routine walks the list and returns an
951 	 * index into the MCG table.  The entry at this index is checked to
952 	 * determine whether it is a match or not.  If it is a match, then
953 	 * we continue on to attempt to remove the QP from the MCG.  If it
954 	 * is not a match (or not a valid MCG entry), then we return an error.
955 	 */
956 	end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, &prev_indx);
957 	mcg	 = &state->ts_mcghdl[end_indx];
958 
959 	/*
960 	 * If MGID == 0 (the hash chain is empty) or if the specified MGID
961 	 * does not match the MGID in the current entry, then return
962 	 * IBT_MC_MGID_INVALID (to indicate that the specified MGID is not
963 	 * valid).
964 	 */
965 	if (((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) ||
966 	    ((mcg->mcg_mgid_h != gid.gid_prefix) ||
967 	    (mcg->mcg_mgid_l != gid.gid_guid))) {
968 		mutex_exit(&state->ts_mcglock);
969 		return (IBT_MC_MGID_INVALID);
970 	}
971 
972 	/*
973 	 * Read the current MCG entry into the temporary MCG.  Note: In
974 	 * general, this operation shouldn't fail.  If it does, then it is
975 	 * an indication that something (probably in HW, but maybe in SW)
976 	 * has gone seriously wrong.
977 	 */
978 	status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx,
979 	    TAVOR_CMD_NOSLEEP_SPIN);
980 	if (status != TAVOR_CMD_SUCCESS) {
981 		mutex_exit(&state->ts_mcglock);
982 		TAVOR_WARNING(state, "failed to read MCG entry");
983 		cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
984 		    status);
985 		return (ibc_get_ci_failure(0));
986 	}
987 
988 	/*
989 	 * Search the QP number list for a match.  If a match is found, then
990 	 * remove the entry from the QP list.  Otherwise, if no match is found,
991 	 * return an error.
992 	 */
993 	status = tavor_mcg_qplist_remove(mcg, mcg_entry_qplist, qp);
994 	if (status != DDI_SUCCESS) {
995 		mutex_exit(&state->ts_mcglock);
996 		return (status);
997 	}
998 
999 	/*
1000 	 * Decrement the MCG count for this QP.  When the 'qp_mcg'
1001 	 * field becomes 0, then this QP is no longer a member of any
1002 	 * MCG.
1003 	 */
1004 	tavor_qp_mcg_refcnt_dec(qp);
1005 
1006 	/*
1007 	 * If the current MCG's QP number list is about to be made empty
1008 	 * ("mcg_num_qps" == 1), then remove the entry itself from the hash
1009 	 * chain.  Otherwise, just write the updated MCG entry back to the
1010 	 * hardware.  In either case, once we successfully update the hardware
1011 	 * chain, then we decrement the "shadow" list entry's "mcg_num_qps"
1012 	 * count (or zero out the entire "shadow" list entry) before returning
1013 	 * success.  Note:  Zeroing out the "shadow" list entry is done
1014 	 * inside of tavor_mcg_hash_list_remove().
1015 	 */
1016 	if (mcg->mcg_num_qps == 1) {
1017 
1018 		/* Remove an MCG entry from the hash chain */
1019 		status = tavor_mcg_hash_list_remove(state, end_indx, prev_indx,
1020 		    mcg_entry);
1021 		if (status != DDI_SUCCESS) {
1022 			mutex_exit(&state->ts_mcglock);
1023 			return (status);
1024 		}
1025 
1026 	} else {
1027 		/*
1028 		 * Write the updated MCG entry back to the Tavor MCG table.
1029 		 * If this succeeds, then we update the "shadow" list to
1030 		 * reflect the change (i.e. decrement the "mcg_num_qps"),
1031 		 * drop the lock, and return success.  Note:  In general,
1032 		 * this operation shouldn't fail.  If it does, then it is an
1033 		 * indication that something (probably in HW, but maybe in SW)
1034 		 * has gone seriously wrong.
1035 		 */
1036 		status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx,
1037 		    TAVOR_CMD_NOSLEEP_SPIN);
1038 		if (status != TAVOR_CMD_SUCCESS) {
1039 			mutex_exit(&state->ts_mcglock);
1040 			TAVOR_WARNING(state, "failed to write MCG entry");
1041 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
1042 			    "%08x\n", status);
1043 			return (ibc_get_ci_failure(0));
1044 		}
1045 		mcg->mcg_num_qps--;
1046 	}
1047 
1048 	mutex_exit(&state->ts_mcglock);
1049 	return (DDI_SUCCESS);
1050 }
1051 
1052 /*
1053  * tavor_qp_mcg_refcnt_inc()
1054  *    Context: Can be called from interrupt or base context.
1055  */
1056 static void
tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp)1057 tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp)
1058 {
1059 	/* Increment the QP's MCG reference count */
1060 	mutex_enter(&qp->qp_lock);
1061 	qp->qp_mcg_refcnt++;
1062 	mutex_exit(&qp->qp_lock);
1063 }
1064 
1065 
1066 /*
1067  * tavor_qp_mcg_refcnt_dec()
1068  *    Context: Can be called from interrupt or base context.
1069  */
1070 static void
tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp)1071 tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp)
1072 {
1073 	/* Decrement the QP's MCG reference count */
1074 	mutex_enter(&qp->qp_lock);
1075 	qp->qp_mcg_refcnt--;
1076 	mutex_exit(&qp->qp_lock);
1077 }
1078 
1079 
1080 /*
1081  * tavor_mcg_qplist_add()
1082  *    Context: Can be called from interrupt or base context.
1083  */
1084 static int
tavor_mcg_qplist_add(tavor_state_t * state,tavor_mcghdl_t mcg,tavor_hw_mcg_qp_list_t * mcg_qplist,tavor_qphdl_t qp,uint_t * qp_found)1085 tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg,
1086     tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp,
1087     uint_t *qp_found)
1088 {
1089 	uint_t		qplist_indx;
1090 
1091 	ASSERT(MUTEX_HELD(&state->ts_mcglock));
1092 
1093 	qplist_indx = mcg->mcg_num_qps;
1094 
1095 	/*
1096 	 * Determine if we have exceeded the maximum number of QP per
1097 	 * multicast group.  If we have, then return an error
1098 	 */
1099 	if (qplist_indx >= state->ts_cfg_profile->cp_num_qp_per_mcg) {
1100 		return (IBT_HCA_MCG_QP_EXCEEDED);
1101 	}
1102 
1103 	/*
1104 	 * Determine if the QP is already attached to this MCG table.  If it
1105 	 * is, then we break out and treat this operation as a NO-OP
1106 	 */
1107 	for (qplist_indx = 0; qplist_indx < mcg->mcg_num_qps;
1108 	    qplist_indx++) {
1109 		if (mcg_qplist[qplist_indx].qpn == qp->qp_qpnum) {
1110 			break;
1111 		}
1112 	}
1113 
1114 	/*
1115 	 * If the QP was already on the list, set 'qp_found' to TRUE.  We still
1116 	 * return SUCCESS in this case, but the qplist will not have been
1117 	 * updated because the QP was already on the list.
1118 	 */
1119 	if (qplist_indx < mcg->mcg_num_qps) {
1120 		*qp_found = 1;
1121 	} else {
1122 		/*
1123 		 * Otherwise, append the new QP number to the end of the
1124 		 * current QP list.  Note: We will increment the "mcg_num_qps"
1125 		 * field on the "shadow" MCG list entry later (after we know
1126 		 * that all necessary Tavor firmware accesses have been
1127 		 * successful).
1128 		 *
1129 		 * Set 'qp_found' to 0 so we know the QP was added on to the
1130 		 * list for sure.
1131 		 */
1132 		mcg_qplist[qplist_indx].q   = TAVOR_MCG_QPN_VALID;
1133 		mcg_qplist[qplist_indx].qpn = qp->qp_qpnum;
1134 		*qp_found = 0;
1135 	}
1136 
1137 	return (DDI_SUCCESS);
1138 }
1139 
1140 
1141 
1142 /*
1143  * tavor_mcg_qplist_remove()
1144  *    Context: Can be called from interrupt or base context.
1145  */
1146 static int
tavor_mcg_qplist_remove(tavor_mcghdl_t mcg,tavor_hw_mcg_qp_list_t * mcg_qplist,tavor_qphdl_t qp)1147 tavor_mcg_qplist_remove(tavor_mcghdl_t mcg, tavor_hw_mcg_qp_list_t *mcg_qplist,
1148     tavor_qphdl_t qp)
1149 {
1150 	uint_t		i, qplist_indx;
1151 
1152 	/*
1153 	 * Search the MCG QP list for a matching QPN.  When
1154 	 * it's found, we swap the last entry with the current
1155 	 * one, set the last entry to zero, decrement the last
1156 	 * entry, and return.  If it's not found, then it's
1157 	 * and error.
1158 	 */
1159 	qplist_indx = mcg->mcg_num_qps;
1160 	for (i = 0; i < qplist_indx; i++) {
1161 		if (mcg_qplist[i].qpn == qp->qp_qpnum) {
1162 			mcg_qplist[i] = mcg_qplist[qplist_indx - 1];
1163 			mcg_qplist[qplist_indx - 1].q = TAVOR_MCG_QPN_INVALID;
1164 			mcg_qplist[qplist_indx - 1].qpn = 0;
1165 
1166 			return (DDI_SUCCESS);
1167 		}
1168 	}
1169 
1170 	return (IBT_QP_HDL_INVALID);
1171 }
1172 
1173 
1174 /*
1175  * tavor_mcg_walk_mgid_hash()
1176  *    Context: Can be called from interrupt or base context.
1177  */
1178 static uint_t
tavor_mcg_walk_mgid_hash(tavor_state_t * state,uint64_t start_indx,ib_gid_t mgid,uint_t * p_indx)1179 tavor_mcg_walk_mgid_hash(tavor_state_t *state, uint64_t start_indx,
1180     ib_gid_t mgid, uint_t *p_indx)
1181 {
1182 	tavor_mcghdl_t	curr_mcghdl;
1183 	uint_t		curr_indx, prev_indx;
1184 
1185 	ASSERT(MUTEX_HELD(&state->ts_mcglock));
1186 
1187 	/* Start at the head of the hash chain */
1188 	curr_indx   = start_indx;
1189 	prev_indx   = curr_indx;
1190 	curr_mcghdl = &state->ts_mcghdl[curr_indx];
1191 
1192 	/* If the first entry in the chain has MGID == 0, then stop */
1193 	if ((curr_mcghdl->mcg_mgid_h == 0) &&
1194 	    (curr_mcghdl->mcg_mgid_l == 0)) {
1195 		goto end_mgid_hash_walk;
1196 	}
1197 
1198 	/* If the first entry in the chain matches the MGID, then stop */
1199 	if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1200 	    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1201 		goto end_mgid_hash_walk;
1202 	}
1203 
1204 	/* Otherwise, walk the hash chain looking for a match */
1205 	while (curr_mcghdl->mcg_next_indx != 0) {
1206 		prev_indx = curr_indx;
1207 		curr_indx = curr_mcghdl->mcg_next_indx;
1208 		curr_mcghdl = &state->ts_mcghdl[curr_indx];
1209 
1210 		if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1211 		    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1212 			break;
1213 		}
1214 	}
1215 
1216 end_mgid_hash_walk:
1217 	/*
1218 	 * If necessary, return the index of the previous entry too.  This
1219 	 * is primarily used for detaching a QP from a multicast group.  It
1220 	 * may be necessary, in that case, to delete an MCG entry from the
1221 	 * hash chain and having the index of the previous entry is helpful.
1222 	 */
1223 	if (p_indx != NULL) {
1224 		*p_indx = prev_indx;
1225 	}
1226 	return (curr_indx);
1227 }
1228 
1229 
1230 /*
1231  * tavor_mcg_setup_new_hdr()
1232  *    Context: Can be called from interrupt or base context.
1233  */
1234 static void
tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg,tavor_hw_mcg_t * mcg_hdr,ib_gid_t mgid,tavor_rsrc_t * mcg_rsrc)1235 tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg, tavor_hw_mcg_t *mcg_hdr,
1236     ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc)
1237 {
1238 	/*
1239 	 * Fill in the fields of the "shadow" entry used by software
1240 	 * to track MCG hardware entry
1241 	 */
1242 	mcg->mcg_mgid_h	   = mgid.gid_prefix;
1243 	mcg->mcg_mgid_l	   = mgid.gid_guid;
1244 	mcg->mcg_rsrcp	   = mcg_rsrc;
1245 	mcg->mcg_next_indx = 0;
1246 	mcg->mcg_num_qps   = 0;
1247 
1248 	/*
1249 	 * Fill the header fields of the MCG entry (in the temporary copy)
1250 	 */
1251 	mcg_hdr->mgid_h		= mgid.gid_prefix;
1252 	mcg_hdr->mgid_l		= mgid.gid_guid;
1253 	mcg_hdr->next_gid_indx	= 0;
1254 }
1255 
1256 
1257 /*
1258  * tavor_mcg_hash_list_remove()
1259  *    Context: Can be called only from user or kernel context.
1260  */
1261 static int
tavor_mcg_hash_list_remove(tavor_state_t * state,uint_t curr_indx,uint_t prev_indx,tavor_hw_mcg_t * mcg_entry)1262 tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx,
1263     uint_t prev_indx, tavor_hw_mcg_t *mcg_entry)
1264 {
1265 	tavor_mcghdl_t		curr_mcg, prev_mcg, next_mcg;
1266 	uint_t			next_indx;
1267 	int			status;
1268 
1269 	/* Get the pointer to "shadow" list for current entry */
1270 	curr_mcg = &state->ts_mcghdl[curr_indx];
1271 
1272 	/*
1273 	 * If this is the first entry on a hash chain, then attempt to replace
1274 	 * the entry with the next entry on the chain.  If there are no
1275 	 * subsequent entries on the chain, then this is the only entry and
1276 	 * should be invalidated.
1277 	 */
1278 	if (curr_indx == prev_indx) {
1279 
1280 		/*
1281 		 * If this is the only entry on the chain, then invalidate it.
1282 		 * Note:  Invalidating an MCG entry means writing all zeros
1283 		 * to the entry.  This is only necessary for those MCG
1284 		 * entries that are the "head" entries of the individual hash
1285 		 * chains.  Regardless of whether this operation returns
1286 		 * success or failure, return that result to the caller.
1287 		 */
1288 		next_indx = curr_mcg->mcg_next_indx;
1289 		if (next_indx == 0) {
1290 			status = tavor_mcg_entry_invalidate(state, mcg_entry,
1291 			    curr_indx);
1292 			bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1293 			return (status);
1294 		}
1295 
1296 		/*
1297 		 * Otherwise, this is just the first entry on the chain, so
1298 		 * grab the next one
1299 		 */
1300 		next_mcg = &state->ts_mcghdl[next_indx];
1301 
1302 		/*
1303 		 * Read the next MCG entry into the temporary MCG.  Note:
1304 		 * In general, this operation shouldn't fail.  If it does,
1305 		 * then it is an indication that something (probably in HW,
1306 		 * but maybe in SW) has gone seriously wrong.
1307 		 */
1308 		status = tavor_read_mgm_cmd_post(state, mcg_entry, next_indx,
1309 		    TAVOR_CMD_NOSLEEP_SPIN);
1310 		if (status != TAVOR_CMD_SUCCESS) {
1311 			TAVOR_WARNING(state, "failed to read MCG entry");
1312 			cmn_err(CE_CONT, "Tavor: READ_MGM command failed: "
1313 			    "%08x\n", status);
1314 			return (ibc_get_ci_failure(0));
1315 		}
1316 
1317 		/*
1318 		 * Copy/Write the temporary MCG back to the hardware MCG list
1319 		 * using the current index.  This essentially removes the
1320 		 * current MCG entry from the list by writing over it with
1321 		 * the next one.  If this is successful, then we can do the
1322 		 * same operation for the "shadow" list.  And we can also
1323 		 * free up the Tavor MCG entry resource that was associated
1324 		 * with the (old) next entry.  Note:  In general, this
1325 		 * operation shouldn't fail.  If it does, then it is an
1326 		 * indication that something (probably in HW, but maybe in SW)
1327 		 * has gone seriously wrong.
1328 		 */
1329 		status = tavor_write_mgm_cmd_post(state, mcg_entry, curr_indx,
1330 		    TAVOR_CMD_NOSLEEP_SPIN);
1331 		if (status != TAVOR_CMD_SUCCESS) {
1332 			TAVOR_WARNING(state, "failed to write MCG entry");
1333 			cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: "
1334 			    "%08x\n", status);
1335 			return (ibc_get_ci_failure(0));
1336 		}
1337 
1338 		/*
1339 		 * Copy all the software tracking information from the next
1340 		 * entry on the "shadow" MCG list into the current entry on
1341 		 * the list.  Then invalidate (zero out) the other "shadow"
1342 		 * list entry.
1343 		 */
1344 		bcopy(next_mcg, curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1345 		bzero(next_mcg, sizeof (struct tavor_sw_mcg_list_s));
1346 
1347 		/*
1348 		 * Free up the Tavor MCG entry resource used by the "next"
1349 		 * MCG entry.  That resource is no longer needed by any
1350 		 * MCG entry which is first on a hash chain (like the "next"
1351 		 * entry has just become).
1352 		 */
1353 		tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1354 
1355 		return (DDI_SUCCESS);
1356 	}
1357 
1358 	/*
1359 	 * Else if this is the last entry on the hash chain (or a middle
1360 	 * entry, then we update the previous entry's "next_gid_index" field
1361 	 * to make it point instead to the next entry on the chain.  By
1362 	 * skipping over the removed entry in this way, we can then free up
1363 	 * any resources associated with the current entry.  Note:  We don't
1364 	 * need to invalidate the "skipped over" hardware entry because it
1365 	 * will no be longer connected to any hash chains, and if/when it is
1366 	 * finally re-used, it will be written with entirely new values.
1367 	 */
1368 
1369 	/*
1370 	 * Read the next MCG entry into the temporary MCG.  Note:  In general,
1371 	 * this operation shouldn't fail.  If it does, then it is an
1372 	 * indication that something (probably in HW, but maybe in SW) has
1373 	 * gone seriously wrong.
1374 	 */
1375 	status = tavor_read_mgm_cmd_post(state, mcg_entry, prev_indx,
1376 	    TAVOR_CMD_NOSLEEP_SPIN);
1377 	if (status != TAVOR_CMD_SUCCESS) {
1378 		TAVOR_WARNING(state, "failed to read MCG entry");
1379 		cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n",
1380 		    status);
1381 		return (ibc_get_ci_failure(0));
1382 	}
1383 
1384 	/*
1385 	 * Finally, we update the "next_gid_indx" field in the temporary MCG
1386 	 * and attempt to write the entry back into the Tavor MCG table.  If
1387 	 * this succeeds, then we update the "shadow" list to reflect the
1388 	 * change, free up the Tavor MCG entry resource that was associated
1389 	 * with the current entry, and return success.  Note:  In general,
1390 	 * this operation shouldn't fail.  If it does, then it is an indication
1391 	 * that something (probably in HW, but maybe in SW) has gone seriously
1392 	 * wrong.
1393 	 */
1394 	mcg_entry->next_gid_indx = curr_mcg->mcg_next_indx;
1395 	status = tavor_write_mgm_cmd_post(state, mcg_entry, prev_indx,
1396 	    TAVOR_CMD_NOSLEEP_SPIN);
1397 	if (status != TAVOR_CMD_SUCCESS) {
1398 		TAVOR_WARNING(state, "failed to write MCG entry");
1399 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
1400 		    status);
1401 		return (ibc_get_ci_failure(0));
1402 	}
1403 
1404 	/*
1405 	 * Get the pointer to the "shadow" MCG list entry for the previous
1406 	 * MCG.  Update its "mcg_next_indx" to point to the next entry
1407 	 * the one after the current entry. Note:  This next index may be
1408 	 * zero, indicating the end of the list.
1409 	 */
1410 	prev_mcg = &state->ts_mcghdl[prev_indx];
1411 	prev_mcg->mcg_next_indx = curr_mcg->mcg_next_indx;
1412 
1413 	/*
1414 	 * Free up the Tavor MCG entry resource used by the current entry.
1415 	 * This resource is no longer needed because the chain now skips over
1416 	 * the current entry.  Then invalidate (zero out) the current "shadow"
1417 	 * list entry.
1418 	 */
1419 	tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1420 	bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s));
1421 
1422 	return (DDI_SUCCESS);
1423 }
1424 
1425 
1426 /*
1427  * tavor_mcg_entry_invalidate()
1428  *    Context: Can be called only from user or kernel context.
1429  */
1430 static int
tavor_mcg_entry_invalidate(tavor_state_t * state,tavor_hw_mcg_t * mcg_entry,uint_t indx)1431 tavor_mcg_entry_invalidate(tavor_state_t *state, tavor_hw_mcg_t *mcg_entry,
1432     uint_t indx)
1433 {
1434 	int		status;
1435 
1436 	/*
1437 	 * Invalidate the hardware MCG entry by zeroing out this temporary
1438 	 * MCG and writing it the the hardware.  Note: In general, this
1439 	 * operation shouldn't fail.  If it does, then it is an indication
1440 	 * that something (probably in HW, but maybe in SW) has gone seriously
1441 	 * wrong.
1442 	 */
1443 	bzero(mcg_entry, TAVOR_MCGMEM_SZ(state));
1444 	status = tavor_write_mgm_cmd_post(state, mcg_entry, indx,
1445 	    TAVOR_CMD_NOSLEEP_SPIN);
1446 	if (status != TAVOR_CMD_SUCCESS) {
1447 		TAVOR_WARNING(state, "failed to write MCG entry");
1448 		cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n",
1449 		    status);
1450 		return (ibc_get_ci_failure(0));
1451 	}
1452 
1453 	return (DDI_SUCCESS);
1454 }
1455 
1456 
1457 /*
1458  * tavor_mgid_is_valid()
1459  *    Context: Can be called from interrupt or base context.
1460  */
1461 static int
tavor_mgid_is_valid(ib_gid_t gid)1462 tavor_mgid_is_valid(ib_gid_t gid)
1463 {
1464 	uint_t		topbits, flags, scope;
1465 
1466 	/*
1467 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1468 	 * "multicast GID" must have its top eight bits set to all ones
1469 	 */
1470 	topbits = (gid.gid_prefix >> TAVOR_MCG_TOPBITS_SHIFT) &
1471 	    TAVOR_MCG_TOPBITS_MASK;
1472 	if (topbits != TAVOR_MCG_TOPBITS) {
1473 		return (0);
1474 	}
1475 
1476 	/*
1477 	 * The next 4 bits are the "flag" bits.  These are valid only
1478 	 * if they are "0" (which correspond to permanently assigned/
1479 	 * "well-known" multicast GIDs) or "1" (for so-called "transient"
1480 	 * multicast GIDs).  All other values are reserved.
1481 	 */
1482 	flags = (gid.gid_prefix >> TAVOR_MCG_FLAGS_SHIFT) &
1483 	    TAVOR_MCG_FLAGS_MASK;
1484 	if (!((flags == TAVOR_MCG_FLAGS_PERM) ||
1485 	    (flags == TAVOR_MCG_FLAGS_NONPERM))) {
1486 		return (0);
1487 	}
1488 
1489 	/*
1490 	 * The next 4 bits are the "scope" bits.  These are valid only
1491 	 * if they are "2" (Link-local), "5" (Site-local), "8"
1492 	 * (Organization-local) or "E" (Global).  All other values
1493 	 * are reserved (or currently unassigned).
1494 	 */
1495 	scope = (gid.gid_prefix >> TAVOR_MCG_SCOPE_SHIFT) &
1496 	    TAVOR_MCG_SCOPE_MASK;
1497 	if (!((scope == TAVOR_MCG_SCOPE_LINKLOC) ||
1498 	    (scope == TAVOR_MCG_SCOPE_SITELOC)	 ||
1499 	    (scope == TAVOR_MCG_SCOPE_ORGLOC)	 ||
1500 	    (scope == TAVOR_MCG_SCOPE_GLOBAL))) {
1501 		return (0);
1502 	}
1503 
1504 	/*
1505 	 * If it passes all of the above checks, then we will consider it
1506 	 * a valid multicast GID.
1507 	 */
1508 	return (1);
1509 }
1510 
1511 
1512 /*
1513  * tavor_mlid_is_valid()
1514  *    Context: Can be called from interrupt or base context.
1515  */
1516 static int
tavor_mlid_is_valid(ib_lid_t lid)1517 tavor_mlid_is_valid(ib_lid_t lid)
1518 {
1519 	/*
1520 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1521 	 * "multicast DLID" must be between 0xC000 and 0xFFFE.
1522 	 */
1523 	if ((lid < IB_LID_MC_FIRST) || (lid > IB_LID_MC_LAST)) {
1524 		return (0);
1525 	}
1526 
1527 	return (1);
1528 }
1529 
1530 
1531 /*
1532  * tavor_pd_alloc()
1533  *    Context: Can be called only from user or kernel context.
1534  */
1535 int
tavor_pd_alloc(tavor_state_t * state,tavor_pdhdl_t * pdhdl,uint_t sleepflag)1536 tavor_pd_alloc(tavor_state_t *state, tavor_pdhdl_t *pdhdl, uint_t sleepflag)
1537 {
1538 	tavor_rsrc_t	*rsrc;
1539 	tavor_pdhdl_t	pd;
1540 	int		status;
1541 
1542 	/*
1543 	 * Allocate the software structure for tracking the protection domain
1544 	 * (i.e. the Tavor Protection Domain handle).  By default each PD
1545 	 * structure will have a unique PD number assigned to it.  All that
1546 	 * is necessary is for software to initialize the PD reference count
1547 	 * (to zero) and return success.
1548 	 */
1549 	status = tavor_rsrc_alloc(state, TAVOR_PDHDL, 1, sleepflag, &rsrc);
1550 	if (status != DDI_SUCCESS) {
1551 		return (IBT_INSUFF_RESOURCE);
1552 	}
1553 	pd = (tavor_pdhdl_t)rsrc->tr_addr;
1554 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1555 
1556 	pd->pd_refcnt = 0;
1557 	*pdhdl = pd;
1558 
1559 	return (DDI_SUCCESS);
1560 }
1561 
1562 
1563 /*
1564  * tavor_pd_free()
1565  *    Context: Can be called only from user or kernel context.
1566  */
1567 int
tavor_pd_free(tavor_state_t * state,tavor_pdhdl_t * pdhdl)1568 tavor_pd_free(tavor_state_t *state, tavor_pdhdl_t *pdhdl)
1569 {
1570 	tavor_rsrc_t	*rsrc;
1571 	tavor_pdhdl_t	pd;
1572 
1573 	/*
1574 	 * Pull all the necessary information from the Tavor Protection Domain
1575 	 * handle.  This is necessary here because the resource for the
1576 	 * PD is going to be freed up as part of this operation.
1577 	 */
1578 	pd   = *pdhdl;
1579 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1580 	rsrc = pd->pd_rsrcp;
1581 
1582 	/*
1583 	 * Check the PD reference count.  If the reference count is non-zero,
1584 	 * then it means that this protection domain is still referenced by
1585 	 * some memory region, queue pair, address handle, or other IB object
1586 	 * If it is non-zero, then return an error.  Otherwise, free the
1587 	 * Tavor resource and return success.
1588 	 */
1589 	if (pd->pd_refcnt != 0) {
1590 		return (IBT_PD_IN_USE);
1591 	}
1592 
1593 	/* Free the Tavor Protection Domain handle */
1594 	tavor_rsrc_free(state, &rsrc);
1595 
1596 	/* Set the pdhdl pointer to NULL and return success */
1597 	*pdhdl = (tavor_pdhdl_t)NULL;
1598 
1599 	return (DDI_SUCCESS);
1600 }
1601 
1602 
1603 /*
1604  * tavor_pd_refcnt_inc()
1605  *    Context: Can be called from interrupt or base context.
1606  */
1607 void
tavor_pd_refcnt_inc(tavor_pdhdl_t pd)1608 tavor_pd_refcnt_inc(tavor_pdhdl_t pd)
1609 {
1610 	/* Increment the protection domain's reference count */
1611 	mutex_enter(&pd->pd_lock);
1612 	pd->pd_refcnt++;
1613 	mutex_exit(&pd->pd_lock);
1614 
1615 }
1616 
1617 
1618 /*
1619  * tavor_pd_refcnt_dec()
1620  *    Context: Can be called from interrupt or base context.
1621  */
1622 void
tavor_pd_refcnt_dec(tavor_pdhdl_t pd)1623 tavor_pd_refcnt_dec(tavor_pdhdl_t pd)
1624 {
1625 	/* Decrement the protection domain's reference count */
1626 	mutex_enter(&pd->pd_lock);
1627 	pd->pd_refcnt--;
1628 	mutex_exit(&pd->pd_lock);
1629 
1630 }
1631 
1632 
1633 /*
1634  * tavor_port_query()
1635  *    Context: Can be called only from user or kernel context.
1636  */
1637 int
tavor_port_query(tavor_state_t * state,uint_t port,ibt_hca_portinfo_t * pi)1638 tavor_port_query(tavor_state_t *state, uint_t port, ibt_hca_portinfo_t *pi)
1639 {
1640 	sm_portinfo_t		portinfo;
1641 	sm_guidinfo_t		guidinfo;
1642 	sm_pkey_table_t		pkeytable;
1643 	ib_gid_t		*sgid;
1644 	uint_t			sgid_max, pkey_max, tbl_size;
1645 	int			i, j, indx, status;
1646 
1647 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pi))
1648 
1649 	/* Validate that specified port number is legal */
1650 	if (!tavor_portnum_is_valid(state, port)) {
1651 		return (IBT_HCA_PORT_INVALID);
1652 	}
1653 
1654 	/*
1655 	 * We use the Tavor MAD_IFC command to post a GetPortInfo MAD
1656 	 * to the firmware (for the specified port number).  This returns
1657 	 * a full PortInfo MAD (in "portinfo") which we subsequently
1658 	 * parse to fill in the "ibt_hca_portinfo_t" structure returned
1659 	 * to the IBTF.
1660 	 */
1661 	status = tavor_getportinfo_cmd_post(state, port,
1662 	    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
1663 	if (status != TAVOR_CMD_SUCCESS) {
1664 		cmn_err(CE_CONT, "Tavor: GetPortInfo (port %02d) command "
1665 		    "failed: %08x\n", port, status);
1666 		return (ibc_get_ci_failure(0));
1667 	}
1668 
1669 	/*
1670 	 * Parse the PortInfo MAD and fill in the IBTF structure
1671 	 */
1672 	pi->p_base_lid		= portinfo.LID;
1673 	pi->p_qkey_violations	= portinfo.Q_KeyViolations;
1674 	pi->p_pkey_violations	= portinfo.P_KeyViolations;
1675 	pi->p_sm_sl		= portinfo.MasterSMSL;
1676 	pi->p_sm_lid		= portinfo.MasterSMLID;
1677 	pi->p_linkstate		= portinfo.PortState;
1678 	pi->p_port_num		= portinfo.LocalPortNum;
1679 	pi->p_phys_state	= portinfo.PortPhysicalState;
1680 	pi->p_width_supported	= portinfo.LinkWidthSupported;
1681 	pi->p_width_enabled	= portinfo.LinkWidthEnabled;
1682 	pi->p_width_active	= portinfo.LinkWidthActive;
1683 	pi->p_speed_supported	= portinfo.LinkSpeedSupported;
1684 	pi->p_speed_enabled	= portinfo.LinkSpeedEnabled;
1685 	pi->p_speed_active	= portinfo.LinkSpeedActive;
1686 	pi->p_mtu		= portinfo.MTUCap;
1687 	pi->p_lmc		= portinfo.LMC;
1688 	pi->p_max_vl		= portinfo.VLCap;
1689 	pi->p_subnet_timeout	= portinfo.SubnetTimeOut;
1690 	pi->p_msg_sz		= ((uint32_t)1 << TAVOR_QP_LOG_MAX_MSGSZ);
1691 	tbl_size = state->ts_cfg_profile->cp_log_max_gidtbl;
1692 	pi->p_sgid_tbl_sz	= (1 << tbl_size);
1693 	tbl_size = state->ts_cfg_profile->cp_log_max_pkeytbl;
1694 	pi->p_pkey_tbl_sz	= (1 << tbl_size);
1695 
1696 	/*
1697 	 * Convert InfiniBand-defined port capability flags to the format
1698 	 * specified by the IBTF
1699 	 */
1700 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM)
1701 		pi->p_capabilities |= IBT_PORT_CAP_SM;
1702 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM_DISABLED)
1703 		pi->p_capabilities |= IBT_PORT_CAP_SM_DISABLED;
1704 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SNMP_SUPPD)
1705 		pi->p_capabilities |= IBT_PORT_CAP_SNMP_TUNNEL;
1706 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_DM_SUPPD)
1707 		pi->p_capabilities |= IBT_PORT_CAP_DM;
1708 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_VM_SUPPD)
1709 		pi->p_capabilities |= IBT_PORT_CAP_VENDOR;
1710 
1711 	/*
1712 	 * Fill in the SGID table.  Since the only access to the Tavor
1713 	 * GID tables is through the firmware's MAD_IFC interface, we
1714 	 * post as many GetGUIDInfo MADs as necessary to read in the entire
1715 	 * contents of the SGID table (for the specified port).  Note:  The
1716 	 * GetGUIDInfo command only gets eight GUIDs per operation.  These
1717 	 * GUIDs are then appended to the GID prefix for the port (from the
1718 	 * GetPortInfo above) to form the entire SGID table.
1719 	 */
1720 	for (i = 0; i < pi->p_sgid_tbl_sz; i += 8) {
1721 		status = tavor_getguidinfo_cmd_post(state, port, i >> 3,
1722 		    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &guidinfo);
1723 		if (status != TAVOR_CMD_SUCCESS) {
1724 			cmn_err(CE_CONT, "Tavor: GetGUIDInfo (port %02d) "
1725 			    "command failed: %08x\n", port, status);
1726 			return (ibc_get_ci_failure(0));
1727 		}
1728 
1729 		/* Figure out how many of the entries are valid */
1730 		sgid_max = min((pi->p_sgid_tbl_sz - i), 8);
1731 		for (j = 0; j < sgid_max; j++) {
1732 			indx = (i + j);
1733 			sgid = &pi->p_sgid_tbl[indx];
1734 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sgid))
1735 			sgid->gid_prefix = portinfo.GidPrefix;
1736 			sgid->gid_guid	 = guidinfo.GUIDBlocks[j];
1737 		}
1738 	}
1739 
1740 	/*
1741 	 * Fill in the PKey table.  Just as for the GID tables above, the
1742 	 * only access to the Tavor PKey tables is through the firmware's
1743 	 * MAD_IFC interface.  We post as many GetPKeyTable MADs as necessary
1744 	 * to read in the entire contents of the PKey table (for the specified
1745 	 * port).  Note:  The GetPKeyTable command only gets 32 PKeys per
1746 	 * operation.
1747 	 */
1748 	for (i = 0; i < pi->p_pkey_tbl_sz; i += 32) {
1749 		status = tavor_getpkeytable_cmd_post(state, port, i,
1750 		    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &pkeytable);
1751 		if (status != TAVOR_CMD_SUCCESS) {
1752 			cmn_err(CE_CONT, "Tavor: GetPKeyTable (port %02d) "
1753 			    "command failed: %08x\n", port, status);
1754 			return (ibc_get_ci_failure(0));
1755 		}
1756 
1757 		/* Figure out how many of the entries are valid */
1758 		pkey_max = min((pi->p_pkey_tbl_sz - i), 32);
1759 		for (j = 0; j < pkey_max; j++) {
1760 			indx = (i + j);
1761 			pi->p_pkey_tbl[indx] = pkeytable.P_KeyTableBlocks[j];
1762 		}
1763 	}
1764 
1765 	return (DDI_SUCCESS);
1766 }
1767 
1768 
1769 /*
1770  * tavor_port_modify()
1771  *    Context: Can be called only from user or kernel context.
1772  */
1773 /* ARGSUSED */
1774 int
tavor_port_modify(tavor_state_t * state,uint8_t port,ibt_port_modify_flags_t flags,uint8_t init_type)1775 tavor_port_modify(tavor_state_t *state, uint8_t port,
1776     ibt_port_modify_flags_t flags, uint8_t init_type)
1777 {
1778 	sm_portinfo_t	portinfo;
1779 	uint32_t	capmask, reset_qkey;
1780 	int		status;
1781 
1782 	/*
1783 	 * Return an error if either of the unsupported flags are set
1784 	 */
1785 	if ((flags & IBT_PORT_SHUTDOWN) ||
1786 	    (flags & IBT_PORT_SET_INIT_TYPE)) {
1787 		return (IBT_NOT_SUPPORTED);
1788 	}
1789 
1790 	/*
1791 	 * Determine whether we are trying to reset the QKey counter
1792 	 */
1793 	reset_qkey = (flags & IBT_PORT_RESET_QKEY) ? 1 : 0;
1794 
1795 	/* Validate that specified port number is legal */
1796 	if (!tavor_portnum_is_valid(state, port)) {
1797 		return (IBT_HCA_PORT_INVALID);
1798 	}
1799 
1800 	/*
1801 	 * Use the Tavor MAD_IFC command to post a GetPortInfo MAD to the
1802 	 * firmware (for the specified port number).  This returns a full
1803 	 * PortInfo MAD (in "portinfo") from which we pull the current
1804 	 * capability mask.  We then modify the capability mask as directed
1805 	 * by the "pmod_flags" field, and write the updated capability mask
1806 	 * using the Tavor SET_IB command (below).
1807 	 */
1808 	status = tavor_getportinfo_cmd_post(state, port,
1809 	    TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
1810 	if (status != TAVOR_CMD_SUCCESS) {
1811 		return (ibc_get_ci_failure(0));
1812 	}
1813 
1814 	/*
1815 	 * Convert InfiniBand-defined port capability flags to the format
1816 	 * specified by the IBTF.  Specifically, we modify the capability
1817 	 * mask based on the specified values.
1818 	 */
1819 	capmask = portinfo.CapabilityMask;
1820 
1821 	if (flags & IBT_PORT_RESET_SM)
1822 		capmask &= ~SM_CAP_MASK_IS_SM;
1823 	else if (flags & IBT_PORT_SET_SM)
1824 		capmask |= SM_CAP_MASK_IS_SM;
1825 
1826 	if (flags & IBT_PORT_RESET_SNMP)
1827 		capmask &= ~SM_CAP_MASK_IS_SNMP_SUPPD;
1828 	else if (flags & IBT_PORT_SET_SNMP)
1829 		capmask |= SM_CAP_MASK_IS_SNMP_SUPPD;
1830 
1831 	if (flags & IBT_PORT_RESET_DEVMGT)
1832 		capmask &= ~SM_CAP_MASK_IS_DM_SUPPD;
1833 	else if (flags & IBT_PORT_SET_DEVMGT)
1834 		capmask |= SM_CAP_MASK_IS_DM_SUPPD;
1835 
1836 	if (flags & IBT_PORT_RESET_VENDOR)
1837 		capmask &= ~SM_CAP_MASK_IS_VM_SUPPD;
1838 	else if (flags & IBT_PORT_SET_VENDOR)
1839 		capmask |= SM_CAP_MASK_IS_VM_SUPPD;
1840 
1841 	/*
1842 	 * Use the Tavor SET_IB command to update the capability mask and
1843 	 * (possibly) reset the QKey violation counter for the specified port.
1844 	 * Note: In general, this operation shouldn't fail.  If it does, then
1845 	 * it is an indication that something (probably in HW, but maybe in
1846 	 * SW) has gone seriously wrong.
1847 	 */
1848 	status = tavor_set_ib_cmd_post(state, capmask, port, reset_qkey,
1849 	    TAVOR_SLEEPFLAG_FOR_CONTEXT());
1850 	if (status != TAVOR_CMD_SUCCESS) {
1851 		TAVOR_WARNING(state, "failed to modify port capabilities");
1852 		cmn_err(CE_CONT, "Tavor: SET_IB (port %02d) command failed: "
1853 		    "%08x\n", port, status);
1854 		return (ibc_get_ci_failure(0));
1855 	}
1856 
1857 	return (DDI_SUCCESS);
1858 }
1859 
1860 
1861 /*
1862  * tavor_set_addr_path()
1863  *    Context: Can be called from interrupt or base context.
1864  *
1865  * Note: This routine is used for two purposes.  It is used to fill in the
1866  * Tavor UDAV fields, and it is used to fill in the address path information
1867  * for QPs.  Because the two Tavor structures are similar, common fields can
1868  * be filled in here.  Because they are slightly different, however, we pass
1869  * an additional flag to indicate which type is being filled.
1870  */
1871 int
tavor_set_addr_path(tavor_state_t * state,ibt_adds_vect_t * av,tavor_hw_addr_path_t * path,uint_t type,tavor_qphdl_t qp)1872 tavor_set_addr_path(tavor_state_t *state, ibt_adds_vect_t *av,
1873     tavor_hw_addr_path_t *path, uint_t type, tavor_qphdl_t qp)
1874 {
1875 	uint_t		gidtbl_sz;
1876 
1877 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
1878 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
1879 
1880 	path->ml_path	= av->av_src_path;
1881 	path->rlid	= av->av_dlid;
1882 	path->sl	= av->av_srvl;
1883 
1884 	/* Port number only valid (in "av_port_num") if this is a UDAV */
1885 	if (type == TAVOR_ADDRPATH_UDAV) {
1886 		path->portnum = av->av_port_num;
1887 	}
1888 
1889 	/*
1890 	 * Validate (and fill in) static rate.
1891 	 *
1892 	 * The stat_rate_sup is used to decide how to set the rate and
1893 	 * if it is zero, the driver uses the old interface.
1894 	 */
1895 	if (state->ts_devlim.stat_rate_sup) {
1896 		if (av->av_srate == IBT_SRATE_20) {
1897 			path->max_stat_rate = 0; /* 4x@DDR injection rate */
1898 		} else if (av->av_srate == IBT_SRATE_5) {
1899 			path->max_stat_rate = 3; /* 1x@DDR injection rate */
1900 		} else if (av->av_srate == IBT_SRATE_10) {
1901 			path->max_stat_rate = 2; /* 4x@SDR injection rate */
1902 		} else if (av->av_srate == IBT_SRATE_2) {
1903 			path->max_stat_rate = 1; /* 1x@SDR injection rate */
1904 		} else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) {
1905 			path->max_stat_rate = 0; /* Max */
1906 		} else {
1907 			return (IBT_STATIC_RATE_INVALID);
1908 		}
1909 	} else {
1910 		if (av->av_srate == IBT_SRATE_10) {
1911 			path->max_stat_rate = 0; /* 4x@SDR injection rate */
1912 		} else if (av->av_srate == IBT_SRATE_2) {
1913 			path->max_stat_rate = 1; /* 1x@SDR injection rate */
1914 		} else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) {
1915 			path->max_stat_rate = 0; /* Max */
1916 		} else {
1917 			return (IBT_STATIC_RATE_INVALID);
1918 		}
1919 	}
1920 
1921 	/*
1922 	 * If this is a QP operation save asoft copy.
1923 	 */
1924 	if (qp) {
1925 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(qp->qp_save_srate))
1926 		qp->qp_save_srate = av->av_srate;
1927 	}
1928 
1929 	/* If "grh" flag is set, then check for valid SGID index too */
1930 	gidtbl_sz = (1 << state->ts_devlim.log_max_gid);
1931 	if ((av->av_send_grh) && (av->av_sgid_ix > gidtbl_sz)) {
1932 		return (IBT_SGID_INVALID);
1933 	}
1934 
1935 	/*
1936 	 * Fill in all "global" values regardless of the value in the GRH
1937 	 * flag.  Because "grh" is not set unless "av_send_grh" is set, the
1938 	 * hardware will ignore the other "global" values as necessary.  Note:
1939 	 * SW does this here to enable later query operations to return
1940 	 * exactly the same params that were passed when the addr path was
1941 	 * last written.
1942 	 */
1943 	path->grh = av->av_send_grh;
1944 	if (type == TAVOR_ADDRPATH_QP) {
1945 		path->mgid_index = av->av_sgid_ix;
1946 	} else {
1947 		/*
1948 		 * For Tavor UDAV, the "mgid_index" field is the index into
1949 		 * a combined table (not a per-port table). So some extra
1950 		 * calculations are necessary.
1951 		 */
1952 		path->mgid_index = ((av->av_port_num - 1) * gidtbl_sz) +
1953 		    av->av_sgid_ix;
1954 	}
1955 	path->flow_label = av->av_flow;
1956 	path->tclass	 = av->av_tclass;
1957 	path->hop_limit	 = av->av_hop;
1958 	path->rgid_h	 = av->av_dgid.gid_prefix;
1959 
1960 	/*
1961 	 * According to Tavor PRM, the (31:0) part of rgid_l must be set to
1962 	 * "0x2" if the 'grh' or 'g' bit is cleared.  It also says that we
1963 	 * only need to do it for UDAV's.  So we enforce that here.
1964 	 *
1965 	 * NOTE: The entire 64 bits worth of GUID info is actually being
1966 	 * preserved (for UDAVs) by the callers of this function
1967 	 * (tavor_ah_alloc() and tavor_ah_modify()) and as long as the
1968 	 * 'grh' bit is not set, the upper 32 bits (63:32) of rgid_l are
1969 	 * "don't care".
1970 	 */
1971 	if ((path->grh) || (type == TAVOR_ADDRPATH_QP)) {
1972 		path->rgid_l = av->av_dgid.gid_guid;
1973 	} else {
1974 		path->rgid_l = 0x2;
1975 	}
1976 
1977 	return (DDI_SUCCESS);
1978 }
1979 
1980 
1981 /*
1982  * tavor_get_addr_path()
1983  *    Context: Can be called from interrupt or base context.
1984  *
1985  * Note: Just like tavor_set_addr_path() above, this routine is used for two
1986  * purposes.  It is used to read in the Tavor UDAV fields, and it is used to
1987  * read in the address path information for QPs.  Because the two Tavor
1988  * structures are similar, common fields can be read in here.  But because
1989  * they are slightly different, we pass an additional flag to indicate which
1990  * type is being read.
1991  */
1992 void
tavor_get_addr_path(tavor_state_t * state,tavor_hw_addr_path_t * path,ibt_adds_vect_t * av,uint_t type,tavor_qphdl_t qp)1993 tavor_get_addr_path(tavor_state_t *state, tavor_hw_addr_path_t *path,
1994     ibt_adds_vect_t *av, uint_t type, tavor_qphdl_t qp)
1995 {
1996 	uint_t		gidtbl_sz;
1997 
1998 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
1999 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2000 
2001 	av->av_src_path	= path->ml_path;
2002 	av->av_port_num	= path->portnum;
2003 	av->av_dlid	= path->rlid;
2004 	av->av_srvl	= path->sl;
2005 
2006 	/*
2007 	 * Set "av_ipd" value from max_stat_rate.
2008 	 */
2009 	if (qp) {
2010 		/*
2011 		 * If a QP operation use the soft copy
2012 		 */
2013 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(qp->qp_save_srate))
2014 		av->av_srate = qp->qp_save_srate;
2015 	} else {
2016 		/*
2017 		 * The stat_rate_sup is used to decide how the srate value is
2018 		 * set and
2019 		 * if it is zero, the driver uses the old interface.
2020 		 */
2021 		if (state->ts_devlim.stat_rate_sup) {
2022 			if (path->max_stat_rate	== 0) {
2023 				av->av_srate = IBT_SRATE_20; /* 4x@DDR rate */
2024 			} else if (path->max_stat_rate	== 1) {
2025 				av->av_srate = IBT_SRATE_2;  /* 1x@SDR rate */
2026 			} else if (path->max_stat_rate	== 2) {
2027 				av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
2028 			} else if (path->max_stat_rate	== 3) {
2029 				av->av_srate = IBT_SRATE_5;  /* 1xDDR rate */
2030 			}
2031 		} else {
2032 			if (path->max_stat_rate	== 0) {
2033 				av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
2034 			} else if (path->max_stat_rate	== 1) {
2035 				av->av_srate = IBT_SRATE_2;  /* 1x@SDR rate */
2036 			}
2037 		}
2038 	}
2039 
2040 	/*
2041 	 * Extract all "global" values regardless of the value in the GRH
2042 	 * flag.  Because "av_send_grh" is set only if "grh" is set, software
2043 	 * knows to ignore the other "global" values as necessary.  Note: SW
2044 	 * does it this way to enable these query operations to return exactly
2045 	 * the same params that were passed when the addr path was last written.
2046 	 */
2047 	av->av_send_grh		= path->grh;
2048 	if (type == TAVOR_ADDRPATH_QP) {
2049 		av->av_sgid_ix  = path->mgid_index;
2050 	} else {
2051 		/*
2052 		 * For Tavor UDAV, the "mgid_index" field is the index into
2053 		 * a combined table (not a per-port table). So some extra
2054 		 * calculations are necessary.
2055 		 */
2056 		gidtbl_sz = (1 << state->ts_devlim.log_max_gid);
2057 		av->av_sgid_ix = path->mgid_index - ((av->av_port_num - 1) *
2058 		    gidtbl_sz);
2059 	}
2060 	av->av_flow		= path->flow_label;
2061 	av->av_tclass		= path->tclass;
2062 	av->av_hop		= path->hop_limit;
2063 	av->av_dgid.gid_prefix	= path->rgid_h;
2064 	av->av_dgid.gid_guid	= path->rgid_l;
2065 }
2066 
2067 
2068 /*
2069  * tavor_portnum_is_valid()
2070  *    Context: Can be called from interrupt or base context.
2071  */
2072 int
tavor_portnum_is_valid(tavor_state_t * state,uint_t portnum)2073 tavor_portnum_is_valid(tavor_state_t *state, uint_t portnum)
2074 {
2075 	uint_t	max_port;
2076 
2077 	max_port = state->ts_cfg_profile->cp_num_ports;
2078 	if ((portnum <= max_port) && (portnum != 0)) {
2079 		return (1);
2080 	} else {
2081 		return (0);
2082 	}
2083 }
2084 
2085 
2086 /*
2087  * tavor_pkeyindex_is_valid()
2088  *    Context: Can be called from interrupt or base context.
2089  */
2090 int
tavor_pkeyindex_is_valid(tavor_state_t * state,uint_t pkeyindx)2091 tavor_pkeyindex_is_valid(tavor_state_t *state, uint_t pkeyindx)
2092 {
2093 	uint_t	max_pkeyindx;
2094 
2095 	max_pkeyindx = 1 << state->ts_cfg_profile->cp_log_max_pkeytbl;
2096 	if (pkeyindx < max_pkeyindx) {
2097 		return (1);
2098 	} else {
2099 		return (0);
2100 	}
2101 }
2102 
2103 
2104 /*
2105  * tavor_queue_alloc()
2106  *    Context: Can be called from interrupt or base context.
2107  */
2108 int
tavor_queue_alloc(tavor_state_t * state,tavor_qalloc_info_t * qa_info,uint_t sleepflag)2109 tavor_queue_alloc(tavor_state_t *state, tavor_qalloc_info_t *qa_info,
2110     uint_t sleepflag)
2111 {
2112 	ddi_dma_attr_t		dma_attr;
2113 	int			(*callback)(caddr_t);
2114 	uint64_t		realsize, alloc_mask;
2115 	uint_t			dma_xfer_mode, type;
2116 	int			flag, status;
2117 
2118 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2119 
2120 	/* Set the callback flag appropriately */
2121 	callback = (sleepflag == TAVOR_SLEEP) ? DDI_DMA_SLEEP :
2122 	    DDI_DMA_DONTWAIT;
2123 
2124 	/*
2125 	 * Initialize many of the default DMA attributes.  Then set additional
2126 	 * alignment restrictions as necessary for the queue memory.  Also
2127 	 * respect the configured value for IOMMU bypass
2128 	 */
2129 	tavor_dma_attr_init(&dma_attr);
2130 	dma_attr.dma_attr_align = qa_info->qa_bind_align;
2131 	type = state->ts_cfg_profile->cp_iommu_bypass;
2132 	if (type == TAVOR_BINDMEM_BYPASS) {
2133 		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2134 	}
2135 
2136 	/* Allocate a DMA handle */
2137 	status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr, callback, NULL,
2138 	    &qa_info->qa_dmahdl);
2139 	if (status != DDI_SUCCESS) {
2140 		return (DDI_FAILURE);
2141 	}
2142 
2143 	/*
2144 	 * Determine the amount of memory to allocate, depending on the values
2145 	 * in "qa_bind_align" and "qa_alloc_align".  The problem we are trying
2146 	 * to solve here is that allocating a DMA handle with IOMMU bypass
2147 	 * (DDI_DMA_FORCE_PHYSICAL) constrains us to only requesting alignments
2148 	 * that are less than the page size.  Since we may need stricter
2149 	 * alignments on the memory allocated by ddi_dma_mem_alloc() (e.g. in
2150 	 * Tavor QP work queue memory allocation), we use the following method
2151 	 * to calculate how much additional memory to request, and we enforce
2152 	 * our own alignment on the allocated result.
2153 	 */
2154 	alloc_mask = qa_info->qa_alloc_align - 1;
2155 	if (qa_info->qa_bind_align == qa_info->qa_alloc_align) {
2156 		realsize = qa_info->qa_size;
2157 	} else {
2158 		realsize = qa_info->qa_size + alloc_mask;
2159 	}
2160 
2161 	/*
2162 	 * If we are to allocate the queue from system memory, then use
2163 	 * ddi_dma_mem_alloc() to find the space.  Otherwise, if we are to
2164 	 * allocate the queue from locally-attached DDR memory, then use the
2165 	 * vmem allocator to find the space.  In either case, return a pointer
2166 	 * to the memory range allocated (including any necessary alignment
2167 	 * adjustments), the "real" memory pointer, the "real" size, and a
2168 	 * ddi_acc_handle_t to use when reading from/writing to the memory.
2169 	 */
2170 	if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {
2171 
2172 		/*
2173 		 * Determine whether to map STREAMING or CONSISTENT.  This is
2174 		 * based on the value set in the configuration profile at
2175 		 * attach time.
2176 		 */
2177 		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
2178 
2179 		/* Allocate system memory for the queue */
2180 		status = ddi_dma_mem_alloc(qa_info->qa_dmahdl, realsize,
2181 		    &state->ts_reg_accattr, dma_xfer_mode, callback, NULL,
2182 		    (caddr_t *)&qa_info->qa_buf_real,
2183 		    (size_t *)&qa_info->qa_buf_realsz, &qa_info->qa_acchdl);
2184 		if (status != DDI_SUCCESS) {
2185 			ddi_dma_free_handle(&qa_info->qa_dmahdl);
2186 			return (DDI_FAILURE);
2187 		}
2188 
2189 		/*
2190 		 * Save temporary copy of the real pointer.  (This may be
2191 		 * modified in the last step below).
2192 		 */
2193 		qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2194 
2195 	} else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {
2196 
2197 		/* Allocate userland mappable memory for the queue */
2198 		flag = (sleepflag == TAVOR_SLEEP) ? DDI_UMEM_SLEEP :
2199 		    DDI_UMEM_NOSLEEP;
2200 		qa_info->qa_buf_real = ddi_umem_alloc(realsize, flag,
2201 		    &qa_info->qa_umemcookie);
2202 		if (qa_info->qa_buf_real == NULL) {
2203 			ddi_dma_free_handle(&qa_info->qa_dmahdl);
2204 			return (DDI_FAILURE);
2205 		}
2206 
2207 		/*
2208 		 * Save temporary copy of the real pointer.  (This may be
2209 		 * modified in the last step below).
2210 		 */
2211 		qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2212 
2213 	} else {  /* TAVOR_QUEUE_LOCATION_INDDR */
2214 
2215 		/* Allocate DDR memory for the queue */
2216 		flag = (sleepflag == TAVOR_SLEEP) ? VM_SLEEP : VM_NOSLEEP;
2217 		qa_info->qa_buf_real = (uint32_t *)vmem_xalloc(
2218 		    state->ts_ddrvmem, realsize, qa_info->qa_bind_align, 0, 0,
2219 		    NULL, NULL, flag);
2220 		if (qa_info->qa_buf_real == NULL) {
2221 			ddi_dma_free_handle(&qa_info->qa_dmahdl);
2222 			return (DDI_FAILURE);
2223 		}
2224 
2225 		/*
2226 		 * Since "qa_buf_real" will be a PCI address (the offset into
2227 		 * the DDR memory), we first need to do some calculations to
2228 		 * convert it to its kernel mapped address.  (Note: This may
2229 		 * be modified again below, when any additional "alloc"
2230 		 * alignment constraint is applied).
2231 		 */
2232 		qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
2233 		    state->ts_reg_ddr_baseaddr) + ((uintptr_t)
2234 		    qa_info->qa_buf_real - state->ts_ddr.ddr_baseaddr));
2235 		qa_info->qa_buf_realsz	= realsize;
2236 		qa_info->qa_acchdl	= state->ts_reg_ddrhdl;
2237 	}
2238 
2239 	/*
2240 	 * The last step is to ensure that the final address ("qa_buf_aligned")
2241 	 * has the appropriate "alloc" alignment restriction applied to it
2242 	 * (if necessary).
2243 	 */
2244 	if (qa_info->qa_bind_align != qa_info->qa_alloc_align) {
2245 		qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
2246 		    qa_info->qa_buf_aligned + alloc_mask) & ~alloc_mask);
2247 	}
2248 
2249 	return (DDI_SUCCESS);
2250 }
2251 
2252 
2253 /*
2254  * tavor_queue_free()
2255  *    Context: Can be called from interrupt or base context.
2256  */
2257 void
tavor_queue_free(tavor_state_t * state,tavor_qalloc_info_t * qa_info)2258 tavor_queue_free(tavor_state_t *state, tavor_qalloc_info_t *qa_info)
2259 {
2260 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2261 
2262 	/*
2263 	 * Depending on how (i.e. from where) we allocated the memory for
2264 	 * this queue, we choose the appropriate method for releasing the
2265 	 * resources.
2266 	 */
2267 	if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {
2268 
2269 		ddi_dma_mem_free(&qa_info->qa_acchdl);
2270 
2271 	} else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {
2272 
2273 		ddi_umem_free(qa_info->qa_umemcookie);
2274 
2275 	} else {  /* TAVOR_QUEUE_LOCATION_INDDR */
2276 
2277 		vmem_xfree(state->ts_ddrvmem, qa_info->qa_buf_real,
2278 		    qa_info->qa_buf_realsz);
2279 	}
2280 
2281 	/* Always free the dma handle */
2282 	ddi_dma_free_handle(&qa_info->qa_dmahdl);
2283 }
2284 
2285 
2286 /*
2287  * tavor_dmaattr_get()
2288  *    Context: Can be called from interrupt or base context.
2289  */
2290 void
tavor_dma_attr_init(ddi_dma_attr_t * dma_attr)2291 tavor_dma_attr_init(ddi_dma_attr_t *dma_attr)
2292 {
2293 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*dma_attr))
2294 
2295 	dma_attr->dma_attr_version	= DMA_ATTR_V0;
2296 	dma_attr->dma_attr_addr_lo	= 0;
2297 	dma_attr->dma_attr_addr_hi	= 0xFFFFFFFFFFFFFFFFull;
2298 	dma_attr->dma_attr_count_max	= 0xFFFFFFFFFFFFFFFFull;
2299 	dma_attr->dma_attr_align	= 1;
2300 	dma_attr->dma_attr_burstsizes	= 0x3FF;
2301 	dma_attr->dma_attr_minxfer	= 1;
2302 	dma_attr->dma_attr_maxxfer	= 0xFFFFFFFFFFFFFFFFull;
2303 	dma_attr->dma_attr_seg		= 0xFFFFFFFFFFFFFFFFull;
2304 	dma_attr->dma_attr_sgllen	= 0x7FFFFFFF;
2305 	dma_attr->dma_attr_granular	= 1;
2306 	dma_attr->dma_attr_flags	= 0;
2307 }
2308