xref: /illumos-gate/usr/src/uts/common/io/mac/mac_provider.c (revision c61a1653a4d73dbc950dac7d96350fd6cb517486)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/conf.h>
30 #include <sys/id_space.h>
31 #include <sys/esunddi.h>
32 #include <sys/stat.h>
33 #include <sys/mkdev.h>
34 #include <sys/stream.h>
35 #include <sys/strsubr.h>
36 #include <sys/dlpi.h>
37 #include <sys/modhash.h>
38 #include <sys/mac.h>
39 #include <sys/mac_provider.h>
40 #include <sys/mac_impl.h>
41 #include <sys/mac_client_impl.h>
42 #include <sys/mac_client_priv.h>
43 #include <sys/mac_soft_ring.h>
44 #include <sys/mac_stat.h>
45 #include <sys/dld.h>
46 #include <sys/modctl.h>
47 #include <sys/fs/dv_node.h>
48 #include <sys/thread.h>
49 #include <sys/proc.h>
50 #include <sys/callb.h>
51 #include <sys/cpuvar.h>
52 #include <sys/atomic.h>
53 #include <sys/sdt.h>
54 #include <sys/mac_flow.h>
55 #include <sys/ddi_intr_impl.h>
56 #include <sys/disp.h>
57 #include <sys/sdt.h>
58 #include <sys/pattr.h>
59 #include <sys/strsun.h>
60 #include <sys/vlan.h>
61 #include <inet/ip.h>
62 #include <inet/tcp.h>
63 #include <netinet/udp.h>
64 #include <netinet/sctp.h>
65 
66 /*
67  * MAC Provider Interface.
68  *
69  * Interface for GLDv3 compatible NIC drivers.
70  */
71 
72 static void i_mac_notify_thread(void *);
73 
74 typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *);
75 
76 static const mac_notify_default_cb_fn_t mac_notify_cb_list[MAC_NNOTE] = {
77 	mac_fanout_recompute,	/* MAC_NOTE_LINK */
78 	NULL,		/* MAC_NOTE_UNICST */
79 	NULL,		/* MAC_NOTE_TX */
80 	NULL,		/* MAC_NOTE_DEVPROMISC */
81 	NULL,		/* MAC_NOTE_FASTPATH_FLUSH */
82 	NULL,		/* MAC_NOTE_SDU_SIZE */
83 	NULL,		/* MAC_NOTE_MARGIN */
84 	NULL,		/* MAC_NOTE_CAPAB_CHG */
85 	NULL		/* MAC_NOTE_LOWLINK */
86 };
87 
88 /*
89  * Driver support functions.
90  */
91 
92 /* REGISTRATION */
93 
94 mac_register_t *
95 mac_alloc(uint_t mac_version)
96 {
97 	mac_register_t *mregp;
98 
99 	/*
100 	 * Make sure there isn't a version mismatch between the driver and
101 	 * the framework.  In the future, if multiple versions are
102 	 * supported, this check could become more sophisticated.
103 	 */
104 	if (mac_version != MAC_VERSION)
105 		return (NULL);
106 
107 	mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP);
108 	mregp->m_version = mac_version;
109 	return (mregp);
110 }
111 
112 void
113 mac_free(mac_register_t *mregp)
114 {
115 	kmem_free(mregp, sizeof (mac_register_t));
116 }
117 
118 /*
119  * Convert a MAC's offload features into the equivalent DB_CKSUMFLAGS
120  * value.
121  */
122 static uint16_t
123 mac_features_to_flags(mac_handle_t mh)
124 {
125 	uint16_t flags = 0;
126 	uint32_t cap_sum = 0;
127 	mac_capab_lso_t cap_lso;
128 
129 	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap_sum)) {
130 		if (cap_sum & HCKSUM_IPHDRCKSUM)
131 			flags |= HCK_IPV4_HDRCKSUM;
132 
133 		if (cap_sum & HCKSUM_INET_PARTIAL)
134 			flags |= HCK_PARTIALCKSUM;
135 		else if (cap_sum & (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
136 			flags |= HCK_FULLCKSUM;
137 	}
138 
139 	/*
140 	 * We don't need the information stored in 'cap_lso', but we
141 	 * need to pass a non-NULL pointer to appease the driver.
142 	 */
143 	if (mac_capab_get(mh, MAC_CAPAB_LSO, &cap_lso))
144 		flags |= HW_LSO;
145 
146 	return (flags);
147 }
148 
149 /*
150  * mac_register() is how drivers register new MACs with the GLDv3
151  * framework.  The mregp argument is allocated by drivers using the
152  * mac_alloc() function, and can be freed using mac_free() immediately upon
153  * return from mac_register().  Upon success (0 return value), the mhp
154  * opaque pointer becomes the driver's handle to its MAC interface, and is
155  * the argument to all other mac module entry points.
156  */
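/*
 * For illustration, a minimal sketch (not taken from any particular
 * driver) of how a GLDv3 NIC driver might use mac_alloc(),
 * mac_register() and mac_free() from its attach(9E) path.  The xx_*
 * names and xx_m_callbacks are hypothetical, and error handling is
 * abbreviated.  Note that mregp may be freed as soon as mac_register()
 * returns.
 *
 *	static int
 *	xx_register_mac(xx_t *xxp, dev_info_t *dip)
 *	{
 *		mac_register_t *mregp;
 *		int err;
 *
 *		if ((mregp = mac_alloc(MAC_VERSION)) == NULL)
 *			return (EINVAL);
 *		mregp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
 *		mregp->m_driver = xxp;
 *		mregp->m_dip = dip;
 *		mregp->m_src_addr = xxp->xx_macaddr;
 *		mregp->m_callbacks = &xx_m_callbacks;
 *		mregp->m_min_sdu = 0;
 *		mregp->m_max_sdu = ETHERMTU;
 *		mregp->m_margin = VLAN_TAGSZ;
 *		err = mac_register(mregp, &xxp->xx_mh);
 *		mac_free(mregp);
 *		return (err);
 *	}
 */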
157 /* ARGSUSED */
158 int
159 mac_register(mac_register_t *mregp, mac_handle_t *mhp)
160 {
161 	mac_impl_t		*mip;
162 	mactype_t		*mtype;
163 	int			err = EINVAL;
164 	struct devnames		*dnp = NULL;
165 	uint_t			instance;
166 	boolean_t		style1_created = B_FALSE;
167 	boolean_t		style2_created = B_FALSE;
168 	char			*driver;
169 	minor_t			minor = 0;
170 
171 	/* A successful call to mac_init_ops() sets the DN_GLDV3_DRIVER flag. */
172 	if (!GLDV3_DRV(ddi_driver_major(mregp->m_dip)))
173 		return (EINVAL);
174 
175 	/* Find the required MAC-Type plugin. */
176 	if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL)
177 		return (EINVAL);
178 
179 	/* Create a mac_impl_t to represent this MAC. */
180 	mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP);
181 
182 	/*
183 	 * The mac is not ready for open yet.
184 	 */
185 	mip->mi_state_flags |= MIS_DISABLED;
186 
187 	/*
188 	 * When a mac is registered, the m_instance field can be set to:
189 	 *
190 	 *  0:	Get the mac's instance number from m_dip.
191 	 *	This is usually used for physical device dips.
192 	 *
193 	 *  [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number.
194 	 *	For example, when an aggregation is created with the key option,
195 	 *	"key" will be used as the instance number.
196 	 *
197 	 *  -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1].
198 	 *	This is often used when a MAC of a virtual link is registered
199 	 *	(e.g., aggregation when "key" is not specified, or vnic).
200 	 *
201 	 * Note that the instance number is used to derive the mi_minor field
202 	 * of mac_impl_t, which will then be used to derive the name of kstats
203 	 * and the devfs nodes.  The first 2 cases are needed to preserve
204 	 * backward compatibility.
205 	 */
206 	switch (mregp->m_instance) {
207 	case 0:
208 		instance = ddi_get_instance(mregp->m_dip);
209 		break;
210 	case ((uint_t)-1):
211 		minor = mac_minor_hold(B_TRUE);
212 		if (minor == 0) {
213 			err = ENOSPC;
214 			goto fail;
215 		}
216 		instance = minor - 1;
217 		break;
218 	default:
219 		instance = mregp->m_instance;
220 		if (instance >= MAC_MAX_MINOR) {
221 			err = EINVAL;
222 			goto fail;
223 		}
224 		break;
225 	}
226 
227 	mip->mi_minor = (minor_t)(instance + 1);
228 	mip->mi_dip = mregp->m_dip;
229 	mip->mi_clients_list = NULL;
230 	mip->mi_nclients = 0;
231 
232 	/* Set the default IEEE Port VLAN Identifier */
233 	mip->mi_pvid = 1;
234 
235 	/* Default bridge link learning protection values */
236 	mip->mi_llimit = 1000;
237 	mip->mi_ldecay = 200;
238 
239 	driver = (char *)ddi_driver_name(mip->mi_dip);
240 
241 	/* Construct the MAC name as <drvname><instance> */
242 	(void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d",
243 	    driver, instance);
244 
245 	mip->mi_driver = mregp->m_driver;
246 
247 	mip->mi_type = mtype;
248 	mip->mi_margin = mregp->m_margin;
249 	mip->mi_info.mi_media = mtype->mt_type;
250 	mip->mi_info.mi_nativemedia = mtype->mt_nativetype;
251 	if (mregp->m_max_sdu <= mregp->m_min_sdu)
252 		goto fail;
253 	if (mregp->m_multicast_sdu == 0)
254 		mregp->m_multicast_sdu = mregp->m_max_sdu;
255 	if (mregp->m_multicast_sdu < mregp->m_min_sdu ||
256 	    mregp->m_multicast_sdu > mregp->m_max_sdu)
257 		goto fail;
258 	mip->mi_sdu_min = mregp->m_min_sdu;
259 	mip->mi_sdu_max = mregp->m_max_sdu;
260 	mip->mi_sdu_multicast = mregp->m_multicast_sdu;
261 	mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length;
262 	/*
263 	 * If the media supports a broadcast address, cache a pointer to it
264 	 * in the mac_info_t so that upper layers can use it.
265 	 */
266 	mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr;
267 
268 	mip->mi_v12n_level = mregp->m_v12n;
269 
270 	/*
271 	 * Copy the unicast source address into the mac_info_t, but only if
272 	 * the MAC-Type defines a non-zero address length.  We need to
273 	 * handle MAC-Types that have an address length of 0
274 	 * (point-to-point protocol MACs for example).
275 	 */
276 	if (mip->mi_type->mt_addr_length > 0) {
277 		if (mregp->m_src_addr == NULL)
278 			goto fail;
279 		mip->mi_info.mi_unicst_addr =
280 		    kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP);
281 		bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr,
282 		    mip->mi_type->mt_addr_length);
283 
284 		/*
285 		 * Copy the fixed 'factory' MAC address from the immutable
286 		 * info.  This is taken to be the MAC address currently in
287 		 * use.
288 		 */
289 		bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr,
290 		    mip->mi_type->mt_addr_length);
291 
292 		/*
293 		 * At this point we should set up the classification
294 		 * rules, etc., but we delay that until mac_open() so
295 		 * that resource discovery has taken place and we
296 		 * know someone actually wants to use the device.
297 		 * Otherwise memory would be allocated for Rx ring
298 		 * structures even during probe.
299 		 */
300 
301 		/* Copy the destination address if one is provided. */
302 		if (mregp->m_dst_addr != NULL) {
303 			bcopy(mregp->m_dst_addr, mip->mi_dstaddr,
304 			    mip->mi_type->mt_addr_length);
305 			mip->mi_dstaddr_set = B_TRUE;
306 		}
307 	} else if (mregp->m_src_addr != NULL) {
308 		goto fail;
309 	}
310 
311 	/*
312 	 * The format of the m_pdata is specific to the plugin.  It is
313 	 * passed in as an argument to all of the plugin callbacks.  The
314 	 * driver can update this information by calling
315 	 * mac_pdata_update().
316 	 */
317 	if (mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY) {
318 		/*
319 		 * Verify that the supplied plugin data is valid.  Note that
320 		 * even if the caller passed in a NULL pointer as plugin data,
321 		 * we still need to verify it, since the plugin may require
322 		 * plugin data to function.
323 		 */
324 		if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
325 		    mregp->m_pdata_size)) {
326 			goto fail;
327 		}
328 		if (mregp->m_pdata != NULL) {
329 			mip->mi_pdata =
330 			    kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
331 			bcopy(mregp->m_pdata, mip->mi_pdata,
332 			    mregp->m_pdata_size);
333 			mip->mi_pdata_size = mregp->m_pdata_size;
334 		}
335 	} else if (mregp->m_pdata != NULL) {
336 		/*
337 		 * The caller supplied non-NULL plugin data, but the plugin
338 		 * does not recognize plugin data.
339 		 */
340 		err = EINVAL;
341 		goto fail;
342 	}
343 
344 	/*
345 	 * Register the private properties.
346 	 */
347 	mac_register_priv_prop(mip, mregp->m_priv_props);
348 
349 	/*
350 	 * Stash the driver callbacks into the mac_impl_t, but first sanity
351 	 * check to make sure all mandatory callbacks are set.
352 	 */
353 	if (mregp->m_callbacks->mc_getstat == NULL ||
354 	    mregp->m_callbacks->mc_start == NULL ||
355 	    mregp->m_callbacks->mc_stop == NULL ||
356 	    mregp->m_callbacks->mc_setpromisc == NULL ||
357 	    mregp->m_callbacks->mc_multicst == NULL) {
358 		goto fail;
359 	}
360 	mip->mi_callbacks = mregp->m_callbacks;
361 
362 	if (mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY,
363 	    &mip->mi_capab_legacy)) {
364 		mip->mi_state_flags |= MIS_LEGACY;
365 		mip->mi_phy_dev = mip->mi_capab_legacy.ml_dev;
366 	} else {
367 		mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
368 		    mip->mi_minor);
369 	}
370 
371 	/*
372 	 * Allocate a notification thread.  thread_create() blocks for
373 	 * memory if needed; it never fails.
374 	 */
375 	mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
376 	    mip, 0, &p0, TS_RUN, minclsyspri);
377 
378 	/*
379 	 * Cache the DB_CKSUMFLAGS that this MAC supports.
380 	 */
381 	mip->mi_tx_cksum_flags = mac_features_to_flags((mac_handle_t)mip);
382 
383 	/*
384 	 * Initialize the capabilities
385 	 */
386 	bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t));
387 	bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t));
388 
389 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL))
390 		mip->mi_state_flags |= MIS_IS_VNIC;
391 
392 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
393 		mip->mi_state_flags |= MIS_IS_AGGR;
394 
395 	mac_addr_factory_init(mip);
396 
397 	mac_transceiver_init(mip);
398 
399 	mac_led_init(mip);
400 
401 	/*
402 	 * Enforce the virtualization level registered.
403 	 */
404 	if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) {
405 		if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 ||
406 		    mac_init_rings(mip, MAC_RING_TYPE_TX) != 0)
407 			goto fail;
408 
409 		/*
410 		 * The driver needs to register at least rx rings for this
411 		 * virtualization level.
412 		 */
413 		if (mip->mi_rx_groups == NULL)
414 			goto fail;
415 	}
416 
417 	/*
418 	 * The driver must set the mc_unicst entry point to NULL when it
419 	 * advertises MAC_CAPAB_RINGS for RX groups.
420 	 */
421 	if (mip->mi_rx_groups != NULL) {
422 		if (mregp->m_callbacks->mc_unicst != NULL)
423 			goto fail;
424 	} else {
425 		if (mregp->m_callbacks->mc_unicst == NULL)
426 			goto fail;
427 	}
428 
429 	/*
430 	 * Initialize MAC addresses. Must be called after mac_init_rings().
431 	 */
432 	mac_init_macaddr(mip);
433 
434 	mip->mi_share_capab.ms_snum = 0;
435 	if (mip->mi_v12n_level & MAC_VIRT_HIO) {
436 		(void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES,
437 		    &mip->mi_share_capab);
438 	}
439 
440 	/*
441 	 * Initialize the kstats for this device.
442 	 */
443 	mac_driver_stat_create(mip);
444 
445 	/* Zero out any properties. */
446 	bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t));
447 
448 	if (mip->mi_minor <= MAC_MAX_MINOR) {
449 		/* Create a style-2 DLPI device */
450 		if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0,
451 		    DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS)
452 			goto fail;
453 		style2_created = B_TRUE;
454 
455 		/* Create a style-1 DLPI device */
456 		if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR,
457 		    mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS)
458 			goto fail;
459 		style1_created = B_TRUE;
460 	}
461 
462 	mac_flow_l2tab_create(mip, &mip->mi_flow_tab);
463 
464 	rw_enter(&i_mac_impl_lock, RW_WRITER);
465 	if (mod_hash_insert(i_mac_impl_hash,
466 	    (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) {
467 		rw_exit(&i_mac_impl_lock);
468 		err = EEXIST;
469 		goto fail;
470 	}
471 
472 	DTRACE_PROBE2(mac__register, struct devnames *, dnp,
473 	    (mac_impl_t *), mip);
474 
475 	/*
476 	 * Mark the MAC to be ready for open.
477 	 */
478 	mip->mi_state_flags &= ~MIS_DISABLED;
479 	rw_exit(&i_mac_impl_lock);
480 
481 	atomic_inc_32(&i_mac_impl_count);
482 
483 	cmn_err(CE_NOTE, "!%s registered", mip->mi_name);
484 	*mhp = (mac_handle_t)mip;
485 	return (0);
486 
487 fail:
488 	if (style1_created)
489 		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
490 
491 	if (style2_created)
492 		ddi_remove_minor_node(mip->mi_dip, driver);
493 
494 	mac_addr_factory_fini(mip);
495 
496 	/* Clean up registered MAC addresses */
497 	mac_fini_macaddr(mip);
498 
499 	/* Clean up registered rings */
500 	mac_free_rings(mip, MAC_RING_TYPE_RX);
501 	mac_free_rings(mip, MAC_RING_TYPE_TX);
502 
503 	/* Clean up notification thread */
504 	if (mip->mi_notify_thread != NULL)
505 		i_mac_notify_exit(mip);
506 
507 	if (mip->mi_info.mi_unicst_addr != NULL) {
508 		kmem_free(mip->mi_info.mi_unicst_addr,
509 		    mip->mi_type->mt_addr_length);
510 		mip->mi_info.mi_unicst_addr = NULL;
511 	}
512 
513 	mac_driver_stat_delete(mip);
514 
515 	if (mip->mi_type != NULL) {
516 		atomic_dec_32(&mip->mi_type->mt_ref);
517 		mip->mi_type = NULL;
518 	}
519 
520 	if (mip->mi_pdata != NULL) {
521 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
522 		mip->mi_pdata = NULL;
523 		mip->mi_pdata_size = 0;
524 	}
525 
526 	if (minor != 0) {
527 		ASSERT(minor > MAC_MAX_MINOR);
528 		mac_minor_rele(minor);
529 	}
530 
531 	mip->mi_state_flags = 0;
532 	mac_unregister_priv_prop(mip);
533 
534 	/*
535 	 * Clear the state before destroying the mac_impl_t
536 	 */
537 	mip->mi_state_flags = 0;
538 
539 	kmem_cache_free(i_mac_impl_cachep, mip);
540 	return (err);
541 }
542 
543 /*
544  * Unregister from the GLDv3 framework
545  */
546 int
547 mac_unregister(mac_handle_t mh)
548 {
549 	int			err;
550 	mac_impl_t		*mip = (mac_impl_t *)mh;
551 	mod_hash_val_t		val;
552 	mac_margin_req_t	*mmr, *nextmmr;
553 
554 	/* Fail the unregister if there are any open references to this mac. */
555 	if ((err = mac_disable_nowait(mh)) != 0)
556 		return (err);
557 
558 	/*
559 	 * Clean up notification thread and wait for it to exit.
560 	 */
561 	i_mac_notify_exit(mip);
562 
563 	/*
564 	 * Prior to acquiring the MAC perimeter, remove the MAC instance from
565 	 * the internal hash table. Such removal means table-walkers that
566 	 * acquire the perimeter will not do so on behalf of what we are
567 	 * unregistering, which prevents a deadlock.
568 	 */
569 	rw_enter(&i_mac_impl_lock, RW_WRITER);
570 	(void) mod_hash_remove(i_mac_impl_hash,
571 	    (mod_hash_key_t)mip->mi_name, &val);
572 	rw_exit(&i_mac_impl_lock);
573 	ASSERT(mip == (mac_impl_t *)val);
574 
575 	i_mac_perim_enter(mip);
576 
577 	/*
578 	 * There are still resource properties configured on this mac.
579 	 */
580 	if (mip->mi_resource_props.mrp_mask != 0)
581 		mac_fastpath_enable((mac_handle_t)mip);
582 
583 	if (mip->mi_minor < MAC_MAX_MINOR + 1) {
584 		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
585 		ddi_remove_minor_node(mip->mi_dip,
586 		    (char *)ddi_driver_name(mip->mi_dip));
587 	}
588 
589 	ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags &
590 	    MIS_EXCLUSIVE));
591 
592 	mac_driver_stat_delete(mip);
593 
594 	ASSERT(i_mac_impl_count > 0);
595 	atomic_dec_32(&i_mac_impl_count);
596 
597 	if (mip->mi_pdata != NULL)
598 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
599 	mip->mi_pdata = NULL;
600 	mip->mi_pdata_size = 0;
601 
602 	/*
603 	 * Free the list of margin requests.
604 	 */
605 	for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
606 		nextmmr = mmr->mmr_nextp;
607 		kmem_free(mmr, sizeof (mac_margin_req_t));
608 	}
609 	mip->mi_mmrp = NULL;
610 
611 	mip->mi_linkstate = mip->mi_lowlinkstate = LINK_STATE_UNKNOWN;
612 	kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
613 	mip->mi_info.mi_unicst_addr = NULL;
614 
615 	atomic_dec_32(&mip->mi_type->mt_ref);
616 	mip->mi_type = NULL;
617 
618 	/*
619 	 * Free the primary MAC address.
620 	 */
621 	mac_fini_macaddr(mip);
622 
623 	/*
624 	 * Free all rings.
625 	 */
626 	mac_free_rings(mip, MAC_RING_TYPE_RX);
627 	mac_free_rings(mip, MAC_RING_TYPE_TX);
628 
629 	mac_addr_factory_fini(mip);
630 
631 	bzero(mip->mi_addr, MAXMACADDRLEN);
632 	bzero(mip->mi_dstaddr, MAXMACADDRLEN);
633 	mip->mi_dstaddr_set = B_FALSE;
634 
635 	/* and the flows */
636 	mac_flow_tab_destroy(mip->mi_flow_tab);
637 	mip->mi_flow_tab = NULL;
638 
639 	if (mip->mi_minor > MAC_MAX_MINOR)
640 		mac_minor_rele(mip->mi_minor);
641 
642 	cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);
643 
644 	/*
645 	 * Reset the perimeter-related fields to their default values
646 	 * before calling kmem_cache_free().
647 	 */
648 	i_mac_perim_exit(mip);
649 	mip->mi_state_flags = 0;
650 
651 	mac_unregister_priv_prop(mip);
652 
653 	ASSERT(mip->mi_bridge_link == NULL);
654 	kmem_cache_free(i_mac_impl_cachep, mip);
655 
656 	return (0);
657 }
658 
659 /* DATA RECEPTION */
660 
661 /*
662  * This function is invoked for packets received by the MAC driver in
663  * interrupt context. The ring generation number provided by the driver
664  * is matched with the ring generation number held in MAC. If they do not
665  * match, received packets are considered stale packets coming from an older
666  * assignment of the ring. Drop them.
667  */
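/*
 * Driver-side sketch, under the assumption of a hypothetical xx_*
 * driver with RX ring capability: the generation number is the value
 * the framework passed to the driver's mri_start entry point, and the
 * driver hands it back with every chain delivered for that ring.
 *
 *	static int
 *	xx_ring_start(mac_ring_driver_t rh, uint64_t gen_num)
 *	{
 *		xx_rx_ring_t *rxr = (xx_rx_ring_t *)rh;
 *
 *		rxr->xr_gen_num = gen_num;
 *		return (0);
 *	}
 *
 * and, in the RX interrupt or poll path:
 *
 *	mblk_t *mp_chain = xx_ring_rx(rxr, poll_bytes);
 *	if (mp_chain != NULL)
 *		mac_rx_ring(xxp->xx_mh, rxr->xr_ring_handle, mp_chain,
 *		    rxr->xr_gen_num);
 */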
668 void
669 mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain,
670     uint64_t mr_gen_num)
671 {
672 	mac_ring_t		*mr = (mac_ring_t *)mrh;
673 
674 	if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) {
675 		DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t,
676 		    mr->mr_gen_num, uint64_t, mr_gen_num);
677 		freemsgchain(mp_chain);
678 		return;
679 	}
680 	mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain);
681 }
682 
683 /*
684  * This function is invoked for each packet received by the underlying driver.
685  */
686 void
687 mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
688 {
689 	mac_impl_t *mip = (mac_impl_t *)mh;
690 
691 	/*
692 	 * Check if the link is part of a bridge.  If not, then we don't need
693 	 * to take the lock to remain consistent.  Make this common case
694 	 * lock-free and tail-call optimized.
695 	 */
696 	if (mip->mi_bridge_link == NULL) {
697 		mac_rx_common(mh, mrh, mp_chain);
698 	} else {
699 		/*
700 		 * Once we take a reference on the bridge link, the bridge
701 		 * module itself can't unload, so the callback pointers are
702 		 * stable.
703 		 */
704 		mutex_enter(&mip->mi_bridge_lock);
705 		if ((mh = mip->mi_bridge_link) != NULL)
706 			mac_bridge_ref_cb(mh, B_TRUE);
707 		mutex_exit(&mip->mi_bridge_lock);
708 		if (mh == NULL) {
709 			mac_rx_common((mac_handle_t)mip, mrh, mp_chain);
710 		} else {
711 			mac_bridge_rx_cb(mh, mrh, mp_chain);
712 			mac_bridge_ref_cb(mh, B_FALSE);
713 		}
714 	}
715 }
716 
717 /*
718  * Special case function: this allows snooping of packets transmitted and
719  * received by TRILL. By design, they go directly into the TRILL module.
720  */
721 void
722 mac_trill_snoop(mac_handle_t mh, mblk_t *mp)
723 {
724 	mac_impl_t *mip = (mac_impl_t *)mh;
725 
726 	if (mip->mi_promisc_list != NULL)
727 		mac_promisc_dispatch(mip, mp, NULL, B_FALSE);
728 }
729 
730 /*
731  * This is the upward reentry point for packets arriving from the bridging
732  * module and from mac_rx for links not part of a bridge.
733  */
734 void
735 mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
736 {
737 	mac_impl_t		*mip = (mac_impl_t *)mh;
738 	mac_ring_t		*mr = (mac_ring_t *)mrh;
739 	mac_soft_ring_set_t	*mac_srs;
740 	mblk_t			*bp = mp_chain;
741 
742 	/*
743 	 * If there are any promiscuous mode callbacks defined for
744 	 * this MAC, pass them a copy if appropriate.
745 	 */
746 	if (mip->mi_promisc_list != NULL)
747 		mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE);
748 
749 	if (mr != NULL) {
750 		/*
751 		 * If the SRS teardown has started, just return. The 'mr'
752 		 * continues to be valid until the driver unregisters the MAC.
753 		 * Hardware classified packets will not make their way up
754 		 * beyond this point once the teardown has started. The driver
755 		 * is never passed a pointer to a flow entry or SRS or any
756 		 * structure that can be freed much before mac_unregister.
757 		 */
758 		mutex_enter(&mr->mr_lock);
759 		if ((mr->mr_state != MR_INUSE) || (mr->mr_flag &
760 		    (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) {
761 			mutex_exit(&mr->mr_lock);
762 			freemsgchain(mp_chain);
763 			return;
764 		}
765 
766 		/*
767 		 * The ring is in passthru mode; pass the chain up to
768 		 * the pseudo ring.
769 		 */
770 		if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) {
771 			MR_REFHOLD_LOCKED(mr);
772 			mutex_exit(&mr->mr_lock);
773 			mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain,
774 			    B_FALSE);
775 			MR_REFRELE(mr);
776 			return;
777 		}
778 
779 		/*
780 		 * The passthru callback should only be set when in
781 		 * MAC_PASSTHRU_CLASSIFIER mode.
782 		 */
783 		ASSERT3P(mr->mr_pt_fn, ==, NULL);
784 
785 		/*
786 		 * Check whether an SRS is controlling this ring.  If so,
787 		 * we can call the srs_lower_proc routine directly;
788 		 * otherwise we need to go through mac_rx_classify() to
789 		 * reach the right place.
790 		 */
791 		if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
792 			MR_REFHOLD_LOCKED(mr);
793 			mutex_exit(&mr->mr_lock);
794 			ASSERT3P(mr->mr_srs, !=, NULL);
795 			mac_srs = mr->mr_srs;
796 
797 			/*
798 			 * This is the fast path. All packets received
799 			 * on this ring are hardware classified and
800 			 * share the same MAC header info.
801 			 */
802 			mac_srs->srs_rx.sr_lower_proc(mh,
803 			    (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE);
804 			MR_REFRELE(mr);
805 			return;
806 		}
807 
808 		mutex_exit(&mr->mr_lock);
809 		/* We'll fall through to software classification */
810 	} else {
811 		flow_entry_t *flent;
812 		int err;
813 
814 		rw_enter(&mip->mi_rw_lock, RW_READER);
815 		if (mip->mi_single_active_client != NULL) {
816 			flent = mip->mi_single_active_client->mci_flent_list;
817 			FLOW_TRY_REFHOLD(flent, err);
818 			rw_exit(&mip->mi_rw_lock);
819 			if (err == 0) {
820 				(flent->fe_cb_fn)(flent->fe_cb_arg1,
821 				    flent->fe_cb_arg2, mp_chain, B_FALSE);
822 				FLOW_REFRELE(flent);
823 				return;
824 			}
825 		} else {
826 			rw_exit(&mip->mi_rw_lock);
827 		}
828 	}
829 
830 	if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
831 		if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL)
832 			return;
833 	}
834 
835 	freemsgchain(bp);
836 }
837 
838 /* DATA TRANSMISSION */
839 
840 /*
841  * A driver's notification to resume transmission, in case of a provider
842  * without TX rings.
843  */
844 void
845 mac_tx_update(mac_handle_t mh)
846 {
847 	mac_tx_ring_update(mh, NULL);
848 }
849 
850 /*
851  * A driver's notification to resume transmission on the specified TX ring.
852  */
853 void
854 mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh)
855 {
856 	i_mac_tx_srs_notify((mac_impl_t *)mh, rh);
857 }
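
/*
 * Hedged usage sketch (hypothetical xx_* names): a driver whose TX
 * entry point held back mblks because a ring ran out of descriptors
 * would, once its completion path reclaims enough descriptors, notify
 * the framework so that blocked clients resume transmitting:
 *
 *	if (xx_tx_reclaim(txr) >= XX_TX_RESUME_THRESH && txr->xt_blocked) {
 *		txr->xt_blocked = B_FALSE;
 *		mac_tx_ring_update(xxp->xx_mh, txr->xt_ring_handle);
 *	}
 *
 * A driver without TX rings would call mac_tx_update(xxp->xx_mh) instead.
 */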
858 
859 /* LINK STATE */
860 /*
861  * Notify the MAC layer about a link state change
862  */
863 void
864 mac_link_update(mac_handle_t mh, link_state_t link)
865 {
866 	mac_impl_t	*mip = (mac_impl_t *)mh;
867 
868 	/*
869 	 * Save the link state.
870 	 */
871 	mip->mi_lowlinkstate = link;
872 
873 	/*
874 	 * Send a MAC_NOTE_LOWLINK notification.  This tells the notification
875 	 * thread to deliver both lower and upper notifications.
876 	 */
877 	i_mac_notify(mip, MAC_NOTE_LOWLINK);
878 }
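
/*
 * Hedged usage sketch (hypothetical xx_* names): a driver typically
 * calls this from its link-check timeout or link interrupt path
 * whenever the PHY reports a change:
 *
 *	link_state_t new_state = xx_phy_link_up(xxp) ?
 *	    LINK_STATE_UP : LINK_STATE_DOWN;
 *	if (new_state != xxp->xx_link_state) {
 *		xxp->xx_link_state = new_state;
 *		mac_link_update(xxp->xx_mh, new_state);
 *	}
 */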
879 
880 /*
881  * Notify the MAC layer about a link state change due to bridging.
882  */
883 void
884 mac_link_redo(mac_handle_t mh, link_state_t link)
885 {
886 	mac_impl_t	*mip = (mac_impl_t *)mh;
887 
888 	/*
889 	 * Save the link state.
890 	 */
891 	mip->mi_linkstate = link;
892 
893 	/*
894 	 * Send a MAC_NOTE_LINK notification.  Only upper notifications are
895 	 * made.
896 	 */
897 	i_mac_notify(mip, MAC_NOTE_LINK);
898 }
899 
900 /* MINOR NODE HANDLING */
901 
902 /*
903  * Given a dev_t, return the instance number (PPA) associated with it.
904  * Drivers can use this in their getinfo(9e) implementation to lookup
905  * the instance number (i.e. PPA) of the device, to use as an index to
906  * their own array of soft state structures.
907  *
908  * Returns -1 on error.
909  */
910 int
911 mac_devt_to_instance(dev_t devt)
912 {
913 	return (dld_devt_to_instance(devt));
914 }
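
/*
 * Hedged getinfo(9E) sketch using mac_devt_to_instance(); the xx_*
 * names and the soft-state handle are hypothetical:
 *
 *	static int
 *	xx_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
 *	{
 *		int inst = mac_devt_to_instance((dev_t)arg);
 *		xx_t *xxp;
 *
 *		if (inst < 0)
 *			return (DDI_FAILURE);
 *		switch (cmd) {
 *		case DDI_INFO_DEVT2INSTANCE:
 *			*rp = (void *)(uintptr_t)inst;
 *			return (DDI_SUCCESS);
 *		case DDI_INFO_DEVT2DEVINFO:
 *			if ((xxp = ddi_get_soft_state(xx_state, inst)) == NULL)
 *				return (DDI_FAILURE);
 *			*rp = xxp->xx_dip;
 *			return (DDI_SUCCESS);
 *		default:
 *			return (DDI_FAILURE);
 *		}
 *	}
 */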
915 
916 /*
917  * This function returns the first minor number that is available for
918  * driver private use.  All minor numbers smaller than this are
919  * reserved for GLDv3 use.
920  */
921 minor_t
922 mac_private_minor(void)
923 {
924 	return (MAC_PRIVATE_MINOR);
925 }
926 
927 /* OTHER CONTROL INFORMATION */
928 
929 /*
930  * A driver notified us that its primary MAC address has changed.
931  */
932 void
933 mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
934 {
935 	mac_impl_t	*mip = (mac_impl_t *)mh;
936 
937 	if (mip->mi_type->mt_addr_length == 0)
938 		return;
939 
940 	i_mac_perim_enter(mip);
941 
942 	/*
943 	 * If address changes, freshen the MAC address value and update
944 	 * all MAC clients that share this MAC address.
945 	 */
946 	if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) != 0) {
947 		mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr),
948 		    (uint8_t *)addr);
949 	}
950 
951 	i_mac_perim_exit(mip);
952 
953 	/*
954 	 * Send a MAC_NOTE_UNICST notification.
955 	 */
956 	i_mac_notify(mip, MAC_NOTE_UNICST);
957 }
958 
959 void
960 mac_dst_update(mac_handle_t mh, const uint8_t *addr)
961 {
962 	mac_impl_t	*mip = (mac_impl_t *)mh;
963 
964 	if (mip->mi_type->mt_addr_length == 0)
965 		return;
966 
967 	i_mac_perim_enter(mip);
968 	bcopy(addr, mip->mi_dstaddr, mip->mi_type->mt_addr_length);
969 	i_mac_perim_exit(mip);
970 	i_mac_notify(mip, MAC_NOTE_DEST);
971 }
972 
973 /*
974  * MAC plugin information changed.
975  */
976 int
977 mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
978 {
979 	mac_impl_t	*mip = (mac_impl_t *)mh;
980 
981 	/*
982 	 * Verify that the plugin supports MAC plugin data and that the
983 	 * supplied data is valid.
984 	 */
985 	if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
986 		return (EINVAL);
987 	if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
988 		return (EINVAL);
989 
990 	if (mip->mi_pdata != NULL)
991 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
992 
993 	mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
994 	bcopy(mac_pdata, mip->mi_pdata, dsize);
995 	mip->mi_pdata_size = dsize;
996 
997 	/*
998 	 * Since the MAC plugin data is used to construct MAC headers that
999 	 * were cached in fast-path headers, we need to flush fast-path
1000 	 * information for links associated with this mac.
1001 	 */
1002 	i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
1003 	return (0);
1004 }
1005 
1006 /*
1007  * The mac provider or mac framework calls this function when it wants
1008  * to notify upstream consumers that the capabilities have changed and
1009  * that they should modify their own internal state accordingly.
1010  *
1011  * We currently have no regard for the fact that a provider could
1012  * decide to drop capabilities which would invalidate pending traffic.
1013  * For example, if one was to disable the Tx checksum offload while
1014  * TCP/IP traffic was being sent by mac clients relying on that
1015  * feature, then those packets would hit the wire with missing or
1016  * partial checksums. A proper solution involves not only providing
1017  * notification, but also performing client quiescing. That is, a capab
1018  * change should be treated as an atomic transaction that forms a
1019  * barrier between traffic relying on the current capabs and traffic
1020  * relying on the new capabs. In practice, simnet is currently the
1021  * only provider that could hit this, and it's an easily avoidable
1022  * situation (and at worst it should only lead to some dropped
1023  * packets). But if we ever want better on-the-fly capab change to
1024  * actual hardware providers, then we should give this update
1025  * mechanism a proper implementation.
1026  */
1027 void
1028 mac_capab_update(mac_handle_t mh)
1029 {
1030 	/*
1031 	 * Send a MAC_NOTE_CAPAB_CHG notification to alert upstream
1032 	 * clients to renegotiate capabilities.
1033 	 */
1034 	i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG);
1035 }
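
/*
 * Hedged usage sketch (hypothetical xx_* names): a provider that
 * toggles an offload at runtime, for example in response to a private
 * property, would update its own capability state first and then ask
 * clients to renegotiate:
 *
 *	xxp->xx_lso_enable = new_value;
 *	mac_capab_update(xxp->xx_mh);
 *
 * so that subsequent MAC_CAPAB_LSO queries see the new setting.
 */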
1036 
1037 /*
1038  * Used by normal drivers to update the max SDU size.
1039  * We need to handle the case of a smaller mi_sdu_multicast,
1040  * since this is called by mac_set_mtu() even for drivers that
1041  * have differing unicast and multicast MTUs, and we don't want
1042  * to increase the multicast MTU by accident in that case.
1043  */
1044 int
1045 mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max)
1046 {
1047 	mac_impl_t	*mip = (mac_impl_t *)mh;
1048 
1049 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
1050 		return (EINVAL);
1051 	mip->mi_sdu_max = sdu_max;
1052 	if (mip->mi_sdu_multicast > mip->mi_sdu_max)
1053 		mip->mi_sdu_multicast = mip->mi_sdu_max;
1054 
1055 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1056 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1057 	return (0);
1058 }
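
/*
 * Hedged usage sketch (hypothetical xx_* names): a driver accepting an
 * MTU change through its setprop entry point would reprogram the
 * hardware first and then publish the new value:
 *
 *	case MAC_PROP_MTU: {
 *		uint32_t new_mtu;
 *
 *		bcopy(pr_val, &new_mtu, sizeof (new_mtu));
 *		if ((err = xx_set_hw_mtu(xxp, new_mtu)) == 0)
 *			err = mac_maxsdu_update(xxp->xx_mh, new_mtu);
 *		break;
 *	}
 */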
1059 
1060 /*
1061  * Version of the above function that is used by drivers that have a different
1062  * max sdu size for multicast/broadcast vs. unicast.
1063  */
1064 int
1065 mac_maxsdu_update2(mac_handle_t mh, uint_t sdu_max, uint_t sdu_multicast)
1066 {
1067 	mac_impl_t	*mip = (mac_impl_t *)mh;
1068 
1069 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
1070 		return (EINVAL);
1071 	if (sdu_multicast == 0)
1072 		sdu_multicast = sdu_max;
1073 	if (sdu_multicast > sdu_max || sdu_multicast < mip->mi_sdu_min)
1074 		return (EINVAL);
1075 	mip->mi_sdu_max = sdu_max;
1076 	mip->mi_sdu_multicast = sdu_multicast;
1077 
1078 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1079 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1080 	return (0);
1081 }
1082 
1083 static void
1084 mac_ring_intr_retarget(mac_group_t *group, mac_ring_t *ring)
1085 {
1086 	mac_client_impl_t *mcip;
1087 	flow_entry_t *flent;
1088 	mac_soft_ring_set_t *mac_rx_srs;
1089 	mac_cpus_t *srs_cpu;
1090 	int i;
1091 
1092 	if (((mcip = MAC_GROUP_ONLY_CLIENT(group)) != NULL) &&
1093 	    (!ring->mr_info.mri_intr.mi_ddi_shared)) {
1094 		/* interrupt can be re-targeted */
1095 		ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
1096 		flent = mcip->mci_flent;
1097 		if (ring->mr_type == MAC_RING_TYPE_RX) {
1098 			for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1099 				mac_rx_srs = flent->fe_rx_srs[i];
1100 				if (mac_rx_srs->srs_ring != ring)
1101 					continue;
1102 				srs_cpu = &mac_rx_srs->srs_cpu;
1103 				mutex_enter(&cpu_lock);
1104 				mac_rx_srs_retarget_intr(mac_rx_srs,
1105 				    srs_cpu->mc_rx_intr_cpu);
1106 				mutex_exit(&cpu_lock);
1107 				break;
1108 			}
1109 		} else {
1110 			if (flent->fe_tx_srs != NULL) {
1111 				mutex_enter(&cpu_lock);
1112 				mac_tx_srs_retarget_intr(
1113 				    flent->fe_tx_srs);
1114 				mutex_exit(&cpu_lock);
1115 			}
1116 		}
1117 	}
1118 }
1119 
1120 /*
1121  * Clients like aggr create pseudo rings (mac_ring_t) and expose them to
1122  * their clients. There is a 1-1 mapping between a pseudo ring and the
1123  * hardware ring. DDI interrupt handles are exported from the hardware
1124  * ring to the pseudo ring. Thus, when the interrupt handle changes,
1125  * clients of aggr that are using the handle need to switch to the new
1126  * handle and re-target their interrupts.
1127  */
1128 static void
1129 mac_pseudo_ring_intr_retarget(mac_impl_t *mip, mac_ring_t *ring,
1130     ddi_intr_handle_t ddh)
1131 {
1132 	mac_ring_t *pring;
1133 	mac_group_t *pgroup;
1134 	mac_impl_t *pmip;
1135 	char macname[MAXNAMELEN];
1136 	mac_perim_handle_t p_mph;
1137 	uint64_t saved_gen_num;
1138 
1139 again:
1140 	pring = (mac_ring_t *)ring->mr_prh;
1141 	pgroup = (mac_group_t *)pring->mr_gh;
1142 	pmip = (mac_impl_t *)pgroup->mrg_mh;
1143 	saved_gen_num = ring->mr_gen_num;
1144 	(void) strlcpy(macname, pmip->mi_name, MAXNAMELEN);
1145 	/*
1146 	 * We need to enter aggr's perimeter. The locking hierarchy
1147 	 * dictates that aggr's perimeter should be entered first
1148 	 * and then the port's perimeter. So drop the port's
1149 	 * perimeter, enter aggr's and then re-enter port's
1150 	 * perimeter.
1151 	 */
1152 	i_mac_perim_exit(mip);
1153 	/*
1154 	 * While we know pmip is the aggr's mip, there is a
1155 	 * possibility that aggr could have unregistered by
1156 	 * the time we exit port's perimeter (mip) and
1157 	 * enter aggr's perimeter (pmip). To avoid that
1158 	 * scenario, enter aggr's perimeter using its name.
1159 	 */
1160 	if (mac_perim_enter_by_macname(macname, &p_mph) != 0)
1161 		return;
1162 	i_mac_perim_enter(mip);
1163 	/*
1164 	 * Check if the ring got assigned to another aggregation before
1165 	 * we could enter aggr's and the port's perimeter. When a ring
1166 	 * gets deleted from an aggregation, it calls mac_stop_ring(),
1167 	 * which increments the generation number, so checking the
1168 	 * generation number is enough.
1169 	 */
1170 	if (ring->mr_gen_num != saved_gen_num && ring->mr_prh != NULL) {
1171 		i_mac_perim_exit(mip);
1172 		mac_perim_exit(p_mph);
1173 		i_mac_perim_enter(mip);
1174 		goto again;
1175 	}
1176 
1177 	/* Check if pseudo ring is still present */
1178 	if (ring->mr_prh != NULL) {
1179 		pring->mr_info.mri_intr.mi_ddi_handle = ddh;
1180 		pring->mr_info.mri_intr.mi_ddi_shared =
1181 		    ring->mr_info.mri_intr.mi_ddi_shared;
1182 		if (ddh != NULL)
1183 			mac_ring_intr_retarget(pgroup, pring);
1184 	}
1185 	i_mac_perim_exit(mip);
1186 	mac_perim_exit(p_mph);
1187 }
1188 /*
1189  * API called by a driver to provide a new interrupt handle for TX/RX
1190  * rings. This usually happens when the IRM (Interrupt Resource
1191  * Management) framework either gives the driver more MSI-X interrupts
1192  * or takes MSI-X interrupts away from the driver.
1193  */
1194 void
1195 mac_ring_intr_set(mac_ring_handle_t mrh, ddi_intr_handle_t ddh)
1196 {
1197 	mac_ring_t	*ring = (mac_ring_t *)mrh;
1198 	mac_group_t	*group = (mac_group_t *)ring->mr_gh;
1199 	mac_impl_t	*mip = (mac_impl_t *)group->mrg_mh;
1200 
1201 	i_mac_perim_enter(mip);
1202 	ring->mr_info.mri_intr.mi_ddi_handle = ddh;
1203 	if (ddh == NULL) {
1204 		/* Interrupts being reset */
1205 		ring->mr_info.mri_intr.mi_ddi_shared = B_FALSE;
1206 		if (ring->mr_prh != NULL) {
1207 			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
1208 			return;
1209 		}
1210 	} else {
1211 		/* New interrupt handle */
1212 		mac_compare_ddi_handle(mip->mi_rx_groups,
1213 		    mip->mi_rx_group_count, ring);
1214 		if (!ring->mr_info.mri_intr.mi_ddi_shared) {
1215 			mac_compare_ddi_handle(mip->mi_tx_groups,
1216 			    mip->mi_tx_group_count, ring);
1217 		}
1218 		if (ring->mr_prh != NULL) {
1219 			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
1220 			return;
1221 		} else {
1222 			mac_ring_intr_retarget(group, ring);
1223 		}
1224 	}
1225 	i_mac_perim_exit(mip);
1226 }
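
/*
 * Hedged usage sketch (hypothetical xx_* names): a driver's IRM
 * callback, registered with ddi_cb_register(9F), that re-allocates its
 * MSI-X vectors would clear the exported handles before tearing down
 * the old interrupts and publish the new handles afterwards:
 *
 *	for (i = 0; i < xxp->xx_num_rx_rings; i++)
 *		mac_ring_intr_set(xxp->xx_rx_rings[i].xr_ring_handle, NULL);
 *	xx_realloc_intrs(xxp, new_count);
 *	for (i = 0; i < xxp->xx_num_rx_rings; i++)
 *		mac_ring_intr_set(xxp->xx_rx_rings[i].xr_ring_handle,
 *		    xxp->xx_htable[xxp->xx_rx_rings[i].xr_intr_idx]);
 */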
1227 
1228 /* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */
1229 
1230 /*
1231  * Log interesting low-link-state changes and record the last state logged.
1232  */
1233 static void
1234 i_mac_log_link_state(mac_impl_t *mip)
1235 {
1236 	/*
1237 	 * If no change, then it is not interesting.
1238 	 */
1239 	if (mip->mi_lastlowlinkstate == mip->mi_lowlinkstate)
1240 		return;
1241 
1242 	switch (mip->mi_lowlinkstate) {
1243 	case LINK_STATE_UP:
1244 		if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) {
1245 			char det[200];
1246 
1247 			mip->mi_type->mt_ops.mtops_link_details(det,
1248 			    sizeof (det), (mac_handle_t)mip, mip->mi_pdata);
1249 
1250 			cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det);
1251 		} else {
1252 			cmn_err(CE_NOTE, "!%s link up", mip->mi_name);
1253 		}
1254 		break;
1255 
1256 	case LINK_STATE_DOWN:
1257 		/*
1258 		 * Only transitions from UP to DOWN are interesting
1259 		 */
1260 		if (mip->mi_lastlowlinkstate != LINK_STATE_UNKNOWN)
1261 			cmn_err(CE_NOTE, "!%s link down", mip->mi_name);
1262 		break;
1263 
1264 	case LINK_STATE_UNKNOWN:
1265 		/*
1266 		 * This case is normally not interesting.
1267 		 */
1268 		break;
1269 	}
1270 	mip->mi_lastlowlinkstate = mip->mi_lowlinkstate;
1271 }
1272 
1273 /*
1274  * Main routine for the callbacks notifications thread
1275  */
1276 static void
1277 i_mac_notify_thread(void *arg)
1278 {
1279 	mac_impl_t	*mip = arg;
1280 	callb_cpr_t	cprinfo;
1281 	mac_cb_t	*mcb;
1282 	mac_cb_info_t	*mcbi;
1283 	mac_notify_cb_t	*mncb;
1284 
1285 	mcbi = &mip->mi_notify_cb_info;
1286 	CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr,
1287 	    "i_mac_notify_thread");
1288 
1289 	mutex_enter(mcbi->mcbi_lockp);
1290 
1291 	for (;;) {
1292 		uint32_t	bits;
1293 		uint32_t	type;
1294 
1295 		bits = mip->mi_notify_bits;
1296 		if (bits == 0) {
1297 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1298 			cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
1299 			CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp);
1300 			continue;
1301 		}
1302 		mip->mi_notify_bits = 0;
1303 		if ((bits & (1 << MAC_NNOTE)) != 0) {
1304 			/* request to quit */
1305 			ASSERT(mip->mi_state_flags & MIS_DISABLED);
1306 			break;
1307 		}
1308 
1309 		mutex_exit(mcbi->mcbi_lockp);
1310 
1311 		/*
1312 		 * Log link changes on the actual link, but then do reports on
1313 		 * synthetic state (if part of a bridge).
1314 		 */
1315 		if ((bits & (1 << MAC_NOTE_LOWLINK)) != 0) {
1316 			link_state_t newstate;
1317 			mac_handle_t mh;
1318 
1319 			i_mac_log_link_state(mip);
1320 			newstate = mip->mi_lowlinkstate;
1321 			if (mip->mi_bridge_link != NULL) {
1322 				mutex_enter(&mip->mi_bridge_lock);
1323 				if ((mh = mip->mi_bridge_link) != NULL) {
1324 					newstate = mac_bridge_ls_cb(mh,
1325 					    newstate);
1326 				}
1327 				mutex_exit(&mip->mi_bridge_lock);
1328 			}
1329 			if (newstate != mip->mi_linkstate) {
1330 				mip->mi_linkstate = newstate;
1331 				bits |= 1 << MAC_NOTE_LINK;
1332 			}
1333 		}
1334 
1335 		/*
1336 		 * Depending on which capabs have changed, the Tx
1337 		 * checksum flags may also need to be updated.
1338 		 */
1339 		if ((bits & (1 << MAC_NOTE_CAPAB_CHG)) != 0) {
1340 			mac_perim_handle_t mph;
1341 			mac_handle_t mh = (mac_handle_t)mip;
1342 
1343 			mac_perim_enter_by_mh(mh, &mph);
1344 			mip->mi_tx_cksum_flags = mac_features_to_flags(mh);
1345 			mac_perim_exit(mph);
1346 		}
1347 
1348 		/*
1349 		 * Do notification callbacks for each notification type.
1350 		 */
1351 		for (type = 0; type < MAC_NNOTE; type++) {
1352 			if ((bits & (1 << type)) == 0) {
1353 				continue;
1354 			}
1355 
1356 			if (mac_notify_cb_list[type] != NULL)
1357 				(*mac_notify_cb_list[type])(mip);
1358 
1359 			/*
1360 			 * Walk the list of notifications.
1361 			 */
1362 			MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info);
1363 			for (mcb = mip->mi_notify_cb_list; mcb != NULL;
1364 			    mcb = mcb->mcb_nextp) {
1365 				mncb = (mac_notify_cb_t *)mcb->mcb_objp;
1366 				mncb->mncb_fn(mncb->mncb_arg, type);
1367 			}
1368 			MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info,
1369 			    &mip->mi_notify_cb_list);
1370 		}
1371 
1372 		mutex_enter(mcbi->mcbi_lockp);
1373 	}
1374 
1375 	mip->mi_state_flags |= MIS_NOTIFY_DONE;
1376 	cv_broadcast(&mcbi->mcbi_cv);
1377 
1378 	/* CALLB_CPR_EXIT drops the lock */
1379 	CALLB_CPR_EXIT(&cprinfo);
1380 	thread_exit();
1381 }
1382 
1383 /*
1384  * Signal the i_mac_notify_thread asking it to quit.
1385  * Then wait till it is done.
1386  */
1387 void
1388 i_mac_notify_exit(mac_impl_t *mip)
1389 {
1390 	mac_cb_info_t	*mcbi;
1391 
1392 	mcbi = &mip->mi_notify_cb_info;
1393 
1394 	mutex_enter(mcbi->mcbi_lockp);
1395 	mip->mi_notify_bits = (1 << MAC_NNOTE);
1396 	cv_broadcast(&mcbi->mcbi_cv);
1397 
1398 
1399 	while ((mip->mi_notify_thread != NULL) &&
1400 	    !(mip->mi_state_flags & MIS_NOTIFY_DONE)) {
1401 		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
1402 	}
1403 
1404 	/* Necessary cleanup before doing kmem_cache_free */
1405 	mip->mi_state_flags &= ~MIS_NOTIFY_DONE;
1406 	mip->mi_notify_bits = 0;
1407 	mip->mi_notify_thread = NULL;
1408 	mutex_exit(mcbi->mcbi_lockp);
1409 }
1410 
1411 /*
1412  * Entry point invoked by drivers to dynamically add a ring to an
1413  * existing group.
1414  */
1415 int
1416 mac_group_add_ring(mac_group_handle_t gh, int index)
1417 {
1418 	mac_group_t *group = (mac_group_t *)gh;
1419 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1420 	int ret;
1421 
1422 	i_mac_perim_enter(mip);
1423 	ret = i_mac_group_add_ring(group, NULL, index);
1424 	i_mac_perim_exit(mip);
1425 	return (ret);
1426 }
1427 
1428 /*
1429  * Entry point invoked by drivers to dynamically remove a ring
1430  * from an existing group. The specified ring handle must no longer
1431  * be used by the driver after a call to this function.
1432  */
1433 void
1434 mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh)
1435 {
1436 	mac_group_t *group = (mac_group_t *)gh;
1437 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1438 
1439 	i_mac_perim_enter(mip);
1440 	i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE);
1441 	i_mac_perim_exit(mip);
1442 }
1443 
1444 /*
1445  * mac_prop_info_*() callbacks called from the driver's prefix_propinfo()
1446  * entry points.
1447  */
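/*
 * Hedged sketch of a driver's propinfo entry point using these helpers;
 * the xx_* names and MTU limits are hypothetical:
 *
 *	static void
 *	xx_m_propinfo(void *arg, const char *name, mac_prop_id_t id,
 *	    mac_prop_info_handle_t ph)
 *	{
 *		switch (id) {
 *		case MAC_PROP_MTU:
 *			mac_prop_info_set_perm(ph, MAC_PROP_PERM_RW);
 *			mac_prop_info_set_default_uint32(ph, ETHERMTU);
 *			mac_prop_info_set_range_uint32(ph, XX_MIN_MTU,
 *			    XX_MAX_MTU);
 *			break;
 *		case MAC_PROP_AUTONEG:
 *			mac_prop_info_set_default_uint8(ph, 1);
 *			break;
 *		default:
 *			break;
 *		}
 *	}
 */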
1448 
1449 void
1450 mac_prop_info_set_default_uint8(mac_prop_info_handle_t ph, uint8_t val)
1451 {
1452 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1453 
1454 	/* nothing to do if the caller doesn't want the default value */
1455 	if (pr->pr_default == NULL)
1456 		return;
1457 
1458 	ASSERT(pr->pr_default_size >= sizeof (uint8_t));
1459 
1460 	*(uint8_t *)(pr->pr_default) = val;
1461 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1462 }
1463 
1464 void
1465 mac_prop_info_set_default_uint64(mac_prop_info_handle_t ph, uint64_t val)
1466 {
1467 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1468 
1469 	/* nothing to do if the caller doesn't want the default value */
1470 	if (pr->pr_default == NULL)
1471 		return;
1472 
1473 	ASSERT(pr->pr_default_size >= sizeof (uint64_t));
1474 
1475 	bcopy(&val, pr->pr_default, sizeof (val));
1476 
1477 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1478 }
1479 
1480 void
1481 mac_prop_info_set_default_uint32(mac_prop_info_handle_t ph, uint32_t val)
1482 {
1483 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1484 
1485 	/* nothing to do if the caller doesn't want the default value */
1486 	if (pr->pr_default == NULL)
1487 		return;
1488 
1489 	ASSERT(pr->pr_default_size >= sizeof (uint32_t));
1490 
1491 	bcopy(&val, pr->pr_default, sizeof (val));
1492 
1493 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1494 }
1495 
1496 void
1497 mac_prop_info_set_default_str(mac_prop_info_handle_t ph, const char *str)
1498 {
1499 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1500 
1501 	/* nothing to do if the caller doesn't want the default value */
1502 	if (pr->pr_default == NULL)
1503 		return;
1504 
1505 	if (strlen(str) >= pr->pr_default_size)
1506 		pr->pr_errno = ENOBUFS;
1507 	else
1508 		(void) strlcpy(pr->pr_default, str, pr->pr_default_size);
1509 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1510 }
1511 
1512 void
1513 mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph,
1514     link_flowctrl_t val)
1515 {
1516 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1517 
1518 	/* nothing to do if the caller doesn't want the default value */
1519 	if (pr->pr_default == NULL)
1520 		return;
1521 
1522 	ASSERT(pr->pr_default_size >= sizeof (link_flowctrl_t));
1523 
1524 	bcopy(&val, pr->pr_default, sizeof (val));
1525 
1526 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1527 }
1528 
1529 void
1530 mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph, uint32_t min,
1531     uint32_t max)
1532 {
1533 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1534 	mac_propval_range_t *range = pr->pr_range;
1535 	mac_propval_uint32_range_t *range32;
1536 
1537 	/* nothing to do if the caller doesn't want the range info */
1538 	if (range == NULL)
1539 		return;
1540 
1541 	if (pr->pr_range_cur_count++ == 0) {
1542 		/* first range */
1543 		pr->pr_flags |= MAC_PROP_INFO_RANGE;
1544 		range->mpr_type = MAC_PROPVAL_UINT32;
1545 	} else {
1546 		/* all ranges of a property should be of the same type */
1547 		ASSERT(range->mpr_type == MAC_PROPVAL_UINT32);
1548 		if (pr->pr_range_cur_count > range->mpr_count) {
1549 			pr->pr_errno = ENOSPC;
1550 			return;
1551 		}
1552 	}
1553 
1554 	range32 = range->mpr_range_uint32;
1555 	range32[pr->pr_range_cur_count - 1].mpur_min = min;
1556 	range32[pr->pr_range_cur_count - 1].mpur_max = max;
1557 }
1558 
1559 void
1560 mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm)
1561 {
1562 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1563 
1564 	pr->pr_perm = perm;
1565 	pr->pr_flags |= MAC_PROP_INFO_PERM;
1566 }
1567 
1568 void
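/*
 * Retrieve the hardware checksum metadata attached to an M_DATA mblk
 * (see mac_hcksum_get(9F)).  The start/stuff/end offsets are only
 * filled in for HCK_PARTIALCKSUM; any output pointer may be NULL if
 * the caller does not need that value.
 */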
1569 mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff,
1570     uint32_t *end, uint32_t *value, uint32_t *flags_ptr)
1571 {
1572 	uint32_t flags;
1573 
1574 	ASSERT(DB_TYPE(mp) == M_DATA);
1575 
1576 	flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS;
1577 	if ((flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) != 0) {
1578 		if (value != NULL)
1579 			*value = (uint32_t)DB_CKSUM16(mp);
1580 		if ((flags & HCK_PARTIALCKSUM) != 0) {
1581 			if (start != NULL)
1582 				*start = (uint32_t)DB_CKSUMSTART(mp);
1583 			if (stuff != NULL)
1584 				*stuff = (uint32_t)DB_CKSUMSTUFF(mp);
1585 			if (end != NULL)
1586 				*end = (uint32_t)DB_CKSUMEND(mp);
1587 		}
1588 	}
1589 
1590 	if (flags_ptr != NULL)
1591 		*flags_ptr = flags;
1592 }
1593 
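/*
 * Set the hardware checksum metadata on an M_DATA mblk (see
 * mac_hcksum_set(9F)).  The flags are the HCK_* values from
 * <sys/pattr.h>; the start/stuff/end offsets and the 16-bit value are
 * only meaningful for the checksum modes that consume them.
 */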
1594 void
1595 mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end,
1596     uint32_t value, uint32_t flags)
1597 {
1598 	ASSERT(DB_TYPE(mp) == M_DATA);
1599 
1600 	DB_CKSUMSTART(mp) = (intptr_t)start;
1601 	DB_CKSUMSTUFF(mp) = (intptr_t)stuff;
1602 	DB_CKSUMEND(mp) = (intptr_t)end;
1603 	DB_CKSUMFLAGS(mp) = (uint16_t)flags;
1604 	DB_CKSUM16(mp) = (uint16_t)value;
1605 }
1606 
1607 void
1608 mac_hcksum_clone(const mblk_t *src, mblk_t *dst)
1609 {
1610 	ASSERT3U(DB_TYPE(src), ==, M_DATA);
1611 	ASSERT3U(DB_TYPE(dst), ==, M_DATA);
1612 
1613 	/*
1614 	 * Do these assignments unconditionally, rather than only when
1615 	 * flags is non-zero. This protects a situation where zeroed
1616 	 * hcksum data does not make the jump onto an mblk_t with
1617 	 * stale data in those fields. It's important to copy all
1618 	 * possible flags (HCK_* as well as HW_*) and not just the
1619 	 * checksum specific flags. Dropping flags during a clone
1620 	 * could result in dropped packets. If the caller has good
1621 	 * reason to drop those flags then it should do it manually,
1622 	 * after the clone.
1623 	 */
1624 	DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src);
1625 	DB_CKSUMSTART(dst) = DB_CKSUMSTART(src);
1626 	DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src);
1627 	DB_CKSUMEND(dst) = DB_CKSUMEND(src);
1628 	DB_CKSUM16(dst) = DB_CKSUM16(src);
1629 	DB_LSOMSS(dst) = DB_LSOMSS(src);
1630 }
1631 
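/*
 * Retrieve the LSO metadata from an M_DATA mblk: *flags is set to the
 * HW_LSO bit (if present) and, when LSO is requested, *mss is set to
 * the MSS the hardware should use when segmenting the packet.
 */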
1632 void
1633 mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
1634 {
1635 	ASSERT(DB_TYPE(mp) == M_DATA);
1636 
1637 	if (flags != NULL) {
1638 		*flags = DB_CKSUMFLAGS(mp) & HW_LSO;
1639 		if ((*flags != 0) && (mss != NULL))
1640 			*mss = (uint32_t)DB_LSOMSS(mp);
1641 	}
1642 }
1643 
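/*
 * Setters used by a driver's MAC_CAPAB_TRANSCEIVER entry points to
 * report whether a transceiver is physically present and whether it is
 * usable by the device.
 */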
1644 void
1645 mac_transceiver_info_set_present(mac_transceiver_info_t *infop,
1646     boolean_t present)
1647 {
1648 	infop->mti_present = present;
1649 }
1650 
1651 void
1652 mac_transceiver_info_set_usable(mac_transceiver_info_t *infop,
1653     boolean_t usable)
1654 {
1655 	infop->mti_usable = usable;
1656 }
1657 
1658 /*
1659  * We should really keep track of our offset and not walk everything every
1660  * time. I can't imagine that this will be kind to us at high packet rates;
1661  * however, for the moment, let's leave that.
1662  *
1663  * This walks a message block chain without pulling up to fill in the context
1664  * information. Note that the data we care about could be hidden across more
1665  * than one mblk_t.
1666  */
1667 static int
1668 mac_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out)
1669 {
1670 	size_t mpsize;
1671 	uint8_t *bp;
1672 
1673 	mpsize = msgsize(mp);
1674 	/* Check for overflow */
1675 	if (off + sizeof (uint8_t) > mpsize)
1676 		return (-1);
1677 
1678 	mpsize = MBLKL(mp);
1679 	while (off >= mpsize) {
1680 		mp = mp->b_cont;
1681 		off -= mpsize;
1682 		mpsize = MBLKL(mp);
1683 	}
1684 
1685 	bp = mp->b_rptr + off;
1686 	*out = *bp;
1687 	return (0);
1688 
1689 }
1690 
1691 static int
1692 mac_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out)
1693 {
1694 	size_t mpsize;
1695 	uint8_t *bp;
1696 
1697 	mpsize = msgsize(mp);
1698 	/* Check for overflow */
1699 	if (off + sizeof (uint16_t) > mpsize)
1700 		return (-1);
1701 
1702 	mpsize = MBLKL(mp);
1703 	while (off >= mpsize) {
1704 		mp = mp->b_cont;
1705 		off -= mpsize;
1706 		mpsize = MBLKL(mp);
1707 	}
1708 
1709 	/*
1710 	 * Data is in network order. Note the second byte of data might be in
1711 	 * the next mp.
1712 	 */
1713 	bp = mp->b_rptr + off;
1714 	*out = *bp << 8;
1715 	if (off + 1 == mpsize) {
1716 		mp = mp->b_cont;
1717 		bp = mp->b_rptr;
1718 	} else {
1719 		bp++;
1720 	}
1721 
1722 	*out |= *bp;
1723 	return (0);
1724 
1725 }
1726 
1727 
1728 int
1729 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
1730 {
1731 	size_t off;
1732 	uint16_t ether;
1733 	uint8_t ipproto, iplen, l4len, maclen;
1734 
1735 	bzero(meoi, sizeof (mac_ether_offload_info_t));
1736 
1737 	meoi->meoi_len = msgsize(mp);
1738 	off = offsetof(struct ether_header, ether_type);
1739 	if (mac_meoi_get_uint16(mp, off, &ether) != 0)
1740 		return (-1);
1741 
1742 	if (ether == ETHERTYPE_VLAN) {
1743 		off = offsetof(struct ether_vlan_header, ether_type);
1744 		if (mac_meoi_get_uint16(mp, off, &ether) != 0)
1745 			return (-1);
1746 		meoi->meoi_flags |= MEOI_VLAN_TAGGED;
1747 		maclen = sizeof (struct ether_vlan_header);
1748 	} else {
1749 		maclen = sizeof (struct ether_header);
1750 	}
1751 	meoi->meoi_flags |= MEOI_L2INFO_SET;
1752 	meoi->meoi_l2hlen = maclen;
1753 	meoi->meoi_l3proto = ether;
1754 
1755 	switch (ether) {
1756 	case ETHERTYPE_IP:
1757 		/*
1758 		 * For IPv4 we need to get the length of the header, as it can
1759 		 * be variable.
1760 		 */
1761 		off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen;
1762 		if (mac_meoi_get_uint8(mp, off, &iplen) != 0)
1763 			return (-1);
1764 		iplen &= 0x0f;
1765 		if (iplen < 5 || iplen > 0x0f)
1766 			return (-1);
1767 		iplen *= 4;
1768 		off = offsetof(ipha_t, ipha_protocol) + maclen;
1769 		if (mac_meoi_get_uint8(mp, off, &ipproto) == -1)
1770 			return (-1);
1771 		break;
1772 	case ETHERTYPE_IPV6:
1773 		iplen = 40;
1774 		off = offsetof(ip6_t, ip6_nxt) + maclen;
1775 		if (mac_meoi_get_uint8(mp, off, &ipproto) == -1)
1776 			return (-1);
1777 		break;
1778 	default:
1779 		return (0);
1780 	}
1781 	meoi->meoi_l3hlen = iplen;
1782 	meoi->meoi_l4proto = ipproto;
1783 	meoi->meoi_flags |= MEOI_L3INFO_SET;
1784 
1785 	switch (ipproto) {
1786 	case IPPROTO_TCP:
1787 		off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen;
1788 		if (mac_meoi_get_uint8(mp, off, &l4len) == -1)
1789 			return (-1);
1790 		l4len = (l4len & 0xf0) >> 4;
1791 		if (l4len < 5 || l4len > 0xf)
1792 			return (-1);
1793 		l4len *= 4;
1794 		break;
1795 	case IPPROTO_UDP:
1796 		l4len = sizeof (struct udphdr);
1797 		break;
1798 	case IPPROTO_SCTP:
1799 		l4len = sizeof (sctp_hdr_t);
1800 		break;
1801 	default:
1802 		return (0);
1803 	}
1804 
1805 	meoi->meoi_l4hlen = l4len;
1806 	meoi->meoi_flags |= MEOI_L4INFO_SET;
1807 	return (0);
1808 }
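
/*
 * Hedged usage sketch (hypothetical xx_* names): a driver preparing an
 * LSO or checksum-offload context descriptor can use
 * mac_ether_offload_info() to locate the header boundaries without
 * pulling up the message:
 *
 *	mac_ether_offload_info_t meoi;
 *
 *	if (mac_ether_offload_info(mp, &meoi) != 0 ||
 *	    (meoi.meoi_flags & MEOI_L4INFO_SET) == 0)
 *		return (xx_tx_no_offload(xxp, mp));
 *	hdr_len = meoi.meoi_l2hlen + meoi.meoi_l3hlen + meoi.meoi_l4hlen;
 *	xx_fill_tx_context(txr, meoi.meoi_l3proto, meoi.meoi_l4proto,
 *	    hdr_len);
 */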
1809