xref: /illumos-gate/usr/src/uts/common/io/mac/mac_provider.c (revision d77e6e0f12d19668c0e9068c0fcd7a2123da5373)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
26  * Copyright 2020 RackTop Systems, Inc.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/conf.h>
31 #include <sys/id_space.h>
32 #include <sys/esunddi.h>
33 #include <sys/stat.h>
34 #include <sys/mkdev.h>
35 #include <sys/stream.h>
36 #include <sys/strsubr.h>
37 #include <sys/dlpi.h>
38 #include <sys/modhash.h>
39 #include <sys/mac.h>
40 #include <sys/mac_provider.h>
41 #include <sys/mac_impl.h>
42 #include <sys/mac_client_impl.h>
43 #include <sys/mac_client_priv.h>
44 #include <sys/mac_soft_ring.h>
45 #include <sys/mac_stat.h>
46 #include <sys/dld.h>
47 #include <sys/modctl.h>
48 #include <sys/fs/dv_node.h>
49 #include <sys/thread.h>
50 #include <sys/proc.h>
51 #include <sys/callb.h>
52 #include <sys/cpuvar.h>
53 #include <sys/atomic.h>
54 #include <sys/sdt.h>
55 #include <sys/mac_flow.h>
56 #include <sys/ddi_intr_impl.h>
57 #include <sys/disp.h>
58 #include <sys/sdt.h>
59 #include <sys/pattr.h>
60 #include <sys/strsun.h>
61 #include <sys/vlan.h>
62 #include <inet/ip.h>
63 #include <inet/tcp.h>
64 #include <netinet/udp.h>
65 #include <netinet/sctp.h>
66 
67 /*
68  * MAC Provider Interface.
69  *
70  * Interface for GLDv3 compatible NIC drivers.
71  */
72 
/*
 * Forward declaration of the per-MAC notification thread entry point; it is
 * handed to thread_create() by mac_register().
 */
static void i_mac_notify_thread(void *);

/* Signature of a framework-internal default notification callback. */
typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *);

/*
 * Default notification callbacks, one slot per notification type
 * (MAC_NNOTE slots in total).  A NULL slot means there is no default
 * callback for that notification; only MAC_NOTE_LINK has one.
 *
 * NOTE(review): the consumer of this table is not in this part of the
 * file; presumably it is indexed by mac_notify_type_t -- confirm.  The
 * per-slot labels below omit MAC_NOTE_DEST (which mac_dst_update() sends),
 * so they may not line up with the enum order.  This is harmless today
 * because every slot other than the first is NULL and any unlisted
 * trailing slots are zero-initialized, but the labels should be verified
 * against the enum before a non-NULL entry is added.
 */
static const mac_notify_default_cb_fn_t mac_notify_cb_list[MAC_NNOTE] = {
	mac_fanout_recompute,	/* MAC_NOTE_LINK */
	NULL,		/* MAC_NOTE_UNICST */
	NULL,		/* MAC_NOTE_TX */
	NULL,		/* MAC_NOTE_DEVPROMISC */
	NULL,		/* MAC_NOTE_FASTPATH_FLUSH */
	NULL,		/* MAC_NOTE_SDU_SIZE */
	NULL,		/* MAC_NOTE_MARGIN */
	NULL,		/* MAC_NOTE_CAPAB_CHG */
	NULL		/* MAC_NOTE_LOWLINK */
};
88 
89 /*
90  * Driver support functions.
91  */
92 
93 /* REGISTRATION */
94 
95 mac_register_t *
96 mac_alloc(uint_t mac_version)
97 {
98 	mac_register_t *mregp;
99 
100 	/*
101 	 * Make sure there isn't a version mismatch between the driver and
102 	 * the framework.  In the future, if multiple versions are
103 	 * supported, this check could become more sophisticated.
104 	 */
105 	if (mac_version != MAC_VERSION)
106 		return (NULL);
107 
108 	mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP);
109 	mregp->m_version = mac_version;
110 	return (mregp);
111 }
112 
113 void
114 mac_free(mac_register_t *mregp)
115 {
116 	kmem_free(mregp, sizeof (mac_register_t));
117 }
118 
119 /*
120  * Convert a MAC's offload features into the equivalent DB_CKSUMFLAGS
121  * value.
122  */
123 static uint16_t
124 mac_features_to_flags(mac_handle_t mh)
125 {
126 	uint16_t flags = 0;
127 	uint32_t cap_sum = 0;
128 	mac_capab_lso_t cap_lso;
129 
130 	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap_sum)) {
131 		if (cap_sum & HCKSUM_IPHDRCKSUM)
132 			flags |= HCK_IPV4_HDRCKSUM;
133 
134 		if (cap_sum & HCKSUM_INET_PARTIAL)
135 			flags |= HCK_PARTIALCKSUM;
136 		else if (cap_sum & (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
137 			flags |= HCK_FULLCKSUM;
138 	}
139 
140 	/*
141 	 * We don't need the information stored in 'cap_lso', but we
142 	 * need to pass a non-NULL pointer to appease the driver.
143 	 */
144 	if (mac_capab_get(mh, MAC_CAPAB_LSO, &cap_lso))
145 		flags |= HW_LSO;
146 
147 	return (flags);
148 }
149 
150 /*
151  * mac_register() is how drivers register new MACs with the GLDv3
152  * framework.  The mregp argument is allocated by drivers using the
153  * mac_alloc() function, and can be freed using mac_free() immediately upon
154  * return from mac_register().  Upon success (0 return value), the mhp
155  * opaque pointer becomes the driver's handle to its MAC interface, and is
156  * the argument to all other mac module entry points.
157  */
158 /* ARGSUSED */
159 int
160 mac_register(mac_register_t *mregp, mac_handle_t *mhp)
161 {
162 	mac_impl_t		*mip;
163 	mactype_t		*mtype;
164 	int			err = EINVAL;
165 	struct devnames		*dnp = NULL;
166 	uint_t			instance;
167 	boolean_t		style1_created = B_FALSE;
168 	boolean_t		style2_created = B_FALSE;
169 	char			*driver;
170 	minor_t			minor = 0;
171 
172 	/* A successful call to mac_init_ops() sets the DN_GLDV3_DRIVER flag. */
173 	if (!GLDV3_DRV(ddi_driver_major(mregp->m_dip)))
174 		return (EINVAL);
175 
176 	/* Find the required MAC-Type plugin. */
177 	if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL)
178 		return (EINVAL);
179 
180 	/* Create a mac_impl_t to represent this MAC. */
181 	mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP);
182 
183 	/*
184 	 * The mac is not ready for open yet.
185 	 */
186 	mip->mi_state_flags |= MIS_DISABLED;
187 
188 	/*
189 	 * When a mac is registered, the m_instance field can be set to:
190 	 *
191 	 *  0:	Get the mac's instance number from m_dip.
192 	 *	This is usually used for physical device dips.
193 	 *
194 	 *  [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number.
195 	 *	For example, when an aggregation is created with the key option,
196 	 *	"key" will be used as the instance number.
197 	 *
198 	 *  -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1].
199 	 *	This is often used when a MAC of a virtual link is registered
200 	 *	(e.g., aggregation when "key" is not specified, or vnic).
201 	 *
202 	 * Note that the instance number is used to derive the mi_minor field
203 	 * of mac_impl_t, which will then be used to derive the name of kstats
204 	 * and the devfs nodes.  The first 2 cases are needed to preserve
205 	 * backward compatibility.
206 	 */
207 	switch (mregp->m_instance) {
208 	case 0:
209 		instance = ddi_get_instance(mregp->m_dip);
210 		break;
211 	case ((uint_t)-1):
212 		minor = mac_minor_hold(B_TRUE);
213 		if (minor == 0) {
214 			err = ENOSPC;
215 			goto fail;
216 		}
217 		instance = minor - 1;
218 		break;
219 	default:
220 		instance = mregp->m_instance;
221 		if (instance >= MAC_MAX_MINOR) {
222 			err = EINVAL;
223 			goto fail;
224 		}
225 		break;
226 	}
227 
228 	mip->mi_minor = (minor_t)(instance + 1);
229 	mip->mi_dip = mregp->m_dip;
230 	mip->mi_clients_list = NULL;
231 	mip->mi_nclients = 0;
232 
233 	/* Set the default IEEE Port VLAN Identifier */
234 	mip->mi_pvid = 1;
235 
236 	/* Default bridge link learning protection values */
237 	mip->mi_llimit = 1000;
238 	mip->mi_ldecay = 200;
239 
240 	driver = (char *)ddi_driver_name(mip->mi_dip);
241 
242 	/* Construct the MAC name as <drvname><instance> */
243 	(void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d",
244 	    driver, instance);
245 
246 	mip->mi_driver = mregp->m_driver;
247 
248 	mip->mi_type = mtype;
249 	mip->mi_margin = mregp->m_margin;
250 	mip->mi_info.mi_media = mtype->mt_type;
251 	mip->mi_info.mi_nativemedia = mtype->mt_nativetype;
252 	if (mregp->m_max_sdu <= mregp->m_min_sdu)
253 		goto fail;
254 	if (mregp->m_multicast_sdu == 0)
255 		mregp->m_multicast_sdu = mregp->m_max_sdu;
256 	if (mregp->m_multicast_sdu < mregp->m_min_sdu ||
257 	    mregp->m_multicast_sdu > mregp->m_max_sdu)
258 		goto fail;
259 	mip->mi_sdu_min = mregp->m_min_sdu;
260 	mip->mi_sdu_max = mregp->m_max_sdu;
261 	mip->mi_sdu_multicast = mregp->m_multicast_sdu;
262 	mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length;
263 	/*
264 	 * If the media supports a broadcast address, cache a pointer to it
265 	 * in the mac_info_t so that upper layers can use it.
266 	 */
267 	mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr;
268 
269 	mip->mi_v12n_level = mregp->m_v12n;
270 
271 	/*
272 	 * Copy the unicast source address into the mac_info_t, but only if
273 	 * the MAC-Type defines a non-zero address length.  We need to
274 	 * handle MAC-Types that have an address length of 0
275 	 * (point-to-point protocol MACs for example).
276 	 */
277 	if (mip->mi_type->mt_addr_length > 0) {
278 		if (mregp->m_src_addr == NULL)
279 			goto fail;
280 		mip->mi_info.mi_unicst_addr =
281 		    kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP);
282 		bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr,
283 		    mip->mi_type->mt_addr_length);
284 
285 		/*
286 		 * Copy the fixed 'factory' MAC address from the immutable
287 		 * info.  This is taken to be the MAC address currently in
288 		 * use.
289 		 */
290 		bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr,
291 		    mip->mi_type->mt_addr_length);
292 
293 		/*
294 		 * At this point, we should set up the classification
295 		 * rules etc but we delay it till mac_open() so that
296 		 * the resource discovery has taken place and we
297 		 * know someone wants to use the device. Otherwise
298 		 * memory gets allocated for Rx ring structures even
299 		 * during probe.
300 		 */
301 
302 		/* Copy the destination address if one is provided. */
303 		if (mregp->m_dst_addr != NULL) {
304 			bcopy(mregp->m_dst_addr, mip->mi_dstaddr,
305 			    mip->mi_type->mt_addr_length);
306 			mip->mi_dstaddr_set = B_TRUE;
307 		}
308 	} else if (mregp->m_src_addr != NULL) {
309 		goto fail;
310 	}
311 
312 	/*
313 	 * The format of the m_pdata is specific to the plugin.  It is
314 	 * passed in as an argument to all of the plugin callbacks.  The
315 	 * driver can update this information by calling
316 	 * mac_pdata_update().
317 	 */
318 	if (mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY) {
319 		/*
320 		 * Verify if the supplied plugin data is valid.  Note that
321 		 * even if the caller passed in a NULL pointer as plugin data,
322 		 * we still need to verify if that's valid as the plugin may
323 		 * require plugin data to function.
324 		 */
325 		if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
326 		    mregp->m_pdata_size)) {
327 			goto fail;
328 		}
329 		if (mregp->m_pdata != NULL) {
330 			mip->mi_pdata =
331 			    kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
332 			bcopy(mregp->m_pdata, mip->mi_pdata,
333 			    mregp->m_pdata_size);
334 			mip->mi_pdata_size = mregp->m_pdata_size;
335 		}
336 	} else if (mregp->m_pdata != NULL) {
337 		/*
338 		 * The caller supplied non-NULL plugin data, but the plugin
339 		 * does not recognize plugin data.
340 		 */
341 		err = EINVAL;
342 		goto fail;
343 	}
344 
345 	/*
346 	 * Register the private properties.
347 	 */
348 	mac_register_priv_prop(mip, mregp->m_priv_props);
349 
350 	/*
351 	 * Stash the driver callbacks into the mac_impl_t, but first sanity
352 	 * check to make sure all mandatory callbacks are set.
353 	 */
354 	if (mregp->m_callbacks->mc_getstat == NULL ||
355 	    mregp->m_callbacks->mc_start == NULL ||
356 	    mregp->m_callbacks->mc_stop == NULL ||
357 	    mregp->m_callbacks->mc_setpromisc == NULL ||
358 	    mregp->m_callbacks->mc_multicst == NULL) {
359 		goto fail;
360 	}
361 	mip->mi_callbacks = mregp->m_callbacks;
362 
363 	if (mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY,
364 	    &mip->mi_capab_legacy)) {
365 		mip->mi_state_flags |= MIS_LEGACY;
366 		mip->mi_phy_dev = mip->mi_capab_legacy.ml_dev;
367 	} else {
368 		mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
369 		    mip->mi_minor);
370 	}
371 
372 	/*
373 	 * Allocate a notification thread. thread_create blocks for memory
374 	 * if needed, it never fails.
375 	 */
376 	mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
377 	    mip, 0, &p0, TS_RUN, minclsyspri);
378 
379 	/*
380 	 * Cache the DB_CKSUMFLAGS that this MAC supports.
381 	 */
382 	mip->mi_tx_cksum_flags = mac_features_to_flags((mac_handle_t)mip);
383 
384 	/*
385 	 * Initialize the capabilities
386 	 */
387 	bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t));
388 	bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t));
389 
390 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL))
391 		mip->mi_state_flags |= MIS_IS_VNIC;
392 
393 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
394 		mip->mi_state_flags |= MIS_IS_AGGR;
395 
396 	mac_addr_factory_init(mip);
397 
398 	mac_transceiver_init(mip);
399 
400 	mac_led_init(mip);
401 
402 	/*
403 	 * Enforce the virtrualization level registered.
404 	 */
405 	if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) {
406 		if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 ||
407 		    mac_init_rings(mip, MAC_RING_TYPE_TX) != 0)
408 			goto fail;
409 
410 		/*
411 		 * The driver needs to register at least rx rings for this
412 		 * virtualization level.
413 		 */
414 		if (mip->mi_rx_groups == NULL)
415 			goto fail;
416 	}
417 
418 	/*
419 	 * The driver must set mc_unicst entry point to NULL when it advertises
420 	 * CAP_RINGS for rx groups.
421 	 */
422 	if (mip->mi_rx_groups != NULL) {
423 		if (mregp->m_callbacks->mc_unicst != NULL)
424 			goto fail;
425 	} else {
426 		if (mregp->m_callbacks->mc_unicst == NULL)
427 			goto fail;
428 	}
429 
430 	/*
431 	 * Initialize MAC addresses. Must be called after mac_init_rings().
432 	 */
433 	mac_init_macaddr(mip);
434 
435 	mip->mi_share_capab.ms_snum = 0;
436 	if (mip->mi_v12n_level & MAC_VIRT_HIO) {
437 		(void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES,
438 		    &mip->mi_share_capab);
439 	}
440 
441 	/*
442 	 * Initialize the kstats for this device.
443 	 */
444 	mac_driver_stat_create(mip);
445 
446 	/* Zero out any properties. */
447 	bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t));
448 
449 	if (mip->mi_minor <= MAC_MAX_MINOR) {
450 		/* Create a style-2 DLPI device */
451 		if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0,
452 		    DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS)
453 			goto fail;
454 		style2_created = B_TRUE;
455 
456 		/* Create a style-1 DLPI device */
457 		if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR,
458 		    mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS)
459 			goto fail;
460 		style1_created = B_TRUE;
461 	}
462 
463 	mac_flow_l2tab_create(mip, &mip->mi_flow_tab);
464 
465 	rw_enter(&i_mac_impl_lock, RW_WRITER);
466 	if (mod_hash_insert(i_mac_impl_hash,
467 	    (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) {
468 		rw_exit(&i_mac_impl_lock);
469 		err = EEXIST;
470 		goto fail;
471 	}
472 
473 	DTRACE_PROBE2(mac__register, struct devnames *, dnp,
474 	    (mac_impl_t *), mip);
475 
476 	/*
477 	 * Mark the MAC to be ready for open.
478 	 */
479 	mip->mi_state_flags &= ~MIS_DISABLED;
480 	rw_exit(&i_mac_impl_lock);
481 
482 	atomic_inc_32(&i_mac_impl_count);
483 
484 	cmn_err(CE_NOTE, "!%s registered", mip->mi_name);
485 	*mhp = (mac_handle_t)mip;
486 	return (0);
487 
488 fail:
489 	if (style1_created)
490 		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
491 
492 	if (style2_created)
493 		ddi_remove_minor_node(mip->mi_dip, driver);
494 
495 	mac_addr_factory_fini(mip);
496 
497 	/* Clean up registered MAC addresses */
498 	mac_fini_macaddr(mip);
499 
500 	/* Clean up registered rings */
501 	mac_free_rings(mip, MAC_RING_TYPE_RX);
502 	mac_free_rings(mip, MAC_RING_TYPE_TX);
503 
504 	/* Clean up notification thread */
505 	if (mip->mi_notify_thread != NULL)
506 		i_mac_notify_exit(mip);
507 
508 	if (mip->mi_info.mi_unicst_addr != NULL) {
509 		kmem_free(mip->mi_info.mi_unicst_addr,
510 		    mip->mi_type->mt_addr_length);
511 		mip->mi_info.mi_unicst_addr = NULL;
512 	}
513 
514 	mac_driver_stat_delete(mip);
515 
516 	if (mip->mi_type != NULL) {
517 		atomic_dec_32(&mip->mi_type->mt_ref);
518 		mip->mi_type = NULL;
519 	}
520 
521 	if (mip->mi_pdata != NULL) {
522 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
523 		mip->mi_pdata = NULL;
524 		mip->mi_pdata_size = 0;
525 	}
526 
527 	if (minor != 0) {
528 		ASSERT(minor > MAC_MAX_MINOR);
529 		mac_minor_rele(minor);
530 	}
531 
532 	mip->mi_state_flags = 0;
533 	mac_unregister_priv_prop(mip);
534 
535 	/*
536 	 * Clear the state before destroying the mac_impl_t
537 	 */
538 	mip->mi_state_flags = 0;
539 
540 	kmem_cache_free(i_mac_impl_cachep, mip);
541 	return (err);
542 }
543 
/*
 * Unregister from the GLDv3 framework
 *
 * mh is the handle that mac_register() returned.  The call fails, leaving
 * the MAC registered, with the error from mac_disable_nowait() if the MAC
 * cannot be disabled; otherwise everything mac_register() set up is torn
 * down, the mac_impl_t is returned to its cache and 0 is returned.  The
 * handle must not be used after a successful return.
 */
int
mac_unregister(mac_handle_t mh)
{
	int			err;
	mac_impl_t		*mip = (mac_impl_t *)mh;
	mod_hash_val_t		val;
	mac_margin_req_t	*mmr, *nextmmr;

	/* Fail the unregister if there are any open references to this mac. */
	if ((err = mac_disable_nowait(mh)) != 0)
		return (err);

	/*
	 * Clean up notification thread and wait for it to exit.
	 */
	i_mac_notify_exit(mip);

	/*
	 * Prior to acquiring the MAC perimeter, remove the MAC instance from
	 * the internal hash table. Such removal means table-walkers that
	 * acquire the perimeter will not do so on behalf of what we are
	 * unregistering, which prevents a deadlock.
	 */
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	(void) mod_hash_remove(i_mac_impl_hash,
	    (mod_hash_key_t)mip->mi_name, &val);
	rw_exit(&i_mac_impl_lock);
	ASSERT(mip == (mac_impl_t *)val);

	i_mac_perim_enter(mip);

	/*
	 * There are still resource properties configured over this mac.
	 */
	if (mip->mi_resource_props.mrp_mask != 0)
		mac_fastpath_enable((mac_handle_t)mip);

	/*
	 * mac_register() only creates the style-1 and style-2 minor nodes
	 * for minors up to and including MAC_MAX_MINOR, so only those need
	 * to be removed.
	 */
	if (mip->mi_minor < MAC_MAX_MINOR + 1) {
		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
		ddi_remove_minor_node(mip->mi_dip,
		    (char *)ddi_driver_name(mip->mi_dip));
	}

	ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags &
	    MIS_EXCLUSIVE));

	mac_driver_stat_delete(mip);

	ASSERT(i_mac_impl_count > 0);
	atomic_dec_32(&i_mac_impl_count);

	/* Release the copy of the plugin data, if any. */
	if (mip->mi_pdata != NULL)
		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
	mip->mi_pdata = NULL;
	mip->mi_pdata_size = 0;

	/*
	 * Free the list of margin requests.
	 */
	for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
		nextmmr = mmr->mmr_nextp;
		kmem_free(mmr, sizeof (mac_margin_req_t));
	}
	mip->mi_mmrp = NULL;

	mip->mi_linkstate = mip->mi_lowlinkstate = LINK_STATE_UNKNOWN;
	/*
	 * The unicast address buffer was sized from mt_addr_length in
	 * mac_register(), so it must be freed before mi_type is cleared.
	 */
	kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
	mip->mi_info.mi_unicst_addr = NULL;

	/* Drop this MAC's reference on its MAC-Type plugin. */
	atomic_dec_32(&mip->mi_type->mt_ref);
	mip->mi_type = NULL;

	/*
	 * Free the primary MAC address.
	 */
	mac_fini_macaddr(mip);

	/*
	 * free all rings
	 */
	mac_free_rings(mip, MAC_RING_TYPE_RX);
	mac_free_rings(mip, MAC_RING_TYPE_TX);

	mac_addr_factory_fini(mip);

	bzero(mip->mi_addr, MAXMACADDRLEN);
	bzero(mip->mi_dstaddr, MAXMACADDRLEN);
	mip->mi_dstaddr_set = B_FALSE;

	/* and the flows */
	mac_flow_tab_destroy(mip->mi_flow_tab);
	mip->mi_flow_tab = NULL;

	/*
	 * Minors above MAC_MAX_MINOR are the ones mac_register() obtained
	 * from mac_minor_hold(); give the number back.
	 */
	if (mip->mi_minor > MAC_MAX_MINOR)
		mac_minor_rele(mip->mi_minor);

	cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);

	/*
	 * Reset the perim related fields to default values before
	 * kmem_cache_free
	 */
	i_mac_perim_exit(mip);
	mip->mi_state_flags = 0;

	mac_unregister_priv_prop(mip);

	ASSERT(mip->mi_bridge_link == NULL);
	kmem_cache_free(i_mac_impl_cachep, mip);

	return (0);
}
659 
660 /* DATA RECEPTION */
661 
662 /*
663  * This function is invoked for packets received by the MAC driver in
664  * interrupt context. The ring generation number provided by the driver
665  * is matched with the ring generation number held in MAC. If they do not
666  * match, received packets are considered stale packets coming from an older
667  * assignment of the ring. Drop them.
668  */
669 void
670 mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain,
671     uint64_t mr_gen_num)
672 {
673 	mac_ring_t		*mr = (mac_ring_t *)mrh;
674 
675 	if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) {
676 		DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t,
677 		    mr->mr_gen_num, uint64_t, mr_gen_num);
678 		freemsgchain(mp_chain);
679 		return;
680 	}
681 	mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain);
682 }
683 
684 /*
685  * This function is invoked for each packet received by the underlying driver.
686  */
687 void
688 mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
689 {
690 	mac_impl_t *mip = (mac_impl_t *)mh;
691 
692 	/*
693 	 * Check if the link is part of a bridge.  If not, then we don't need
694 	 * to take the lock to remain consistent.  Make this common case
695 	 * lock-free and tail-call optimized.
696 	 */
697 	if (mip->mi_bridge_link == NULL) {
698 		mac_rx_common(mh, mrh, mp_chain);
699 	} else {
700 		/*
701 		 * Once we take a reference on the bridge link, the bridge
702 		 * module itself can't unload, so the callback pointers are
703 		 * stable.
704 		 */
705 		mutex_enter(&mip->mi_bridge_lock);
706 		if ((mh = mip->mi_bridge_link) != NULL)
707 			mac_bridge_ref_cb(mh, B_TRUE);
708 		mutex_exit(&mip->mi_bridge_lock);
709 		if (mh == NULL) {
710 			mac_rx_common((mac_handle_t)mip, mrh, mp_chain);
711 		} else {
712 			mac_bridge_rx_cb(mh, mrh, mp_chain);
713 			mac_bridge_ref_cb(mh, B_FALSE);
714 		}
715 	}
716 }
717 
718 /*
719  * Special case function: this allows snooping of packets transmitted and
720  * received by TRILL. By design, they go directly into the TRILL module.
721  */
722 void
723 mac_trill_snoop(mac_handle_t mh, mblk_t *mp)
724 {
725 	mac_impl_t *mip = (mac_impl_t *)mh;
726 
727 	if (mip->mi_promisc_list != NULL)
728 		mac_promisc_dispatch(mip, mp, NULL, B_FALSE);
729 }
730 
/*
 * This is the upward reentry point for packets arriving from the bridging
 * module and from mac_rx for links not part of a bridge.
 *
 * mh is the MAC the packets arrived on, mrh the ring they arrived on (or
 * NULL when the driver did not specify one) and mp_chain the packet chain.
 * The chain is always consumed: it is either handed to a receiver or
 * freed.
 *
 * Delivery is attempted in this order:
 *  1. with a ring: the passthru callback or the ring's SRS, depending on
 *     the ring's classification type;
 *  2. without a ring: the single active client's flow callback, if there
 *     is such a client and a reference on its flow can be taken;
 *  3. otherwise: software classification through the flow table.
 */
void
mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
{
	mac_impl_t		*mip = (mac_impl_t *)mh;
	mac_ring_t		*mr = (mac_ring_t *)mrh;
	mac_soft_ring_set_t	*mac_srs;
	mblk_t			*bp = mp_chain;

	/*
	 * If there are any promiscuous mode callbacks defined for
	 * this MAC, pass them a copy if appropriate.
	 */
	if (mip->mi_promisc_list != NULL)
		mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE);

	if (mr != NULL) {
		/*
		 * If the SRS teardown has started, just return. The 'mr'
		 * continues to be valid until the driver unregisters the MAC.
		 * Hardware classified packets will not make their way up
		 * beyond this point once the teardown has started. The driver
		 * is never passed a pointer to a flow entry or SRS or any
		 * structure that can be freed much before mac_unregister.
		 */
		mutex_enter(&mr->mr_lock);
		if ((mr->mr_state != MR_INUSE) || (mr->mr_flag &
		    (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) {
			mutex_exit(&mr->mr_lock);
			freemsgchain(mp_chain);
			return;
		}

		/*
		 * The ring is in passthru mode; pass the chain up to
		 * the pseudo ring.
		 */
		if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) {
			/* Hold the ring across the callback, then drop it. */
			MR_REFHOLD_LOCKED(mr);
			mutex_exit(&mr->mr_lock);
			mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain,
			    B_FALSE);
			MR_REFRELE(mr);
			return;
		}

		/*
		 * The passthru callback should only be set when in
		 * MAC_PASSTHRU_CLASSIFIER mode.
		 */
		ASSERT3P(mr->mr_pt_fn, ==, NULL);

		/*
		 * We check if an SRS is controlling this ring.
		 * If so, we can directly call the srs_lower_proc
		 * routine otherwise we need to go through mac_rx_classify
		 * to reach the right place.
		 */
		if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
			MR_REFHOLD_LOCKED(mr);
			mutex_exit(&mr->mr_lock);
			ASSERT3P(mr->mr_srs, !=, NULL);
			mac_srs = mr->mr_srs;

			/*
			 * This is the fast path. All packets received
			 * on this ring are hardware classified and
			 * share the same MAC header info.
			 */
			mac_srs->srs_rx.sr_lower_proc(mh,
			    (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE);
			MR_REFRELE(mr);
			return;
		}

		mutex_exit(&mr->mr_lock);
		/* We'll fall through to software classification */
	} else {
		flow_entry_t *flent;
		int err;

		/*
		 * No ring was given.  If the MAC has a single active client,
		 * try to deliver straight to that client's flow; if the flow
		 * reference cannot be taken (err != 0), fall through to
		 * software classification below.
		 */
		rw_enter(&mip->mi_rw_lock, RW_READER);
		if (mip->mi_single_active_client != NULL) {
			flent = mip->mi_single_active_client->mci_flent_list;
			FLOW_TRY_REFHOLD(flent, err);
			rw_exit(&mip->mi_rw_lock);
			if (err == 0) {
				(flent->fe_cb_fn)(flent->fe_cb_arg1,
				    flent->fe_cb_arg2, mp_chain, B_FALSE);
				FLOW_REFRELE(flent);
				return;
			}
		} else {
			rw_exit(&mip->mi_rw_lock);
		}
	}

	/*
	 * Software classification.  mac_rx_flow() returns whatever part of
	 * the chain it did not deliver; NULL means nothing is left.
	 */
	if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
		if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL)
			return;
	}

	/* Nobody claimed what remains of the chain: drop it. */
	freemsgchain(bp);
}
838 
839 /* DATA TRANSMISSION */
840 
/*
 * A driver's notification to resume transmission, in case of a provider
 * without TX rings.
 *
 * This is mac_tx_ring_update() with a NULL ring handle.
 */
void
mac_tx_update(mac_handle_t mh)
{
	mac_tx_ring_update(mh, NULL);
}
850 
/*
 * A driver's notification to resume transmission on the specified TX ring.
 *
 * mh is the driver's MAC handle and rh the ring that can accept packets
 * again; mac_tx_update() passes a NULL rh for providers without TX rings.
 * The work is delegated to i_mac_tx_srs_notify().
 */
void
mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh)
{
	i_mac_tx_srs_notify((mac_impl_t *)mh, rh);
}
859 
860 /* LINK STATE */
861 /*
862  * Notify the MAC layer about a link state change
863  */
864 void
865 mac_link_update(mac_handle_t mh, link_state_t link)
866 {
867 	mac_impl_t	*mip = (mac_impl_t *)mh;
868 
869 	/*
870 	 * Save the link state.
871 	 */
872 	mip->mi_lowlinkstate = link;
873 
874 	/*
875 	 * Send a MAC_NOTE_LOWLINK notification.  This tells the notification
876 	 * thread to deliver both lower and upper notifications.
877 	 */
878 	i_mac_notify(mip, MAC_NOTE_LOWLINK);
879 }
880 
881 /*
882  * Notify the MAC layer about a link state change due to bridging.
883  */
884 void
885 mac_link_redo(mac_handle_t mh, link_state_t link)
886 {
887 	mac_impl_t	*mip = (mac_impl_t *)mh;
888 
889 	/*
890 	 * Save the link state.
891 	 */
892 	mip->mi_linkstate = link;
893 
894 	/*
895 	 * Send a MAC_NOTE_LINK notification.  Only upper notifications are
896 	 * made.
897 	 */
898 	i_mac_notify(mip, MAC_NOTE_LINK);
899 }
900 
901 /* MINOR NODE HANDLING */
902 
/*
 * Given a dev_t, return the instance number (PPA) associated with it.
 * Drivers can use this in their getinfo(9e) implementation to lookup
 * the instance number (i.e. PPA) of the device, to use as an index to
 * their own array of soft state structures.
 *
 * Returns -1 on error.
 *
 * The lookup itself is performed by dld_devt_to_instance(); this function
 * simply returns its result.
 */
int
mac_devt_to_instance(dev_t devt)
{
	return (dld_devt_to_instance(devt));
}
916 
/*
 * This function returns the first minor number that is available for
 * driver private use.  All minor numbers smaller than this are
 * reserved for GLDv3 use.
 *
 * The value returned is the MAC_PRIVATE_MINOR constant.
 */
minor_t
mac_private_minor(void)
{
	return (MAC_PRIVATE_MINOR);
}
927 
928 /* OTHER CONTROL INFORMATION */
929 
930 /*
931  * A driver notified us that its primary MAC address has changed.
932  */
933 void
934 mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
935 {
936 	mac_impl_t	*mip = (mac_impl_t *)mh;
937 
938 	if (mip->mi_type->mt_addr_length == 0)
939 		return;
940 
941 	i_mac_perim_enter(mip);
942 
943 	/*
944 	 * If address changes, freshen the MAC address value and update
945 	 * all MAC clients that share this MAC address.
946 	 */
947 	if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) != 0) {
948 		mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr),
949 		    (uint8_t *)addr);
950 	}
951 
952 	i_mac_perim_exit(mip);
953 
954 	/*
955 	 * Send a MAC_NOTE_UNICST notification.
956 	 */
957 	i_mac_notify(mip, MAC_NOTE_UNICST);
958 }
959 
960 void
961 mac_dst_update(mac_handle_t mh, const uint8_t *addr)
962 {
963 	mac_impl_t	*mip = (mac_impl_t *)mh;
964 
965 	if (mip->mi_type->mt_addr_length == 0)
966 		return;
967 
968 	i_mac_perim_enter(mip);
969 	bcopy(addr, mip->mi_dstaddr, mip->mi_type->mt_addr_length);
970 	i_mac_perim_exit(mip);
971 	i_mac_notify(mip, MAC_NOTE_DEST);
972 }
973 
974 /*
975  * MAC plugin information changed.
976  */
977 int
978 mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
979 {
980 	mac_impl_t	*mip = (mac_impl_t *)mh;
981 
982 	/*
983 	 * Verify that the plugin supports MAC plugin data and that the
984 	 * supplied data is valid.
985 	 */
986 	if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
987 		return (EINVAL);
988 	if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
989 		return (EINVAL);
990 
991 	if (mip->mi_pdata != NULL)
992 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
993 
994 	mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
995 	bcopy(mac_pdata, mip->mi_pdata, dsize);
996 	mip->mi_pdata_size = dsize;
997 
998 	/*
999 	 * Since the MAC plugin data is used to construct MAC headers that
1000 	 * were cached in fast-path headers, we need to flush fast-path
1001 	 * information for links associated with this mac.
1002 	 */
1003 	i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
1004 	return (0);
1005 }
1006 
/*
 * The mac provider or mac framework calls this function when it wants
 * to notify upstream consumers that the capabilities have changed and
 * that they should modify their own internal state accordingly.
 *
 * We currently have no regard for the fact that a provider could
 * decide to drop capabilities which would invalidate pending traffic.
 * For example, if one was to disable the Tx checksum offload while
 * TCP/IP traffic was being sent by mac clients relying on that
 * feature, then those packets would hit the wire with missing or
 * partial checksums. A proper solution involves not only providing
 * notification, but also performing client quiescing. That is, a capab
 * change should be treated as an atomic transaction that forms a
 * barrier between traffic relying on the current capabs and traffic
 * relying on the new capabs. In practice, simnet is currently the
 * only provider that could hit this, and it's an easily avoidable
 * situation (and at worst it should only lead to some dropped
 * packets). But if we ever want better on-the-fly capab change to
 * actual hardware providers, then we should give this update
 * mechanism a proper implementation.
 */
void
mac_capab_update(mac_handle_t mh)
{
	/*
	 * Send a MAC_NOTE_CAPAB_CHG notification to alert upstream
	 * clients to renegotiate capabilities.
	 */
	i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG);
}
1037 
1038 /*
1039  * Used by normal drivers to update the max sdu size.
1040  * We need to handle the case of a smaller mi_sdu_multicast
1041  * since this is called by mac_set_mtu() even for drivers that
1042  * have differing unicast and multicast mtu and we don't want to
1043  * increase the multicast mtu by accident in that case.
1044  */
1045 int
1046 mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max)
1047 {
1048 	mac_impl_t	*mip = (mac_impl_t *)mh;
1049 
1050 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
1051 		return (EINVAL);
1052 	mip->mi_sdu_max = sdu_max;
1053 	if (mip->mi_sdu_multicast > mip->mi_sdu_max)
1054 		mip->mi_sdu_multicast = mip->mi_sdu_max;
1055 
1056 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1057 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1058 	return (0);
1059 }
1060 
1061 /*
1062  * Version of the above function that is used by drivers that have a different
1063  * max sdu size for multicast/broadcast vs. unicast.
1064  */
1065 int
1066 mac_maxsdu_update2(mac_handle_t mh, uint_t sdu_max, uint_t sdu_multicast)
1067 {
1068 	mac_impl_t	*mip = (mac_impl_t *)mh;
1069 
1070 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
1071 		return (EINVAL);
1072 	if (sdu_multicast == 0)
1073 		sdu_multicast = sdu_max;
1074 	if (sdu_multicast > sdu_max || sdu_multicast < mip->mi_sdu_min)
1075 		return (EINVAL);
1076 	mip->mi_sdu_max = sdu_max;
1077 	mip->mi_sdu_multicast = sdu_multicast;
1078 
1079 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1080 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1081 	return (0);
1082 }
1083 
1084 static void
1085 mac_ring_intr_retarget(mac_group_t *group, mac_ring_t *ring)
1086 {
1087 	mac_client_impl_t *mcip;
1088 	flow_entry_t *flent;
1089 	mac_soft_ring_set_t *mac_rx_srs;
1090 	mac_cpus_t *srs_cpu;
1091 	int i;
1092 
1093 	if (((mcip = MAC_GROUP_ONLY_CLIENT(group)) != NULL) &&
1094 	    (!ring->mr_info.mri_intr.mi_ddi_shared)) {
1095 		/* interrupt can be re-targeted */
1096 		ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
1097 		flent = mcip->mci_flent;
1098 		if (ring->mr_type == MAC_RING_TYPE_RX) {
1099 			for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1100 				mac_rx_srs = flent->fe_rx_srs[i];
1101 				if (mac_rx_srs->srs_ring != ring)
1102 					continue;
1103 				srs_cpu = &mac_rx_srs->srs_cpu;
1104 				mutex_enter(&cpu_lock);
1105 				mac_rx_srs_retarget_intr(mac_rx_srs,
1106 				    srs_cpu->mc_rx_intr_cpu);
1107 				mutex_exit(&cpu_lock);
1108 				break;
1109 			}
1110 		} else {
1111 			if (flent->fe_tx_srs != NULL) {
1112 				mutex_enter(&cpu_lock);
1113 				mac_tx_srs_retarget_intr(
1114 				    flent->fe_tx_srs);
1115 				mutex_exit(&cpu_lock);
1116 			}
1117 		}
1118 	}
1119 }
1120 
1121 /*
1122  * Clients like aggr create pseudo rings (mac_ring_t) and expose them to
1123  * their clients. There is a 1-1 mapping pseudo ring and the hardware
1124  * ring. ddi interrupt handles are exported from the hardware ring to
1125  * the pseudo ring. Thus when the interrupt handle changes, clients of
1126  * aggr that are using the handle need to use the new handle and
1127  * re-target their interrupts.
1128  */
1129 static void
1130 mac_pseudo_ring_intr_retarget(mac_impl_t *mip, mac_ring_t *ring,
1131     ddi_intr_handle_t ddh)
1132 {
1133 	mac_ring_t *pring;
1134 	mac_group_t *pgroup;
1135 	mac_impl_t *pmip;
1136 	char macname[MAXNAMELEN];
1137 	mac_perim_handle_t p_mph;
1138 	uint64_t saved_gen_num;
1139 
1140 again:
1141 	pring = (mac_ring_t *)ring->mr_prh;
1142 	pgroup = (mac_group_t *)pring->mr_gh;
1143 	pmip = (mac_impl_t *)pgroup->mrg_mh;
1144 	saved_gen_num = ring->mr_gen_num;
1145 	(void) strlcpy(macname, pmip->mi_name, MAXNAMELEN);
1146 	/*
1147 	 * We need to enter aggr's perimeter. The locking hierarchy
1148 	 * dictates that aggr's perimeter should be entered first
1149 	 * and then the port's perimeter. So drop the port's
1150 	 * perimeter, enter aggr's and then re-enter port's
1151 	 * perimeter.
1152 	 */
1153 	i_mac_perim_exit(mip);
1154 	/*
1155 	 * While we know pmip is the aggr's mip, there is a
1156 	 * possibility that aggr could have unregistered by
1157 	 * the time we exit port's perimeter (mip) and
1158 	 * enter aggr's perimeter (pmip). To avoid that
1159 	 * scenario, enter aggr's perimeter using its name.
1160 	 */
1161 	if (mac_perim_enter_by_macname(macname, &p_mph) != 0)
1162 		return;
1163 	i_mac_perim_enter(mip);
1164 	/*
1165 	 * Check if the ring got assigned to another aggregation before
1166 	 * be could enter aggr's and the port's perimeter. When a ring
1167 	 * gets deleted from an aggregation, it calls mac_stop_ring()
1168 	 * which increments the generation number. So checking
1169 	 * generation number will be enough.
1170 	 */
1171 	if (ring->mr_gen_num != saved_gen_num && ring->mr_prh != NULL) {
1172 		i_mac_perim_exit(mip);
1173 		mac_perim_exit(p_mph);
1174 		i_mac_perim_enter(mip);
1175 		goto again;
1176 	}
1177 
1178 	/* Check if pseudo ring is still present */
1179 	if (ring->mr_prh != NULL) {
1180 		pring->mr_info.mri_intr.mi_ddi_handle = ddh;
1181 		pring->mr_info.mri_intr.mi_ddi_shared =
1182 		    ring->mr_info.mri_intr.mi_ddi_shared;
1183 		if (ddh != NULL)
1184 			mac_ring_intr_retarget(pgroup, pring);
1185 	}
1186 	i_mac_perim_exit(mip);
1187 	mac_perim_exit(p_mph);
1188 }
1189 /*
1190  * API called by driver to provide new interrupt handle for TX/RX rings.
1191  * This usually happens when IRM (Interrupt Resource Manangement)
1192  * framework either gives the driver more MSI-x interrupts or takes
1193  * away MSI-x interrupts from the driver.
1194  */
1195 void
1196 mac_ring_intr_set(mac_ring_handle_t mrh, ddi_intr_handle_t ddh)
1197 {
1198 	mac_ring_t	*ring = (mac_ring_t *)mrh;
1199 	mac_group_t	*group = (mac_group_t *)ring->mr_gh;
1200 	mac_impl_t	*mip = (mac_impl_t *)group->mrg_mh;
1201 
1202 	i_mac_perim_enter(mip);
1203 	ring->mr_info.mri_intr.mi_ddi_handle = ddh;
1204 	if (ddh == NULL) {
1205 		/* Interrupts being reset */
1206 		ring->mr_info.mri_intr.mi_ddi_shared = B_FALSE;
1207 		if (ring->mr_prh != NULL) {
1208 			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
1209 			return;
1210 		}
1211 	} else {
1212 		/* New interrupt handle */
1213 		mac_compare_ddi_handle(mip->mi_rx_groups,
1214 		    mip->mi_rx_group_count, ring);
1215 		if (!ring->mr_info.mri_intr.mi_ddi_shared) {
1216 			mac_compare_ddi_handle(mip->mi_tx_groups,
1217 			    mip->mi_tx_group_count, ring);
1218 		}
1219 		if (ring->mr_prh != NULL) {
1220 			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
1221 			return;
1222 		} else {
1223 			mac_ring_intr_retarget(group, ring);
1224 		}
1225 	}
1226 	i_mac_perim_exit(mip);
1227 }
1228 
1229 /* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */
1230 
1231 /*
1232  * Updates the mac_impl structure with the current state of the link
1233  */
1234 static void
1235 i_mac_log_link_state(mac_impl_t *mip)
1236 {
1237 	/*
1238 	 * If no change, then it is not interesting.
1239 	 */
1240 	if (mip->mi_lastlowlinkstate == mip->mi_lowlinkstate)
1241 		return;
1242 
1243 	switch (mip->mi_lowlinkstate) {
1244 	case LINK_STATE_UP:
1245 		if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) {
1246 			char det[200];
1247 
1248 			mip->mi_type->mt_ops.mtops_link_details(det,
1249 			    sizeof (det), (mac_handle_t)mip, mip->mi_pdata);
1250 
1251 			cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det);
1252 		} else {
1253 			cmn_err(CE_NOTE, "!%s link up", mip->mi_name);
1254 		}
1255 		break;
1256 
1257 	case LINK_STATE_DOWN:
1258 		/*
1259 		 * Only transitions from UP to DOWN are interesting
1260 		 */
1261 		if (mip->mi_lastlowlinkstate != LINK_STATE_UNKNOWN)
1262 			cmn_err(CE_NOTE, "!%s link down", mip->mi_name);
1263 		break;
1264 
1265 	case LINK_STATE_UNKNOWN:
1266 		/*
1267 		 * This case is normally not interesting.
1268 		 */
1269 		break;
1270 	}
1271 	mip->mi_lastlowlinkstate = mip->mi_lowlinkstate;
1272 }
1273 
1274 /*
1275  * Main routine for the callbacks notifications thread
1276  */
1277 static void
1278 i_mac_notify_thread(void *arg)
1279 {
1280 	mac_impl_t	*mip = arg;
1281 	callb_cpr_t	cprinfo;
1282 	mac_cb_t	*mcb;
1283 	mac_cb_info_t	*mcbi;
1284 	mac_notify_cb_t	*mncb;
1285 
1286 	mcbi = &mip->mi_notify_cb_info;
1287 	CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr,
1288 	    "i_mac_notify_thread");
1289 
1290 	mutex_enter(mcbi->mcbi_lockp);
1291 
1292 	for (;;) {
1293 		uint32_t	bits;
1294 		uint32_t	type;
1295 
1296 		bits = mip->mi_notify_bits;
1297 		if (bits == 0) {
1298 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1299 			cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
1300 			CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp);
1301 			continue;
1302 		}
1303 		mip->mi_notify_bits = 0;
1304 		if ((bits & (1 << MAC_NNOTE)) != 0) {
1305 			/* request to quit */
1306 			ASSERT(mip->mi_state_flags & MIS_DISABLED);
1307 			break;
1308 		}
1309 
1310 		mutex_exit(mcbi->mcbi_lockp);
1311 
1312 		/*
1313 		 * Log link changes on the actual link, but then do reports on
1314 		 * synthetic state (if part of a bridge).
1315 		 */
1316 		if ((bits & (1 << MAC_NOTE_LOWLINK)) != 0) {
1317 			link_state_t newstate;
1318 			mac_handle_t mh;
1319 
1320 			i_mac_log_link_state(mip);
1321 			newstate = mip->mi_lowlinkstate;
1322 			if (mip->mi_bridge_link != NULL) {
1323 				mutex_enter(&mip->mi_bridge_lock);
1324 				if ((mh = mip->mi_bridge_link) != NULL) {
1325 					newstate = mac_bridge_ls_cb(mh,
1326 					    newstate);
1327 				}
1328 				mutex_exit(&mip->mi_bridge_lock);
1329 			}
1330 			if (newstate != mip->mi_linkstate) {
1331 				mip->mi_linkstate = newstate;
1332 				bits |= 1 << MAC_NOTE_LINK;
1333 			}
1334 		}
1335 
1336 		/*
1337 		 * Depending on which capabs have changed, the Tx
1338 		 * checksum flags may also need to be updated.
1339 		 */
1340 		if ((bits & (1 << MAC_NOTE_CAPAB_CHG)) != 0) {
1341 			mac_perim_handle_t mph;
1342 			mac_handle_t mh = (mac_handle_t)mip;
1343 
1344 			mac_perim_enter_by_mh(mh, &mph);
1345 			mip->mi_tx_cksum_flags = mac_features_to_flags(mh);
1346 			mac_perim_exit(mph);
1347 		}
1348 
1349 		/*
1350 		 * Do notification callbacks for each notification type.
1351 		 */
1352 		for (type = 0; type < MAC_NNOTE; type++) {
1353 			if ((bits & (1 << type)) == 0) {
1354 				continue;
1355 			}
1356 
1357 			if (mac_notify_cb_list[type] != NULL)
1358 				(*mac_notify_cb_list[type])(mip);
1359 
1360 			/*
1361 			 * Walk the list of notifications.
1362 			 */
1363 			MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info);
1364 			for (mcb = mip->mi_notify_cb_list; mcb != NULL;
1365 			    mcb = mcb->mcb_nextp) {
1366 				mncb = (mac_notify_cb_t *)mcb->mcb_objp;
1367 				mncb->mncb_fn(mncb->mncb_arg, type);
1368 			}
1369 			MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info,
1370 			    &mip->mi_notify_cb_list);
1371 		}
1372 
1373 		mutex_enter(mcbi->mcbi_lockp);
1374 	}
1375 
1376 	mip->mi_state_flags |= MIS_NOTIFY_DONE;
1377 	cv_broadcast(&mcbi->mcbi_cv);
1378 
1379 	/* CALLB_CPR_EXIT drops the lock */
1380 	CALLB_CPR_EXIT(&cprinfo);
1381 	thread_exit();
1382 }
1383 
1384 /*
1385  * Signal the i_mac_notify_thread asking it to quit.
1386  * Then wait till it is done.
1387  */
1388 void
1389 i_mac_notify_exit(mac_impl_t *mip)
1390 {
1391 	mac_cb_info_t	*mcbi;
1392 
1393 	mcbi = &mip->mi_notify_cb_info;
1394 
1395 	mutex_enter(mcbi->mcbi_lockp);
1396 	mip->mi_notify_bits = (1 << MAC_NNOTE);
1397 	cv_broadcast(&mcbi->mcbi_cv);
1398 
1399 
1400 	while ((mip->mi_notify_thread != NULL) &&
1401 	    !(mip->mi_state_flags & MIS_NOTIFY_DONE)) {
1402 		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
1403 	}
1404 
1405 	/* Necessary clean up before doing kmem_cache_free */
1406 	mip->mi_state_flags &= ~MIS_NOTIFY_DONE;
1407 	mip->mi_notify_bits = 0;
1408 	mip->mi_notify_thread = NULL;
1409 	mutex_exit(mcbi->mcbi_lockp);
1410 }
1411 
1412 /*
1413  * Entry point invoked by drivers to dynamically add a ring to an
1414  * existing group.
1415  */
1416 int
1417 mac_group_add_ring(mac_group_handle_t gh, int index)
1418 {
1419 	mac_group_t *group = (mac_group_t *)gh;
1420 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1421 	int ret;
1422 
1423 	i_mac_perim_enter(mip);
1424 	ret = i_mac_group_add_ring(group, NULL, index);
1425 	i_mac_perim_exit(mip);
1426 	return (ret);
1427 }
1428 
1429 /*
1430  * Entry point invoked by drivers to dynamically remove a ring
1431  * from an existing group. The specified ring handle must no longer
1432  * be used by the driver after a call to this function.
1433  */
1434 void
1435 mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh)
1436 {
1437 	mac_group_t *group = (mac_group_t *)gh;
1438 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1439 
1440 	i_mac_perim_enter(mip);
1441 	i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE);
1442 	i_mac_perim_exit(mip);
1443 }
1444 
1445 /*
1446  * mac_prop_info_*() callbacks called from the driver's prefix_propinfo()
1447  * entry points.
1448  */
1449 
1450 void
1451 mac_prop_info_set_default_uint8(mac_prop_info_handle_t ph, uint8_t val)
1452 {
1453 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1454 
1455 	/* nothing to do if the caller doesn't want the default value */
1456 	if (pr->pr_default == NULL)
1457 		return;
1458 
1459 	ASSERT(pr->pr_default_size >= sizeof (uint8_t));
1460 
1461 	*(uint8_t *)(pr->pr_default) = val;
1462 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1463 }
1464 
1465 void
1466 mac_prop_info_set_default_uint64(mac_prop_info_handle_t ph, uint64_t val)
1467 {
1468 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1469 
1470 	/* nothing to do if the caller doesn't want the default value */
1471 	if (pr->pr_default == NULL)
1472 		return;
1473 
1474 	ASSERT(pr->pr_default_size >= sizeof (uint64_t));
1475 
1476 	bcopy(&val, pr->pr_default, sizeof (val));
1477 
1478 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1479 }
1480 
1481 void
1482 mac_prop_info_set_default_uint32(mac_prop_info_handle_t ph, uint32_t val)
1483 {
1484 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1485 
1486 	/* nothing to do if the caller doesn't want the default value */
1487 	if (pr->pr_default == NULL)
1488 		return;
1489 
1490 	ASSERT(pr->pr_default_size >= sizeof (uint32_t));
1491 
1492 	bcopy(&val, pr->pr_default, sizeof (val));
1493 
1494 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1495 }
1496 
1497 void
1498 mac_prop_info_set_default_str(mac_prop_info_handle_t ph, const char *str)
1499 {
1500 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1501 
1502 	/* nothing to do if the caller doesn't want the default value */
1503 	if (pr->pr_default == NULL)
1504 		return;
1505 
1506 	if (strlen(str) >= pr->pr_default_size)
1507 		pr->pr_errno = ENOBUFS;
1508 	else
1509 		(void) strlcpy(pr->pr_default, str, pr->pr_default_size);
1510 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1511 }
1512 
1513 void
1514 mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph,
1515     link_flowctrl_t val)
1516 {
1517 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1518 
1519 	/* nothing to do if the caller doesn't want the default value */
1520 	if (pr->pr_default == NULL)
1521 		return;
1522 
1523 	ASSERT(pr->pr_default_size >= sizeof (link_flowctrl_t));
1524 
1525 	bcopy(&val, pr->pr_default, sizeof (val));
1526 
1527 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1528 }
1529 
1530 void
1531 mac_prop_info_set_default_fec(mac_prop_info_handle_t ph, link_fec_t val)
1532 {
1533 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1534 
1535 	/* nothing to do if the caller doesn't want the default value */
1536 	if (pr->pr_default == NULL)
1537 		return;
1538 
1539 	ASSERT(pr->pr_default_size >= sizeof (link_fec_t));
1540 
1541 	bcopy(&val, pr->pr_default, sizeof (val));
1542 
1543 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1544 }
1545 
1546 void
1547 mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph, uint32_t min,
1548     uint32_t max)
1549 {
1550 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1551 	mac_propval_range_t *range = pr->pr_range;
1552 	mac_propval_uint32_range_t *range32;
1553 
1554 	/* nothing to do if the caller doesn't want the range info */
1555 	if (range == NULL)
1556 		return;
1557 
1558 	if (pr->pr_range_cur_count++ == 0) {
1559 		/* first range */
1560 		pr->pr_flags |= MAC_PROP_INFO_RANGE;
1561 		range->mpr_type = MAC_PROPVAL_UINT32;
1562 	} else {
1563 		/* all ranges of a property should be of the same type */
1564 		ASSERT(range->mpr_type == MAC_PROPVAL_UINT32);
1565 		if (pr->pr_range_cur_count > range->mpr_count) {
1566 			pr->pr_errno = ENOSPC;
1567 			return;
1568 		}
1569 	}
1570 
1571 	range32 = range->mpr_range_uint32;
1572 	range32[pr->pr_range_cur_count - 1].mpur_min = min;
1573 	range32[pr->pr_range_cur_count - 1].mpur_max = max;
1574 }
1575 
1576 void
1577 mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm)
1578 {
1579 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1580 
1581 	pr->pr_perm = perm;
1582 	pr->pr_flags |= MAC_PROP_INFO_PERM;
1583 }
1584 
1585 void
1586 mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff,
1587     uint32_t *end, uint32_t *value, uint32_t *flags_ptr)
1588 {
1589 	uint32_t flags;
1590 
1591 	ASSERT(DB_TYPE(mp) == M_DATA);
1592 
1593 	flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS;
1594 	if ((flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) != 0) {
1595 		if (value != NULL)
1596 			*value = (uint32_t)DB_CKSUM16(mp);
1597 		if ((flags & HCK_PARTIALCKSUM) != 0) {
1598 			if (start != NULL)
1599 				*start = (uint32_t)DB_CKSUMSTART(mp);
1600 			if (stuff != NULL)
1601 				*stuff = (uint32_t)DB_CKSUMSTUFF(mp);
1602 			if (end != NULL)
1603 				*end = (uint32_t)DB_CKSUMEND(mp);
1604 		}
1605 	}
1606 
1607 	if (flags_ptr != NULL)
1608 		*flags_ptr = flags;
1609 }
1610 
1611 void
1612 mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end,
1613     uint32_t value, uint32_t flags)
1614 {
1615 	ASSERT(DB_TYPE(mp) == M_DATA);
1616 
1617 	DB_CKSUMSTART(mp) = (intptr_t)start;
1618 	DB_CKSUMSTUFF(mp) = (intptr_t)stuff;
1619 	DB_CKSUMEND(mp) = (intptr_t)end;
1620 	DB_CKSUMFLAGS(mp) = (uint16_t)flags;
1621 	DB_CKSUM16(mp) = (uint16_t)value;
1622 }
1623 
1624 void
1625 mac_hcksum_clone(const mblk_t *src, mblk_t *dst)
1626 {
1627 	ASSERT3U(DB_TYPE(src), ==, M_DATA);
1628 	ASSERT3U(DB_TYPE(dst), ==, M_DATA);
1629 
1630 	/*
1631 	 * Do these assignments unconditionally, rather than only when
1632 	 * flags is non-zero. This protects a situation where zeroed
1633 	 * hcksum data does not make the jump onto an mblk_t with
1634 	 * stale data in those fields. It's important to copy all
1635 	 * possible flags (HCK_* as well as HW_*) and not just the
1636 	 * checksum specific flags. Dropping flags during a clone
1637 	 * could result in dropped packets. If the caller has good
1638 	 * reason to drop those flags then it should do it manually,
1639 	 * after the clone.
1640 	 */
1641 	DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src);
1642 	DB_CKSUMSTART(dst) = DB_CKSUMSTART(src);
1643 	DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src);
1644 	DB_CKSUMEND(dst) = DB_CKSUMEND(src);
1645 	DB_CKSUM16(dst) = DB_CKSUM16(src);
1646 	DB_LSOMSS(dst) = DB_LSOMSS(src);
1647 }
1648 
1649 void
1650 mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
1651 {
1652 	ASSERT(DB_TYPE(mp) == M_DATA);
1653 
1654 	if (flags != NULL) {
1655 		*flags = DB_CKSUMFLAGS(mp) & HW_LSO;
1656 		if ((*flags != 0) && (mss != NULL))
1657 			*mss = (uint32_t)DB_LSOMSS(mp);
1658 	}
1659 }
1660 
1661 void
1662 mac_transceiver_info_set_present(mac_transceiver_info_t *infop,
1663     boolean_t present)
1664 {
1665 	infop->mti_present = present;
1666 }
1667 
1668 void
1669 mac_transceiver_info_set_usable(mac_transceiver_info_t *infop,
1670     boolean_t usable)
1671 {
1672 	infop->mti_usable = usable;
1673 }
1674 
1675 /*
1676  * We should really keep track of our offset and not walk everything every
1677  * time. I can't imagine that this will be kind to us at high packet rates;
1678  * however, for the moment, let's leave that.
1679  *
1680  * This walks a message block chain without pulling up to fill in the context
1681  * information. Note that the data we care about could be hidden across more
1682  * than one mblk_t.
1683  */
1684 static int
1685 mac_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out)
1686 {
1687 	size_t mpsize;
1688 	uint8_t *bp;
1689 
1690 	mpsize = msgsize(mp);
1691 	/* Check for overflow */
1692 	if (off + sizeof (uint16_t) > mpsize)
1693 		return (-1);
1694 
1695 	mpsize = MBLKL(mp);
1696 	while (off >= mpsize) {
1697 		mp = mp->b_cont;
1698 		off -= mpsize;
1699 		mpsize = MBLKL(mp);
1700 	}
1701 
1702 	bp = mp->b_rptr + off;
1703 	*out = *bp;
1704 	return (0);
1705 
1706 }
1707 
1708 static int
1709 mac_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out)
1710 {
1711 	size_t mpsize;
1712 	uint8_t *bp;
1713 
1714 	mpsize = msgsize(mp);
1715 	/* Check for overflow */
1716 	if (off + sizeof (uint16_t) > mpsize)
1717 		return (-1);
1718 
1719 	mpsize = MBLKL(mp);
1720 	while (off >= mpsize) {
1721 		mp = mp->b_cont;
1722 		off -= mpsize;
1723 		mpsize = MBLKL(mp);
1724 	}
1725 
1726 	/*
1727 	 * Data is in network order. Note the second byte of data might be in
1728 	 * the next mp.
1729 	 */
1730 	bp = mp->b_rptr + off;
1731 	*out = *bp << 8;
1732 	if (off + 1 == mpsize) {
1733 		mp = mp->b_cont;
1734 		bp = mp->b_rptr;
1735 	} else {
1736 		bp++;
1737 	}
1738 
1739 	*out |= *bp;
1740 	return (0);
1741 
1742 }
1743 
1744 
1745 int
1746 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
1747 {
1748 	size_t off;
1749 	uint16_t ether;
1750 	uint8_t ipproto, iplen, l4len, maclen;
1751 
1752 	bzero(meoi, sizeof (mac_ether_offload_info_t));
1753 
1754 	meoi->meoi_len = msgsize(mp);
1755 	off = offsetof(struct ether_header, ether_type);
1756 	if (mac_meoi_get_uint16(mp, off, &ether) != 0)
1757 		return (-1);
1758 
1759 	if (ether == ETHERTYPE_VLAN) {
1760 		off = offsetof(struct ether_vlan_header, ether_type);
1761 		if (mac_meoi_get_uint16(mp, off, &ether) != 0)
1762 			return (-1);
1763 		meoi->meoi_flags |= MEOI_VLAN_TAGGED;
1764 		maclen = sizeof (struct ether_vlan_header);
1765 	} else {
1766 		maclen = sizeof (struct ether_header);
1767 	}
1768 	meoi->meoi_flags |= MEOI_L2INFO_SET;
1769 	meoi->meoi_l2hlen = maclen;
1770 	meoi->meoi_l3proto = ether;
1771 
1772 	switch (ether) {
1773 	case ETHERTYPE_IP:
1774 		/*
1775 		 * For IPv4 we need to get the length of the header, as it can
1776 		 * be variable.
1777 		 */
1778 		off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen;
1779 		if (mac_meoi_get_uint8(mp, off, &iplen) != 0)
1780 			return (-1);
1781 		iplen &= 0x0f;
1782 		if (iplen < 5 || iplen > 0x0f)
1783 			return (-1);
1784 		iplen *= 4;
1785 		off = offsetof(ipha_t, ipha_protocol) + maclen;
1786 		if (mac_meoi_get_uint8(mp, off, &ipproto) == -1)
1787 			return (-1);
1788 		break;
1789 	case ETHERTYPE_IPV6:
1790 		iplen = 40;
1791 		off = offsetof(ip6_t, ip6_nxt) + maclen;
1792 		if (mac_meoi_get_uint8(mp, off, &ipproto) == -1)
1793 			return (-1);
1794 		break;
1795 	default:
1796 		return (0);
1797 	}
1798 	meoi->meoi_l3hlen = iplen;
1799 	meoi->meoi_l4proto = ipproto;
1800 	meoi->meoi_flags |= MEOI_L3INFO_SET;
1801 
1802 	switch (ipproto) {
1803 	case IPPROTO_TCP:
1804 		off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen;
1805 		if (mac_meoi_get_uint8(mp, off, &l4len) == -1)
1806 			return (-1);
1807 		l4len = (l4len & 0xf0) >> 4;
1808 		if (l4len < 5 || l4len > 0xf)
1809 			return (-1);
1810 		l4len *= 4;
1811 		break;
1812 	case IPPROTO_UDP:
1813 		l4len = sizeof (struct udphdr);
1814 		break;
1815 	case IPPROTO_SCTP:
1816 		l4len = sizeof (sctp_hdr_t);
1817 		break;
1818 	default:
1819 		return (0);
1820 	}
1821 
1822 	meoi->meoi_l4hlen = l4len;
1823 	meoi->meoi_flags |= MEOI_L4INFO_SET;
1824 	return (0);
1825 }
1826