xref: /illumos-gate/usr/src/uts/common/io/mac/mac_provider.c (revision ec71f88e58593e3077f03588d3c38e6cbd4e8c1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2018 Joyent, Inc.
25  * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/conf.h>
30 #include <sys/id_space.h>
31 #include <sys/esunddi.h>
32 #include <sys/stat.h>
33 #include <sys/mkdev.h>
34 #include <sys/stream.h>
35 #include <sys/strsubr.h>
36 #include <sys/dlpi.h>
37 #include <sys/modhash.h>
38 #include <sys/mac.h>
39 #include <sys/mac_provider.h>
40 #include <sys/mac_impl.h>
41 #include <sys/mac_client_impl.h>
42 #include <sys/mac_client_priv.h>
43 #include <sys/mac_soft_ring.h>
44 #include <sys/mac_stat.h>
45 #include <sys/dld.h>
46 #include <sys/modctl.h>
47 #include <sys/fs/dv_node.h>
48 #include <sys/thread.h>
49 #include <sys/proc.h>
50 #include <sys/callb.h>
51 #include <sys/cpuvar.h>
52 #include <sys/atomic.h>
53 #include <sys/sdt.h>
54 #include <sys/mac_flow.h>
55 #include <sys/ddi_intr_impl.h>
56 #include <sys/disp.h>
57 #include <sys/sdt.h>
58 #include <sys/pattr.h>
59 #include <sys/strsun.h>
60 #include <sys/vlan.h>
61 #include <inet/ip.h>
62 #include <inet/tcp.h>
63 #include <netinet/udp.h>
64 #include <netinet/sctp.h>
65 
66 /*
67  * MAC Provider Interface.
68  *
69  * Interface for GLDv3 compatible NIC drivers.
70  */
71 
/* Notification thread entry point; defined later in this file. */
static void i_mac_notify_thread(void *);

/* Signature of a framework-default handler run for a notification type. */
typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *);

/*
 * Table of framework-default notification handlers, one slot per
 * notification type (MAC_NNOTE slots in total).  The trailing comment on
 * each entry names the notification type the slot belongs to.  A NULL slot
 * means no default handler is defined for that type; only MAC_NOTE_LINK has
 * one (mac_fanout_recompute).
 *
 * NOTE(review): presumably consumed by i_mac_notify_thread() before the
 * per-client callbacks run -- confirm against its definition.
 */
static const mac_notify_default_cb_fn_t mac_notify_cb_list[MAC_NNOTE] = {
	mac_fanout_recompute,	/* MAC_NOTE_LINK */
	NULL,		/* MAC_NOTE_UNICST */
	NULL,		/* MAC_NOTE_TX */
	NULL,		/* MAC_NOTE_DEVPROMISC */
	NULL,		/* MAC_NOTE_FASTPATH_FLUSH */
	NULL,		/* MAC_NOTE_SDU_SIZE */
	NULL,		/* MAC_NOTE_MARGIN */
	NULL,		/* MAC_NOTE_CAPAB_CHG */
	NULL		/* MAC_NOTE_LOWLINK */
};
87 
88 /*
89  * Driver support functions.
90  */
91 
92 /* REGISTRATION */
93 
94 mac_register_t *
95 mac_alloc(uint_t mac_version)
96 {
97 	mac_register_t *mregp;
98 
99 	/*
100 	 * Make sure there isn't a version mismatch between the driver and
101 	 * the framework.  In the future, if multiple versions are
102 	 * supported, this check could become more sophisticated.
103 	 */
104 	if (mac_version != MAC_VERSION)
105 		return (NULL);
106 
107 	mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP);
108 	mregp->m_version = mac_version;
109 	return (mregp);
110 }
111 
112 void
113 mac_free(mac_register_t *mregp)
114 {
115 	kmem_free(mregp, sizeof (mac_register_t));
116 }
117 
118 /*
119  * mac_register() is how drivers register new MACs with the GLDv3
120  * framework.  The mregp argument is allocated by drivers using the
121  * mac_alloc() function, and can be freed using mac_free() immediately upon
122  * return from mac_register().  Upon success (0 return value), the mhp
123  * opaque pointer becomes the driver's handle to its MAC interface, and is
124  * the argument to all other mac module entry points.
125  */
126 /* ARGSUSED */
127 int
128 mac_register(mac_register_t *mregp, mac_handle_t *mhp)
129 {
130 	mac_impl_t		*mip;
131 	mactype_t		*mtype;
132 	int			err = EINVAL;
133 	struct devnames		*dnp = NULL;
134 	uint_t			instance;
135 	boolean_t		style1_created = B_FALSE;
136 	boolean_t		style2_created = B_FALSE;
137 	char			*driver;
138 	minor_t			minor = 0;
139 
140 	/* A successful call to mac_init_ops() sets the DN_GLDV3_DRIVER flag. */
141 	if (!GLDV3_DRV(ddi_driver_major(mregp->m_dip)))
142 		return (EINVAL);
143 
144 	/* Find the required MAC-Type plugin. */
145 	if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL)
146 		return (EINVAL);
147 
148 	/* Create a mac_impl_t to represent this MAC. */
149 	mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP);
150 
151 	/*
152 	 * The mac is not ready for open yet.
153 	 */
154 	mip->mi_state_flags |= MIS_DISABLED;
155 
156 	/*
157 	 * When a mac is registered, the m_instance field can be set to:
158 	 *
159 	 *  0:	Get the mac's instance number from m_dip.
160 	 *	This is usually used for physical device dips.
161 	 *
162 	 *  [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number.
163 	 *	For example, when an aggregation is created with the key option,
164 	 *	"key" will be used as the instance number.
165 	 *
166 	 *  -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1].
167 	 *	This is often used when a MAC of a virtual link is registered
168 	 *	(e.g., aggregation when "key" is not specified, or vnic).
169 	 *
170 	 * Note that the instance number is used to derive the mi_minor field
171 	 * of mac_impl_t, which will then be used to derive the name of kstats
172 	 * and the devfs nodes.  The first 2 cases are needed to preserve
173 	 * backward compatibility.
174 	 */
175 	switch (mregp->m_instance) {
176 	case 0:
177 		instance = ddi_get_instance(mregp->m_dip);
178 		break;
179 	case ((uint_t)-1):
180 		minor = mac_minor_hold(B_TRUE);
181 		if (minor == 0) {
182 			err = ENOSPC;
183 			goto fail;
184 		}
185 		instance = minor - 1;
186 		break;
187 	default:
188 		instance = mregp->m_instance;
189 		if (instance >= MAC_MAX_MINOR) {
190 			err = EINVAL;
191 			goto fail;
192 		}
193 		break;
194 	}
195 
196 	mip->mi_minor = (minor_t)(instance + 1);
197 	mip->mi_dip = mregp->m_dip;
198 	mip->mi_clients_list = NULL;
199 	mip->mi_nclients = 0;
200 
201 	/* Set the default IEEE Port VLAN Identifier */
202 	mip->mi_pvid = 1;
203 
204 	/* Default bridge link learning protection values */
205 	mip->mi_llimit = 1000;
206 	mip->mi_ldecay = 200;
207 
208 	driver = (char *)ddi_driver_name(mip->mi_dip);
209 
210 	/* Construct the MAC name as <drvname><instance> */
211 	(void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d",
212 	    driver, instance);
213 
214 	mip->mi_driver = mregp->m_driver;
215 
216 	mip->mi_type = mtype;
217 	mip->mi_margin = mregp->m_margin;
218 	mip->mi_info.mi_media = mtype->mt_type;
219 	mip->mi_info.mi_nativemedia = mtype->mt_nativetype;
220 	if (mregp->m_max_sdu <= mregp->m_min_sdu)
221 		goto fail;
222 	if (mregp->m_multicast_sdu == 0)
223 		mregp->m_multicast_sdu = mregp->m_max_sdu;
224 	if (mregp->m_multicast_sdu < mregp->m_min_sdu ||
225 	    mregp->m_multicast_sdu > mregp->m_max_sdu)
226 		goto fail;
227 	mip->mi_sdu_min = mregp->m_min_sdu;
228 	mip->mi_sdu_max = mregp->m_max_sdu;
229 	mip->mi_sdu_multicast = mregp->m_multicast_sdu;
230 	mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length;
231 	/*
232 	 * If the media supports a broadcast address, cache a pointer to it
233 	 * in the mac_info_t so that upper layers can use it.
234 	 */
235 	mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr;
236 
237 	mip->mi_v12n_level = mregp->m_v12n;
238 
239 	/*
240 	 * Copy the unicast source address into the mac_info_t, but only if
241 	 * the MAC-Type defines a non-zero address length.  We need to
242 	 * handle MAC-Types that have an address length of 0
243 	 * (point-to-point protocol MACs for example).
244 	 */
245 	if (mip->mi_type->mt_addr_length > 0) {
246 		if (mregp->m_src_addr == NULL)
247 			goto fail;
248 		mip->mi_info.mi_unicst_addr =
249 		    kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP);
250 		bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr,
251 		    mip->mi_type->mt_addr_length);
252 
253 		/*
254 		 * Copy the fixed 'factory' MAC address from the immutable
255 		 * info.  This is taken to be the MAC address currently in
256 		 * use.
257 		 */
258 		bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr,
259 		    mip->mi_type->mt_addr_length);
260 
261 		/*
262 		 * At this point, we should set up the classification
263 		 * rules etc but we delay it till mac_open() so that
264 		 * the resource discovery has taken place and we
265 		 * know someone wants to use the device. Otherwise
266 		 * memory gets allocated for Rx ring structures even
267 		 * during probe.
268 		 */
269 
270 		/* Copy the destination address if one is provided. */
271 		if (mregp->m_dst_addr != NULL) {
272 			bcopy(mregp->m_dst_addr, mip->mi_dstaddr,
273 			    mip->mi_type->mt_addr_length);
274 			mip->mi_dstaddr_set = B_TRUE;
275 		}
276 	} else if (mregp->m_src_addr != NULL) {
277 		goto fail;
278 	}
279 
280 	/*
281 	 * The format of the m_pdata is specific to the plugin.  It is
282 	 * passed in as an argument to all of the plugin callbacks.  The
283 	 * driver can update this information by calling
284 	 * mac_pdata_update().
285 	 */
286 	if (mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY) {
287 		/*
288 		 * Verify if the supplied plugin data is valid.  Note that
289 		 * even if the caller passed in a NULL pointer as plugin data,
290 		 * we still need to verify if that's valid as the plugin may
291 		 * require plugin data to function.
292 		 */
293 		if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
294 		    mregp->m_pdata_size)) {
295 			goto fail;
296 		}
297 		if (mregp->m_pdata != NULL) {
298 			mip->mi_pdata =
299 			    kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
300 			bcopy(mregp->m_pdata, mip->mi_pdata,
301 			    mregp->m_pdata_size);
302 			mip->mi_pdata_size = mregp->m_pdata_size;
303 		}
304 	} else if (mregp->m_pdata != NULL) {
305 		/*
306 		 * The caller supplied non-NULL plugin data, but the plugin
307 		 * does not recognize plugin data.
308 		 */
309 		err = EINVAL;
310 		goto fail;
311 	}
312 
313 	/*
314 	 * Register the private properties.
315 	 */
316 	mac_register_priv_prop(mip, mregp->m_priv_props);
317 
318 	/*
319 	 * Stash the driver callbacks into the mac_impl_t, but first sanity
320 	 * check to make sure all mandatory callbacks are set.
321 	 */
322 	if (mregp->m_callbacks->mc_getstat == NULL ||
323 	    mregp->m_callbacks->mc_start == NULL ||
324 	    mregp->m_callbacks->mc_stop == NULL ||
325 	    mregp->m_callbacks->mc_setpromisc == NULL ||
326 	    mregp->m_callbacks->mc_multicst == NULL) {
327 		goto fail;
328 	}
329 	mip->mi_callbacks = mregp->m_callbacks;
330 
331 	if (mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY,
332 	    &mip->mi_capab_legacy)) {
333 		mip->mi_state_flags |= MIS_LEGACY;
334 		mip->mi_phy_dev = mip->mi_capab_legacy.ml_dev;
335 	} else {
336 		mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
337 		    mip->mi_minor);
338 	}
339 
340 	/*
341 	 * Allocate a notification thread. thread_create blocks for memory
342 	 * if needed, it never fails.
343 	 */
344 	mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
345 	    mip, 0, &p0, TS_RUN, minclsyspri);
346 
347 	/*
348 	 * Initialize the capabilities
349 	 */
350 
351 	bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t));
352 	bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t));
353 
354 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL))
355 		mip->mi_state_flags |= MIS_IS_VNIC;
356 
357 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
358 		mip->mi_state_flags |= MIS_IS_AGGR;
359 
360 	mac_addr_factory_init(mip);
361 
362 	mac_transceiver_init(mip);
363 
364 	mac_led_init(mip);
365 
366 	/*
367 	 * Enforce the virtrualization level registered.
368 	 */
369 	if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) {
370 		if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 ||
371 		    mac_init_rings(mip, MAC_RING_TYPE_TX) != 0)
372 			goto fail;
373 
374 		/*
375 		 * The driver needs to register at least rx rings for this
376 		 * virtualization level.
377 		 */
378 		if (mip->mi_rx_groups == NULL)
379 			goto fail;
380 	}
381 
382 	/*
383 	 * The driver must set mc_unicst entry point to NULL when it advertises
384 	 * CAP_RINGS for rx groups.
385 	 */
386 	if (mip->mi_rx_groups != NULL) {
387 		if (mregp->m_callbacks->mc_unicst != NULL)
388 			goto fail;
389 	} else {
390 		if (mregp->m_callbacks->mc_unicst == NULL)
391 			goto fail;
392 	}
393 
394 	/*
395 	 * Initialize MAC addresses. Must be called after mac_init_rings().
396 	 */
397 	mac_init_macaddr(mip);
398 
399 	mip->mi_share_capab.ms_snum = 0;
400 	if (mip->mi_v12n_level & MAC_VIRT_HIO) {
401 		(void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES,
402 		    &mip->mi_share_capab);
403 	}
404 
405 	/*
406 	 * Initialize the kstats for this device.
407 	 */
408 	mac_driver_stat_create(mip);
409 
410 	/* Zero out any properties. */
411 	bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t));
412 
413 	if (mip->mi_minor <= MAC_MAX_MINOR) {
414 		/* Create a style-2 DLPI device */
415 		if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0,
416 		    DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS)
417 			goto fail;
418 		style2_created = B_TRUE;
419 
420 		/* Create a style-1 DLPI device */
421 		if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR,
422 		    mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS)
423 			goto fail;
424 		style1_created = B_TRUE;
425 	}
426 
427 	mac_flow_l2tab_create(mip, &mip->mi_flow_tab);
428 
429 	rw_enter(&i_mac_impl_lock, RW_WRITER);
430 	if (mod_hash_insert(i_mac_impl_hash,
431 	    (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) {
432 		rw_exit(&i_mac_impl_lock);
433 		err = EEXIST;
434 		goto fail;
435 	}
436 
437 	DTRACE_PROBE2(mac__register, struct devnames *, dnp,
438 	    (mac_impl_t *), mip);
439 
440 	/*
441 	 * Mark the MAC to be ready for open.
442 	 */
443 	mip->mi_state_flags &= ~MIS_DISABLED;
444 	rw_exit(&i_mac_impl_lock);
445 
446 	atomic_inc_32(&i_mac_impl_count);
447 
448 	cmn_err(CE_NOTE, "!%s registered", mip->mi_name);
449 	*mhp = (mac_handle_t)mip;
450 	return (0);
451 
452 fail:
453 	if (style1_created)
454 		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
455 
456 	if (style2_created)
457 		ddi_remove_minor_node(mip->mi_dip, driver);
458 
459 	mac_addr_factory_fini(mip);
460 
461 	/* Clean up registered MAC addresses */
462 	mac_fini_macaddr(mip);
463 
464 	/* Clean up registered rings */
465 	mac_free_rings(mip, MAC_RING_TYPE_RX);
466 	mac_free_rings(mip, MAC_RING_TYPE_TX);
467 
468 	/* Clean up notification thread */
469 	if (mip->mi_notify_thread != NULL)
470 		i_mac_notify_exit(mip);
471 
472 	if (mip->mi_info.mi_unicst_addr != NULL) {
473 		kmem_free(mip->mi_info.mi_unicst_addr,
474 		    mip->mi_type->mt_addr_length);
475 		mip->mi_info.mi_unicst_addr = NULL;
476 	}
477 
478 	mac_driver_stat_delete(mip);
479 
480 	if (mip->mi_type != NULL) {
481 		atomic_dec_32(&mip->mi_type->mt_ref);
482 		mip->mi_type = NULL;
483 	}
484 
485 	if (mip->mi_pdata != NULL) {
486 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
487 		mip->mi_pdata = NULL;
488 		mip->mi_pdata_size = 0;
489 	}
490 
491 	if (minor != 0) {
492 		ASSERT(minor > MAC_MAX_MINOR);
493 		mac_minor_rele(minor);
494 	}
495 
496 	mip->mi_state_flags = 0;
497 	mac_unregister_priv_prop(mip);
498 
499 	/*
500 	 * Clear the state before destroying the mac_impl_t
501 	 */
502 	mip->mi_state_flags = 0;
503 
504 	kmem_cache_free(i_mac_impl_cachep, mip);
505 	return (err);
506 }
507 
508 /*
509  * Unregister from the GLDv3 framework
510  */
511 int
512 mac_unregister(mac_handle_t mh)
513 {
514 	int			err;
515 	mac_impl_t		*mip = (mac_impl_t *)mh;
516 	mod_hash_val_t		val;
517 	mac_margin_req_t	*mmr, *nextmmr;
518 
519 	/* Fail the unregister if there are any open references to this mac. */
520 	if ((err = mac_disable_nowait(mh)) != 0)
521 		return (err);
522 
523 	/*
524 	 * Clean up notification thread and wait for it to exit.
525 	 */
526 	i_mac_notify_exit(mip);
527 
528 	/*
529 	 * Prior to acquiring the MAC perimeter, remove the MAC instance from
530 	 * the internal hash table. Such removal means table-walkers that
531 	 * acquire the perimeter will not do so on behalf of what we are
532 	 * unregistering, which prevents a deadlock.
533 	 */
534 	rw_enter(&i_mac_impl_lock, RW_WRITER);
535 	(void) mod_hash_remove(i_mac_impl_hash,
536 	    (mod_hash_key_t)mip->mi_name, &val);
537 	rw_exit(&i_mac_impl_lock);
538 	ASSERT(mip == (mac_impl_t *)val);
539 
540 	i_mac_perim_enter(mip);
541 
542 	/*
543 	 * There is still resource properties configured over this mac.
544 	 */
545 	if (mip->mi_resource_props.mrp_mask != 0)
546 		mac_fastpath_enable((mac_handle_t)mip);
547 
548 	if (mip->mi_minor < MAC_MAX_MINOR + 1) {
549 		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
550 		ddi_remove_minor_node(mip->mi_dip,
551 		    (char *)ddi_driver_name(mip->mi_dip));
552 	}
553 
554 	ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags &
555 	    MIS_EXCLUSIVE));
556 
557 	mac_driver_stat_delete(mip);
558 
559 	ASSERT(i_mac_impl_count > 0);
560 	atomic_dec_32(&i_mac_impl_count);
561 
562 	if (mip->mi_pdata != NULL)
563 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
564 	mip->mi_pdata = NULL;
565 	mip->mi_pdata_size = 0;
566 
567 	/*
568 	 * Free the list of margin request.
569 	 */
570 	for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
571 		nextmmr = mmr->mmr_nextp;
572 		kmem_free(mmr, sizeof (mac_margin_req_t));
573 	}
574 	mip->mi_mmrp = NULL;
575 
576 	mip->mi_linkstate = mip->mi_lowlinkstate = LINK_STATE_UNKNOWN;
577 	kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
578 	mip->mi_info.mi_unicst_addr = NULL;
579 
580 	atomic_dec_32(&mip->mi_type->mt_ref);
581 	mip->mi_type = NULL;
582 
583 	/*
584 	 * Free the primary MAC address.
585 	 */
586 	mac_fini_macaddr(mip);
587 
588 	/*
589 	 * free all rings
590 	 */
591 	mac_free_rings(mip, MAC_RING_TYPE_RX);
592 	mac_free_rings(mip, MAC_RING_TYPE_TX);
593 
594 	mac_addr_factory_fini(mip);
595 
596 	bzero(mip->mi_addr, MAXMACADDRLEN);
597 	bzero(mip->mi_dstaddr, MAXMACADDRLEN);
598 	mip->mi_dstaddr_set = B_FALSE;
599 
600 	/* and the flows */
601 	mac_flow_tab_destroy(mip->mi_flow_tab);
602 	mip->mi_flow_tab = NULL;
603 
604 	if (mip->mi_minor > MAC_MAX_MINOR)
605 		mac_minor_rele(mip->mi_minor);
606 
607 	cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);
608 
609 	/*
610 	 * Reset the perim related fields to default values before
611 	 * kmem_cache_free
612 	 */
613 	i_mac_perim_exit(mip);
614 	mip->mi_state_flags = 0;
615 
616 	mac_unregister_priv_prop(mip);
617 
618 	ASSERT(mip->mi_bridge_link == NULL);
619 	kmem_cache_free(i_mac_impl_cachep, mip);
620 
621 	return (0);
622 }
623 
624 /* DATA RECEPTION */
625 
626 /*
627  * This function is invoked for packets received by the MAC driver in
628  * interrupt context. The ring generation number provided by the driver
629  * is matched with the ring generation number held in MAC. If they do not
630  * match, received packets are considered stale packets coming from an older
631  * assignment of the ring. Drop them.
632  */
633 void
634 mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain,
635     uint64_t mr_gen_num)
636 {
637 	mac_ring_t		*mr = (mac_ring_t *)mrh;
638 
639 	if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) {
640 		DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t,
641 		    mr->mr_gen_num, uint64_t, mr_gen_num);
642 		freemsgchain(mp_chain);
643 		return;
644 	}
645 	mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain);
646 }
647 
648 /*
649  * This function is invoked for each packet received by the underlying driver.
650  */
651 void
652 mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
653 {
654 	mac_impl_t *mip = (mac_impl_t *)mh;
655 
656 	/*
657 	 * Check if the link is part of a bridge.  If not, then we don't need
658 	 * to take the lock to remain consistent.  Make this common case
659 	 * lock-free and tail-call optimized.
660 	 */
661 	if (mip->mi_bridge_link == NULL) {
662 		mac_rx_common(mh, mrh, mp_chain);
663 	} else {
664 		/*
665 		 * Once we take a reference on the bridge link, the bridge
666 		 * module itself can't unload, so the callback pointers are
667 		 * stable.
668 		 */
669 		mutex_enter(&mip->mi_bridge_lock);
670 		if ((mh = mip->mi_bridge_link) != NULL)
671 			mac_bridge_ref_cb(mh, B_TRUE);
672 		mutex_exit(&mip->mi_bridge_lock);
673 		if (mh == NULL) {
674 			mac_rx_common((mac_handle_t)mip, mrh, mp_chain);
675 		} else {
676 			mac_bridge_rx_cb(mh, mrh, mp_chain);
677 			mac_bridge_ref_cb(mh, B_FALSE);
678 		}
679 	}
680 }
681 
682 /*
683  * Special case function: this allows snooping of packets transmitted and
684  * received by TRILL. By design, they go directly into the TRILL module.
685  */
686 void
687 mac_trill_snoop(mac_handle_t mh, mblk_t *mp)
688 {
689 	mac_impl_t *mip = (mac_impl_t *)mh;
690 
691 	if (mip->mi_promisc_list != NULL)
692 		mac_promisc_dispatch(mip, mp, NULL);
693 }
694 
695 /*
696  * This is the upward reentry point for packets arriving from the bridging
697  * module and from mac_rx for links not part of a bridge.
698  */
699 void
700 mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
701 {
702 	mac_impl_t		*mip = (mac_impl_t *)mh;
703 	mac_ring_t		*mr = (mac_ring_t *)mrh;
704 	mac_soft_ring_set_t	*mac_srs;
705 	mblk_t			*bp = mp_chain;
706 
707 	/*
708 	 * If there are any promiscuous mode callbacks defined for
709 	 * this MAC, pass them a copy if appropriate.
710 	 */
711 	if (mip->mi_promisc_list != NULL)
712 		mac_promisc_dispatch(mip, mp_chain, NULL);
713 
714 	if (mr != NULL) {
715 		/*
716 		 * If the SRS teardown has started, just return. The 'mr'
717 		 * continues to be valid until the driver unregisters the MAC.
718 		 * Hardware classified packets will not make their way up
719 		 * beyond this point once the teardown has started. The driver
720 		 * is never passed a pointer to a flow entry or SRS or any
721 		 * structure that can be freed much before mac_unregister.
722 		 */
723 		mutex_enter(&mr->mr_lock);
724 		if ((mr->mr_state != MR_INUSE) || (mr->mr_flag &
725 		    (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) {
726 			mutex_exit(&mr->mr_lock);
727 			freemsgchain(mp_chain);
728 			return;
729 		}
730 
731 		/*
732 		 * The ring is in passthru mode; pass the chain up to
733 		 * the pseudo ring.
734 		 */
735 		if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) {
736 			MR_REFHOLD_LOCKED(mr);
737 			mutex_exit(&mr->mr_lock);
738 			mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain,
739 			    B_FALSE);
740 			MR_REFRELE(mr);
741 			return;
742 		}
743 
744 		/*
745 		 * The passthru callback should only be set when in
746 		 * MAC_PASSTHRU_CLASSIFIER mode.
747 		 */
748 		ASSERT3P(mr->mr_pt_fn, ==, NULL);
749 
750 		/*
751 		 * We check if an SRS is controlling this ring.
752 		 * If so, we can directly call the srs_lower_proc
753 		 * routine otherwise we need to go through mac_rx_classify
754 		 * to reach the right place.
755 		 */
756 		if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
757 			MR_REFHOLD_LOCKED(mr);
758 			mutex_exit(&mr->mr_lock);
759 			ASSERT3P(mr->mr_srs, !=, NULL);
760 			mac_srs = mr->mr_srs;
761 
762 			/*
763 			 * This is the fast path. All packets received
764 			 * on this ring are hardware classified and
765 			 * share the same MAC header info.
766 			 */
767 			mac_srs->srs_rx.sr_lower_proc(mh,
768 			    (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE);
769 			MR_REFRELE(mr);
770 			return;
771 		}
772 
773 		mutex_exit(&mr->mr_lock);
774 		/* We'll fall through to software classification */
775 	} else {
776 		flow_entry_t *flent;
777 		int err;
778 
779 		rw_enter(&mip->mi_rw_lock, RW_READER);
780 		if (mip->mi_single_active_client != NULL) {
781 			flent = mip->mi_single_active_client->mci_flent_list;
782 			FLOW_TRY_REFHOLD(flent, err);
783 			rw_exit(&mip->mi_rw_lock);
784 			if (err == 0) {
785 				(flent->fe_cb_fn)(flent->fe_cb_arg1,
786 				    flent->fe_cb_arg2, mp_chain, B_FALSE);
787 				FLOW_REFRELE(flent);
788 				return;
789 			}
790 		} else {
791 			rw_exit(&mip->mi_rw_lock);
792 		}
793 	}
794 
795 	if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
796 		if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL)
797 			return;
798 	}
799 
800 	freemsgchain(bp);
801 }
802 
803 /* DATA TRANSMISSION */
804 
805 /*
806  * A driver's notification to resume transmission, in case of a provider
807  * without TX rings.
808  */
809 void
810 mac_tx_update(mac_handle_t mh)
811 {
812 	mac_tx_ring_update(mh, NULL);
813 }
814 
815 /*
816  * A driver's notification to resume transmission on the specified TX ring.
817  */
818 void
819 mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh)
820 {
821 	i_mac_tx_srs_notify((mac_impl_t *)mh, rh);
822 }
823 
824 /* LINK STATE */
825 /*
826  * Notify the MAC layer about a link state change
827  */
828 void
829 mac_link_update(mac_handle_t mh, link_state_t link)
830 {
831 	mac_impl_t	*mip = (mac_impl_t *)mh;
832 
833 	/*
834 	 * Save the link state.
835 	 */
836 	mip->mi_lowlinkstate = link;
837 
838 	/*
839 	 * Send a MAC_NOTE_LOWLINK notification.  This tells the notification
840 	 * thread to deliver both lower and upper notifications.
841 	 */
842 	i_mac_notify(mip, MAC_NOTE_LOWLINK);
843 }
844 
845 /*
846  * Notify the MAC layer about a link state change due to bridging.
847  */
848 void
849 mac_link_redo(mac_handle_t mh, link_state_t link)
850 {
851 	mac_impl_t	*mip = (mac_impl_t *)mh;
852 
853 	/*
854 	 * Save the link state.
855 	 */
856 	mip->mi_linkstate = link;
857 
858 	/*
859 	 * Send a MAC_NOTE_LINK notification.  Only upper notifications are
860 	 * made.
861 	 */
862 	i_mac_notify(mip, MAC_NOTE_LINK);
863 }
864 
865 /* MINOR NODE HANDLING */
866 
867 /*
868  * Given a dev_t, return the instance number (PPA) associated with it.
869  * Drivers can use this in their getinfo(9e) implementation to lookup
870  * the instance number (i.e. PPA) of the device, to use as an index to
871  * their own array of soft state structures.
872  *
873  * Returns -1 on error.
874  */
875 int
876 mac_devt_to_instance(dev_t devt)
877 {
878 	return (dld_devt_to_instance(devt));
879 }
880 
881 /*
882  * This function returns the first minor number that is available for
883  * driver private use.  All minor numbers smaller than this are
884  * reserved for GLDv3 use.
885  */
886 minor_t
887 mac_private_minor(void)
888 {
889 	return (MAC_PRIVATE_MINOR);
890 }
891 
892 /* OTHER CONTROL INFORMATION */
893 
894 /*
895  * A driver notified us that its primary MAC address has changed.
896  */
897 void
898 mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
899 {
900 	mac_impl_t	*mip = (mac_impl_t *)mh;
901 
902 	if (mip->mi_type->mt_addr_length == 0)
903 		return;
904 
905 	i_mac_perim_enter(mip);
906 
907 	/*
908 	 * If address changes, freshen the MAC address value and update
909 	 * all MAC clients that share this MAC address.
910 	 */
911 	if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) != 0) {
912 		mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr),
913 		    (uint8_t *)addr);
914 	}
915 
916 	i_mac_perim_exit(mip);
917 
918 	/*
919 	 * Send a MAC_NOTE_UNICST notification.
920 	 */
921 	i_mac_notify(mip, MAC_NOTE_UNICST);
922 }
923 
924 void
925 mac_dst_update(mac_handle_t mh, const uint8_t *addr)
926 {
927 	mac_impl_t	*mip = (mac_impl_t *)mh;
928 
929 	if (mip->mi_type->mt_addr_length == 0)
930 		return;
931 
932 	i_mac_perim_enter(mip);
933 	bcopy(addr, mip->mi_dstaddr, mip->mi_type->mt_addr_length);
934 	i_mac_perim_exit(mip);
935 	i_mac_notify(mip, MAC_NOTE_DEST);
936 }
937 
938 /*
939  * MAC plugin information changed.
940  */
941 int
942 mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
943 {
944 	mac_impl_t	*mip = (mac_impl_t *)mh;
945 
946 	/*
947 	 * Verify that the plugin supports MAC plugin data and that the
948 	 * supplied data is valid.
949 	 */
950 	if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
951 		return (EINVAL);
952 	if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
953 		return (EINVAL);
954 
955 	if (mip->mi_pdata != NULL)
956 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
957 
958 	mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
959 	bcopy(mac_pdata, mip->mi_pdata, dsize);
960 	mip->mi_pdata_size = dsize;
961 
962 	/*
963 	 * Since the MAC plugin data is used to construct MAC headers that
964 	 * were cached in fast-path headers, we need to flush fast-path
965 	 * information for links associated with this mac.
966 	 */
967 	i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
968 	return (0);
969 }
970 
971 /*
972  * Invoked by driver as well as the framework to notify its capability change.
973  */
974 void
975 mac_capab_update(mac_handle_t mh)
976 {
977 	/* Send MAC_NOTE_CAPAB_CHG notification */
978 	i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG);
979 }
980 
981 /*
982  * Used by normal drivers to update the max sdu size.
983  * We need to handle the case of a smaller mi_sdu_multicast
984  * since this is called by mac_set_mtu() even for drivers that
985  * have differing unicast and multicast mtu and we don't want to
986  * increase the multicast mtu by accident in that case.
987  */
988 int
989 mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max)
990 {
991 	mac_impl_t	*mip = (mac_impl_t *)mh;
992 
993 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
994 		return (EINVAL);
995 	mip->mi_sdu_max = sdu_max;
996 	if (mip->mi_sdu_multicast > mip->mi_sdu_max)
997 		mip->mi_sdu_multicast = mip->mi_sdu_max;
998 
999 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1000 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1001 	return (0);
1002 }
1003 
1004 /*
1005  * Version of the above function that is used by drivers that have a different
1006  * max sdu size for multicast/broadcast vs. unicast.
1007  */
1008 int
1009 mac_maxsdu_update2(mac_handle_t mh, uint_t sdu_max, uint_t sdu_multicast)
1010 {
1011 	mac_impl_t	*mip = (mac_impl_t *)mh;
1012 
1013 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
1014 		return (EINVAL);
1015 	if (sdu_multicast == 0)
1016 		sdu_multicast = sdu_max;
1017 	if (sdu_multicast > sdu_max || sdu_multicast < mip->mi_sdu_min)
1018 		return (EINVAL);
1019 	mip->mi_sdu_max = sdu_max;
1020 	mip->mi_sdu_multicast = sdu_multicast;
1021 
1022 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1023 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1024 	return (0);
1025 }
1026 
1027 static void
1028 mac_ring_intr_retarget(mac_group_t *group, mac_ring_t *ring)
1029 {
1030 	mac_client_impl_t *mcip;
1031 	flow_entry_t *flent;
1032 	mac_soft_ring_set_t *mac_rx_srs;
1033 	mac_cpus_t *srs_cpu;
1034 	int i;
1035 
1036 	if (((mcip = MAC_GROUP_ONLY_CLIENT(group)) != NULL) &&
1037 	    (!ring->mr_info.mri_intr.mi_ddi_shared)) {
1038 		/* interrupt can be re-targeted */
1039 		ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
1040 		flent = mcip->mci_flent;
1041 		if (ring->mr_type == MAC_RING_TYPE_RX) {
1042 			for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1043 				mac_rx_srs = flent->fe_rx_srs[i];
1044 				if (mac_rx_srs->srs_ring != ring)
1045 					continue;
1046 				srs_cpu = &mac_rx_srs->srs_cpu;
1047 				mutex_enter(&cpu_lock);
1048 				mac_rx_srs_retarget_intr(mac_rx_srs,
1049 				    srs_cpu->mc_rx_intr_cpu);
1050 				mutex_exit(&cpu_lock);
1051 				break;
1052 			}
1053 		} else {
1054 			if (flent->fe_tx_srs != NULL) {
1055 				mutex_enter(&cpu_lock);
1056 				mac_tx_srs_retarget_intr(
1057 				    flent->fe_tx_srs);
1058 				mutex_exit(&cpu_lock);
1059 			}
1060 		}
1061 	}
1062 }
1063 
/*
 * Clients like aggr create pseudo rings (mac_ring_t) and expose them to
 * their clients. There is a 1-1 mapping between a pseudo ring and the
 * hardware ring. ddi interrupt handles are exported from the hardware ring
 * to the pseudo ring. Thus when the interrupt handle changes, clients of
 * aggr that are using the handle need to use the new handle and
 * re-target their interrupts.
 *
 * mip is the MAC owning the hardware ring `ring', and ddh is the new
 * interrupt handle (NULL when interrupts are being reset).
 *
 * The caller's hold on mip's perimeter is dropped by this function: on
 * every return path, including the early return when the aggr can no longer
 * be found by name, mip's perimeter has been exited.
 */
static void
mac_pseudo_ring_intr_retarget(mac_impl_t *mip, mac_ring_t *ring,
    ddi_intr_handle_t ddh)
{
	mac_ring_t *pring;
	mac_group_t *pgroup;
	mac_impl_t *pmip;
	char macname[MAXNAMELEN];
	mac_perim_handle_t p_mph;
	uint64_t saved_gen_num;

again:
	/*
	 * Snapshot the pseudo ring, its group and owning MAC, together with
	 * the ring's generation number and the owner's name, while the
	 * port's perimeter is still held.
	 */
	pring = (mac_ring_t *)ring->mr_prh;
	pgroup = (mac_group_t *)pring->mr_gh;
	pmip = (mac_impl_t *)pgroup->mrg_mh;
	saved_gen_num = ring->mr_gen_num;
	(void) strlcpy(macname, pmip->mi_name, MAXNAMELEN);
	/*
	 * We need to enter aggr's perimeter. The locking hierarchy
	 * dictates that aggr's perimeter should be entered first
	 * and then the port's perimeter. So drop the port's
	 * perimeter, enter aggr's and then re-enter port's
	 * perimeter.
	 */
	i_mac_perim_exit(mip);
	/*
	 * While we know pmip is the aggr's mip, there is a
	 * possibility that aggr could have unregistered by
	 * the time we exit port's perimeter (mip) and
	 * enter aggr's perimeter (pmip). To avoid that
	 * scenario, enter aggr's perimeter using its name.
	 */
	if (mac_perim_enter_by_macname(macname, &p_mph) != 0)
		return;
	i_mac_perim_enter(mip);
	/*
	 * Check if the ring got assigned to another aggregation before
	 * we could enter aggr's and the port's perimeter. When a ring
	 * gets deleted from an aggregation, it calls mac_stop_ring()
	 * which increments the generation number. So checking
	 * generation number will be enough.
	 */
	if (ring->mr_gen_num != saved_gen_num && ring->mr_prh != NULL) {
		/*
		 * Release both perimeters, then retake only the port's so
		 * that the retry starts from the same state as the first
		 * pass.
		 */
		i_mac_perim_exit(mip);
		mac_perim_exit(p_mph);
		i_mac_perim_enter(mip);
		goto again;
	}

	/* Check if pseudo ring is still present */
	if (ring->mr_prh != NULL) {
		/* Mirror the hardware ring's interrupt state. */
		pring->mr_info.mri_intr.mi_ddi_handle = ddh;
		pring->mr_info.mri_intr.mi_ddi_shared =
		    ring->mr_info.mri_intr.mi_ddi_shared;
		/* Nothing to re-target when the handle is being reset. */
		if (ddh != NULL)
			mac_ring_intr_retarget(pgroup, pring);
	}
	i_mac_perim_exit(mip);
	mac_perim_exit(p_mph);
}
1132 /*
1133  * API called by driver to provide new interrupt handle for TX/RX rings.
1134  * This usually happens when IRM (Interrupt Resource Manangement)
1135  * framework either gives the driver more MSI-x interrupts or takes
1136  * away MSI-x interrupts from the driver.
1137  */
1138 void
1139 mac_ring_intr_set(mac_ring_handle_t mrh, ddi_intr_handle_t ddh)
1140 {
1141 	mac_ring_t	*ring = (mac_ring_t *)mrh;
1142 	mac_group_t	*group = (mac_group_t *)ring->mr_gh;
1143 	mac_impl_t	*mip = (mac_impl_t *)group->mrg_mh;
1144 
1145 	i_mac_perim_enter(mip);
1146 	ring->mr_info.mri_intr.mi_ddi_handle = ddh;
1147 	if (ddh == NULL) {
1148 		/* Interrupts being reset */
1149 		ring->mr_info.mri_intr.mi_ddi_shared = B_FALSE;
1150 		if (ring->mr_prh != NULL) {
1151 			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
1152 			return;
1153 		}
1154 	} else {
1155 		/* New interrupt handle */
1156 		mac_compare_ddi_handle(mip->mi_rx_groups,
1157 		    mip->mi_rx_group_count, ring);
1158 		if (!ring->mr_info.mri_intr.mi_ddi_shared) {
1159 			mac_compare_ddi_handle(mip->mi_tx_groups,
1160 			    mip->mi_tx_group_count, ring);
1161 		}
1162 		if (ring->mr_prh != NULL) {
1163 			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
1164 			return;
1165 		} else {
1166 			mac_ring_intr_retarget(group, ring);
1167 		}
1168 	}
1169 	i_mac_perim_exit(mip);
1170 }
1171 
1172 /* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */
1173 
1174 /*
1175  * Updates the mac_impl structure with the current state of the link
1176  */
1177 static void
1178 i_mac_log_link_state(mac_impl_t *mip)
1179 {
1180 	/*
1181 	 * If no change, then it is not interesting.
1182 	 */
1183 	if (mip->mi_lastlowlinkstate == mip->mi_lowlinkstate)
1184 		return;
1185 
1186 	switch (mip->mi_lowlinkstate) {
1187 	case LINK_STATE_UP:
1188 		if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) {
1189 			char det[200];
1190 
1191 			mip->mi_type->mt_ops.mtops_link_details(det,
1192 			    sizeof (det), (mac_handle_t)mip, mip->mi_pdata);
1193 
1194 			cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det);
1195 		} else {
1196 			cmn_err(CE_NOTE, "!%s link up", mip->mi_name);
1197 		}
1198 		break;
1199 
1200 	case LINK_STATE_DOWN:
1201 		/*
1202 		 * Only transitions from UP to DOWN are interesting
1203 		 */
1204 		if (mip->mi_lastlowlinkstate != LINK_STATE_UNKNOWN)
1205 			cmn_err(CE_NOTE, "!%s link down", mip->mi_name);
1206 		break;
1207 
1208 	case LINK_STATE_UNKNOWN:
1209 		/*
1210 		 * This case is normally not interesting.
1211 		 */
1212 		break;
1213 	}
1214 	mip->mi_lastlowlinkstate = mip->mi_lowlinkstate;
1215 }
1216 
/*
 * Main routine for the callbacks notifications thread
 *
 * arg is the mac_impl_t this thread serves.  The thread sleeps on the
 * notify callback info's condition variable until mi_notify_bits becomes
 * non-zero, then takes and clears the pending bits under the lock and, with
 * the lock dropped, runs the callbacks for each pending notification type.
 * Bit MAC_NNOTE is the request to quit: on seeing it the thread sets
 * MIS_NOTIFY_DONE, wakes any waiter and exits.
 */
static void
i_mac_notify_thread(void *arg)
{
	mac_impl_t	*mip = arg;
	callb_cpr_t	cprinfo;
	mac_cb_t	*mcb;
	mac_cb_info_t	*mcbi;
	mac_notify_cb_t	*mncb;

	mcbi = &mip->mi_notify_cb_info;
	CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr,
	    "i_mac_notify_thread");

	mutex_enter(mcbi->mcbi_lockp);

	for (;;) {
		uint32_t	bits;
		uint32_t	type;

		/* Nothing pending: sleep until someone posts a bit. */
		bits = mip->mi_notify_bits;
		if (bits == 0) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
			CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp);
			continue;
		}
		/* Consume the pending set while still holding the lock. */
		mip->mi_notify_bits = 0;
		if ((bits & (1 << MAC_NNOTE)) != 0) {
			/* request to quit */
			ASSERT(mip->mi_state_flags & MIS_DISABLED);
			break;
		}

		/* Callbacks are run without the notification lock held. */
		mutex_exit(mcbi->mcbi_lockp);

		/*
		 * Log link changes on the actual link, but then do reports on
		 * synthetic state (if part of a bridge).
		 */
		if ((bits & (1 << MAC_NOTE_LOWLINK)) != 0) {
			link_state_t newstate;
			mac_handle_t mh;

			i_mac_log_link_state(mip);
			newstate = mip->mi_lowlinkstate;
			if (mip->mi_bridge_link != NULL) {
				/*
				 * Re-check under mi_bridge_lock; the first
				 * test was made without it.
				 */
				mutex_enter(&mip->mi_bridge_lock);
				if ((mh = mip->mi_bridge_link) != NULL) {
					newstate = mac_bridge_ls_cb(mh,
					    newstate);
				}
				mutex_exit(&mip->mi_bridge_lock);
			}
			/*
			 * A change in the reported state is delivered as a
			 * MAC_NOTE_LINK notification in this same pass.
			 */
			if (newstate != mip->mi_linkstate) {
				mip->mi_linkstate = newstate;
				bits |= 1 << MAC_NOTE_LINK;
			}
		}

		/*
		 * Do notification callbacks for each notification type.
		 */
		for (type = 0; type < MAC_NNOTE; type++) {
			if ((bits & (1 << type)) == 0) {
				continue;
			}

			/* Framework handler for this type, if any. */
			if (mac_notify_cb_list[type] != NULL)
				(*mac_notify_cb_list[type])(mip);

			/*
			 * Walk the list of notifications.
			 */
			MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info);
			for (mcb = mip->mi_notify_cb_list; mcb != NULL;
			    mcb = mcb->mcb_nextp) {
				mncb = (mac_notify_cb_t *)mcb->mcb_objp;
				mncb->mncb_fn(mncb->mncb_arg, type);
			}
			MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info,
			    &mip->mi_notify_cb_list);
		}

		/* Retake the lock before looking at mi_notify_bits again. */
		mutex_enter(mcbi->mcbi_lockp);
	}

	/* Tell i_mac_notify_exit() that this thread is finished. */
	mip->mi_state_flags |= MIS_NOTIFY_DONE;
	cv_broadcast(&mcbi->mcbi_cv);

	/* CALLB_CPR_EXIT drops the lock */
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}
1313 
1314 /*
1315  * Signal the i_mac_notify_thread asking it to quit.
1316  * Then wait till it is done.
1317  */
1318 void
1319 i_mac_notify_exit(mac_impl_t *mip)
1320 {
1321 	mac_cb_info_t	*mcbi;
1322 
1323 	mcbi = &mip->mi_notify_cb_info;
1324 
1325 	mutex_enter(mcbi->mcbi_lockp);
1326 	mip->mi_notify_bits = (1 << MAC_NNOTE);
1327 	cv_broadcast(&mcbi->mcbi_cv);
1328 
1329 
1330 	while ((mip->mi_notify_thread != NULL) &&
1331 	    !(mip->mi_state_flags & MIS_NOTIFY_DONE)) {
1332 		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
1333 	}
1334 
1335 	/* Necessary clean up before doing kmem_cache_free */
1336 	mip->mi_state_flags &= ~MIS_NOTIFY_DONE;
1337 	mip->mi_notify_bits = 0;
1338 	mip->mi_notify_thread = NULL;
1339 	mutex_exit(mcbi->mcbi_lockp);
1340 }
1341 
1342 /*
1343  * Entry point invoked by drivers to dynamically add a ring to an
1344  * existing group.
1345  */
1346 int
1347 mac_group_add_ring(mac_group_handle_t gh, int index)
1348 {
1349 	mac_group_t *group = (mac_group_t *)gh;
1350 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1351 	int ret;
1352 
1353 	i_mac_perim_enter(mip);
1354 	ret = i_mac_group_add_ring(group, NULL, index);
1355 	i_mac_perim_exit(mip);
1356 	return (ret);
1357 }
1358 
1359 /*
1360  * Entry point invoked by drivers to dynamically remove a ring
1361  * from an existing group. The specified ring handle must no longer
1362  * be used by the driver after a call to this function.
1363  */
1364 void
1365 mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh)
1366 {
1367 	mac_group_t *group = (mac_group_t *)gh;
1368 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1369 
1370 	i_mac_perim_enter(mip);
1371 	i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE);
1372 	i_mac_perim_exit(mip);
1373 }
1374 
1375 /*
1376  * mac_prop_info_*() callbacks called from the driver's prefix_propinfo()
1377  * entry points.
1378  */
1379 
1380 void
1381 mac_prop_info_set_default_uint8(mac_prop_info_handle_t ph, uint8_t val)
1382 {
1383 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1384 
1385 	/* nothing to do if the caller doesn't want the default value */
1386 	if (pr->pr_default == NULL)
1387 		return;
1388 
1389 	ASSERT(pr->pr_default_size >= sizeof (uint8_t));
1390 
1391 	*(uint8_t *)(pr->pr_default) = val;
1392 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1393 }
1394 
1395 void
1396 mac_prop_info_set_default_uint64(mac_prop_info_handle_t ph, uint64_t val)
1397 {
1398 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1399 
1400 	/* nothing to do if the caller doesn't want the default value */
1401 	if (pr->pr_default == NULL)
1402 		return;
1403 
1404 	ASSERT(pr->pr_default_size >= sizeof (uint64_t));
1405 
1406 	bcopy(&val, pr->pr_default, sizeof (val));
1407 
1408 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1409 }
1410 
1411 void
1412 mac_prop_info_set_default_uint32(mac_prop_info_handle_t ph, uint32_t val)
1413 {
1414 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1415 
1416 	/* nothing to do if the caller doesn't want the default value */
1417 	if (pr->pr_default == NULL)
1418 		return;
1419 
1420 	ASSERT(pr->pr_default_size >= sizeof (uint32_t));
1421 
1422 	bcopy(&val, pr->pr_default, sizeof (val));
1423 
1424 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1425 }
1426 
1427 void
1428 mac_prop_info_set_default_str(mac_prop_info_handle_t ph, const char *str)
1429 {
1430 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1431 
1432 	/* nothing to do if the caller doesn't want the default value */
1433 	if (pr->pr_default == NULL)
1434 		return;
1435 
1436 	if (strlen(str) >= pr->pr_default_size)
1437 		pr->pr_errno = ENOBUFS;
1438 	else
1439 		(void) strlcpy(pr->pr_default, str, pr->pr_default_size);
1440 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1441 }
1442 
1443 void
1444 mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph,
1445     link_flowctrl_t val)
1446 {
1447 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1448 
1449 	/* nothing to do if the caller doesn't want the default value */
1450 	if (pr->pr_default == NULL)
1451 		return;
1452 
1453 	ASSERT(pr->pr_default_size >= sizeof (link_flowctrl_t));
1454 
1455 	bcopy(&val, pr->pr_default, sizeof (val));
1456 
1457 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1458 }
1459 
1460 void
1461 mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph, uint32_t min,
1462     uint32_t max)
1463 {
1464 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1465 	mac_propval_range_t *range = pr->pr_range;
1466 	mac_propval_uint32_range_t *range32;
1467 
1468 	/* nothing to do if the caller doesn't want the range info */
1469 	if (range == NULL)
1470 		return;
1471 
1472 	if (pr->pr_range_cur_count++ == 0) {
1473 		/* first range */
1474 		pr->pr_flags |= MAC_PROP_INFO_RANGE;
1475 		range->mpr_type = MAC_PROPVAL_UINT32;
1476 	} else {
1477 		/* all ranges of a property should be of the same type */
1478 		ASSERT(range->mpr_type == MAC_PROPVAL_UINT32);
1479 		if (pr->pr_range_cur_count > range->mpr_count) {
1480 			pr->pr_errno = ENOSPC;
1481 			return;
1482 		}
1483 	}
1484 
1485 	range32 = range->mpr_range_uint32;
1486 	range32[pr->pr_range_cur_count - 1].mpur_min = min;
1487 	range32[pr->pr_range_cur_count - 1].mpur_max = max;
1488 }
1489 
1490 void
1491 mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm)
1492 {
1493 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1494 
1495 	pr->pr_perm = perm;
1496 	pr->pr_flags |= MAC_PROP_INFO_PERM;
1497 }
1498 
1499 void
1500 mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff,
1501     uint32_t *end, uint32_t *value, uint32_t *flags_ptr)
1502 {
1503 	uint32_t flags;
1504 
1505 	ASSERT(DB_TYPE(mp) == M_DATA);
1506 
1507 	flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS;
1508 	if ((flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) != 0) {
1509 		if (value != NULL)
1510 			*value = (uint32_t)DB_CKSUM16(mp);
1511 		if ((flags & HCK_PARTIALCKSUM) != 0) {
1512 			if (start != NULL)
1513 				*start = (uint32_t)DB_CKSUMSTART(mp);
1514 			if (stuff != NULL)
1515 				*stuff = (uint32_t)DB_CKSUMSTUFF(mp);
1516 			if (end != NULL)
1517 				*end = (uint32_t)DB_CKSUMEND(mp);
1518 		}
1519 	}
1520 
1521 	if (flags_ptr != NULL)
1522 		*flags_ptr = flags;
1523 }
1524 
/*
 * Attach hardware checksum information to an M_DATA mblk.  All five
 * fields are stored unconditionally; `flags' and `value' are narrowed to
 * 16 bits by the casts below.
 */
void
mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end,
    uint32_t value, uint32_t flags)
{
	ASSERT(DB_TYPE(mp) == M_DATA);

	DB_CKSUMSTART(mp) = (intptr_t)start;
	DB_CKSUMSTUFF(mp) = (intptr_t)stuff;
	DB_CKSUMEND(mp) = (intptr_t)end;
	DB_CKSUMFLAGS(mp) = (uint16_t)flags;
	DB_CKSUM16(mp) = (uint16_t)value;
}
1537 
/*
 * Copy the hardware checksum information from one M_DATA mblk to another.
 * Only the flag bits covered by HCK_FLAGS are carried over; the start,
 * stuff, end and 16-bit checksum fields are copied as they are.
 */
void
mac_hcksum_clone(const mblk_t *src, mblk_t *dst)
{
	ASSERT3U(DB_TYPE(src), ==, M_DATA);
	ASSERT3U(DB_TYPE(dst), ==, M_DATA);

	/*
	 * Do these assignments unconditionally, rather than only when flags is
	 * non-zero.  This guards against the case where the source carries
	 * zeroed hcksum data but the destination mblk_t would otherwise keep
	 * stale values in those fields.
	 */
	DB_CKSUMFLAGS(dst) = (DB_CKSUMFLAGS(src) & HCK_FLAGS);
	DB_CKSUMSTART(dst) = DB_CKSUMSTART(src);
	DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src);
	DB_CKSUMEND(dst) = DB_CKSUMEND(src);
	DB_CKSUM16(dst) = DB_CKSUM16(src);
}
1555 
1556 void
1557 mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
1558 {
1559 	ASSERT(DB_TYPE(mp) == M_DATA);
1560 
1561 	if (flags != NULL) {
1562 		*flags = DB_CKSUMFLAGS(mp) & HW_LSO;
1563 		if ((*flags != 0) && (mss != NULL))
1564 			*mss = (uint32_t)DB_LSOMSS(mp);
1565 	}
1566 }
1567 
/*
 * Record in the transceiver information structure whether a transceiver
 * is present.
 */
void
mac_transceiver_info_set_present(mac_transceiver_info_t *infop,
    boolean_t present)
{
	infop->mti_present = present;
}
1574 
/*
 * Record in the transceiver information structure whether the transceiver
 * is usable.
 */
void
mac_transceiver_info_set_usable(mac_transceiver_info_t *infop,
    boolean_t usable)
{
	infop->mti_usable = usable;
}
1581 
1582 /*
1583  * We should really keep track of our offset and not walk everything every
1584  * time. I can't imagine that this will be kind to us at high packet rates;
1585  * however, for the moment, let's leave that.
1586  *
1587  * This walks a message block chain without pulling up to fill in the context
1588  * information. Note that the data we care about could be hidden across more
1589  * than one mblk_t.
1590  */
1591 static int
1592 mac_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out)
1593 {
1594 	size_t mpsize;
1595 	uint8_t *bp;
1596 
1597 	mpsize = msgsize(mp);
1598 	/* Check for overflow */
1599 	if (off + sizeof (uint16_t) > mpsize)
1600 		return (-1);
1601 
1602 	mpsize = MBLKL(mp);
1603 	while (off >= mpsize) {
1604 		mp = mp->b_cont;
1605 		off -= mpsize;
1606 		mpsize = MBLKL(mp);
1607 	}
1608 
1609 	bp = mp->b_rptr + off;
1610 	*out = *bp;
1611 	return (0);
1612 
1613 }
1614 
1615 static int
1616 mac_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out)
1617 {
1618 	size_t mpsize;
1619 	uint8_t *bp;
1620 
1621 	mpsize = msgsize(mp);
1622 	/* Check for overflow */
1623 	if (off + sizeof (uint16_t) > mpsize)
1624 		return (-1);
1625 
1626 	mpsize = MBLKL(mp);
1627 	while (off >= mpsize) {
1628 		mp = mp->b_cont;
1629 		off -= mpsize;
1630 		mpsize = MBLKL(mp);
1631 	}
1632 
1633 	/*
1634 	 * Data is in network order. Note the second byte of data might be in
1635 	 * the next mp.
1636 	 */
1637 	bp = mp->b_rptr + off;
1638 	*out = *bp << 8;
1639 	if (off + 1 == mpsize) {
1640 		mp = mp->b_cont;
1641 		bp = mp->b_rptr;
1642 	} else {
1643 		bp++;
1644 	}
1645 
1646 	*out |= *bp;
1647 	return (0);
1648 
1649 }
1650 
1651 
1652 int
1653 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
1654 {
1655 	size_t off;
1656 	uint16_t ether;
1657 	uint8_t ipproto, iplen, l4len, maclen;
1658 
1659 	bzero(meoi, sizeof (mac_ether_offload_info_t));
1660 
1661 	meoi->meoi_len = msgsize(mp);
1662 	off = offsetof(struct ether_header, ether_type);
1663 	if (mac_meoi_get_uint16(mp, off, &ether) != 0)
1664 		return (-1);
1665 
1666 	if (ether == ETHERTYPE_VLAN) {
1667 		off = offsetof(struct ether_vlan_header, ether_type);
1668 		if (mac_meoi_get_uint16(mp, off, &ether) != 0)
1669 			return (-1);
1670 		meoi->meoi_flags |= MEOI_VLAN_TAGGED;
1671 		maclen = sizeof (struct ether_vlan_header);
1672 	} else {
1673 		maclen = sizeof (struct ether_header);
1674 	}
1675 	meoi->meoi_flags |= MEOI_L2INFO_SET;
1676 	meoi->meoi_l2hlen = maclen;
1677 	meoi->meoi_l3proto = ether;
1678 
1679 	switch (ether) {
1680 	case ETHERTYPE_IP:
1681 		/*
1682 		 * For IPv4 we need to get the length of the header, as it can
1683 		 * be variable.
1684 		 */
1685 		off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen;
1686 		if (mac_meoi_get_uint8(mp, off, &iplen) != 0)
1687 			return (-1);
1688 		iplen &= 0x0f;
1689 		if (iplen < 5 || iplen > 0x0f)
1690 			return (-1);
1691 		iplen *= 4;
1692 		off = offsetof(ipha_t, ipha_protocol) + maclen;
1693 		if (mac_meoi_get_uint8(mp, off, &ipproto) == -1)
1694 			return (-1);
1695 		break;
1696 	case ETHERTYPE_IPV6:
1697 		iplen = 40;
1698 		off = offsetof(ip6_t, ip6_nxt) + maclen;
1699 		if (mac_meoi_get_uint8(mp, off, &ipproto) == -1)
1700 			return (-1);
1701 		break;
1702 	default:
1703 		return (0);
1704 	}
1705 	meoi->meoi_l3hlen = iplen;
1706 	meoi->meoi_l4proto = ipproto;
1707 	meoi->meoi_flags |= MEOI_L3INFO_SET;
1708 
1709 	switch (ipproto) {
1710 	case IPPROTO_TCP:
1711 		off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen;
1712 		if (mac_meoi_get_uint8(mp, off, &l4len) == -1)
1713 			return (-1);
1714 		l4len = (l4len & 0xf0) >> 4;
1715 		if (l4len < 5 || l4len > 0xf)
1716 			return (-1);
1717 		l4len *= 4;
1718 		break;
1719 	case IPPROTO_UDP:
1720 		l4len = sizeof (struct udphdr);
1721 		break;
1722 	case IPPROTO_SCTP:
1723 		l4len = sizeof (sctp_hdr_t);
1724 		break;
1725 	default:
1726 		return (0);
1727 	}
1728 
1729 	meoi->meoi_l4hlen = l4len;
1730 	meoi->meoi_flags |= MEOI_L4INFO_SET;
1731 	return (0);
1732 }
1733