xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_grp.c (revision 84de666edc7f7d835057ae4807a387447c086bcf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2018 Joyent, Inc.
24  */
25 
/*
 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
 *
 * An instance of the structure aggr_grp_t is allocated for each
 * link aggregation group. When created, aggr_grp_t objects are
 * entered into the aggr_grp_hash hash table maintained by the modhash
 * module. The hash key is the linkid associated with the link
 * aggregation group.
 *
 * A set of MAC ports is associated with each link aggregation group.
 *
 * Aggr pseudo TX rings
 * --------------------
 * The underlying ports (NICs) in an aggregation can have TX rings. To
 * enhance aggr's performance, these TX rings are made available to the
 * aggr layer as pseudo TX rings. The concept of pseudo rings is not new.
 * It is already present and implemented on the RX side, where they are
 * called pseudo RX rings. The same concept is extended to the TX side where
 * each TX ring of an underlying port is reflected in aggr as a pseudo
 * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
 * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
 * TX ring is given to the aggregation layer.
 *
 * With this change, the outgoing stack depth looks much better:
 *
 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
 *
 * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
 *
 * In SRS_TX_AGGR mode, the mac_tx_aggr_mode() routine is called. This routine
 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
 * ring belonging to a port on which the packet has to be sent.
 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
 * policy and then uses the fanout_hint passed to it to pick a TX ring from
 * the selected port.
 *
 * In SRS_TX_BW_AGGR mode, the mac_tx_bw_mode() function is called, where
 * the bandwidth limit is applied first to the outgoing packets; the packets
 * that are allowed to go out then call mac_tx_aggr_mode() to be sent on a
 * particular TX ring.
 */
69 
70 #include <sys/types.h>
71 #include <sys/sysmacros.h>
72 #include <sys/conf.h>
73 #include <sys/cmn_err.h>
74 #include <sys/disp.h>
75 #include <sys/list.h>
76 #include <sys/ksynch.h>
77 #include <sys/kmem.h>
78 #include <sys/stream.h>
79 #include <sys/modctl.h>
80 #include <sys/ddi.h>
81 #include <sys/sunddi.h>
82 #include <sys/atomic.h>
83 #include <sys/stat.h>
84 #include <sys/modhash.h>
85 #include <sys/id_space.h>
86 #include <sys/strsun.h>
87 #include <sys/cred.h>
88 #include <sys/dlpi.h>
89 #include <sys/zone.h>
90 #include <sys/mac_provider.h>
91 #include <sys/dls.h>
92 #include <sys/vlan.h>
93 #include <sys/aggr.h>
94 #include <sys/aggr_impl.h>
95 
96 static int aggr_m_start(void *);
97 static void aggr_m_stop(void *);
98 static int aggr_m_promisc(void *, boolean_t);
99 static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
100 static int aggr_m_unicst(void *, const uint8_t *);
101 static int aggr_m_stat(void *, uint_t, uint64_t *);
102 static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
103 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
104 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
105     const void *);
106 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
107     mac_prop_info_handle_t);
108 
109 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
110 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
111     boolean_t *);
112 
113 static void aggr_grp_capab_set(aggr_grp_t *);
114 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
115 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
116 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
117 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
118 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
119 
120 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
121 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
122 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
123 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
124 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
125 static int aggr_addmac(void *, const uint8_t *);
126 static int aggr_remmac(void *, const uint8_t *);
127 static int aggr_addvlan(mac_group_driver_t, uint16_t);
128 static int aggr_remvlan(mac_group_driver_t, uint16_t);
129 static mblk_t *aggr_rx_poll(void *, int);
130 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
131     const int, mac_ring_info_t *, mac_ring_handle_t);
132 static void aggr_fill_group(void *, mac_ring_type_t, const int,
133     mac_group_info_t *, mac_group_handle_t);
134 
135 static kmem_cache_t	*aggr_grp_cache;
136 static mod_hash_t	*aggr_grp_hash;
137 static krwlock_t	aggr_grp_lock;
138 static uint_t		aggr_grp_cnt;
139 static id_space_t	*key_ids;
140 
/* Number of buckets requested for aggr_grp_hash. */
#define	GRP_HASHSZ		64

/*
 * Convert a linkid into a modhash key. The argument is parenthesized so
 * that the cast applies to the whole expression handed to the macro and
 * not merely to its first operand.
 */
#define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)(linkid))

/* Delimiter character used in aggregation port names. */
#define	AGGR_PORT_NAME_DELIMIT	'-'
144 
145 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
146 
147 #define	AGGR_M_CALLBACK_FLAGS	\
148 	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
149 
150 static mac_callbacks_t aggr_m_callbacks = {
151 	AGGR_M_CALLBACK_FLAGS,
152 	aggr_m_stat,
153 	aggr_m_start,
154 	aggr_m_stop,
155 	aggr_m_promisc,
156 	aggr_m_multicst,
157 	NULL,
158 	NULL,
159 	NULL,
160 	aggr_m_ioctl,
161 	aggr_m_capab_get,
162 	NULL,
163 	NULL,
164 	aggr_m_setprop,
165 	NULL,
166 	aggr_m_propinfo
167 };
168 
169 /*ARGSUSED*/
170 static int
171 aggr_grp_constructor(void *buf, void *arg, int kmflag)
172 {
173 	aggr_grp_t *grp = buf;
174 
175 	bzero(grp, sizeof (*grp));
176 	mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
177 	cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
178 	rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
179 	mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
180 	cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
181 	mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
182 	cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
183 	grp->lg_link_state = LINK_STATE_UNKNOWN;
184 	return (0);
185 }
186 
187 /*ARGSUSED*/
188 static void
189 aggr_grp_destructor(void *buf, void *arg)
190 {
191 	aggr_grp_t *grp = buf;
192 
193 	if (grp->lg_tx_ports != NULL) {
194 		kmem_free(grp->lg_tx_ports,
195 		    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
196 	}
197 
198 	mutex_destroy(&grp->lg_lacp_lock);
199 	cv_destroy(&grp->lg_lacp_cv);
200 	mutex_destroy(&grp->lg_port_lock);
201 	cv_destroy(&grp->lg_port_cv);
202 	rw_destroy(&grp->lg_tx_lock);
203 	mutex_destroy(&grp->lg_tx_flowctl_lock);
204 	cv_destroy(&grp->lg_tx_flowctl_cv);
205 }
206 
207 void
208 aggr_grp_init(void)
209 {
210 	aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
211 	    sizeof (aggr_grp_t), 0, aggr_grp_constructor,
212 	    aggr_grp_destructor, NULL, NULL, NULL, 0);
213 
214 	aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
215 	    GRP_HASHSZ, mod_hash_null_valdtor);
216 	rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
217 	aggr_grp_cnt = 0;
218 
219 	/*
220 	 * Allocate an id space to manage key values (when key is not
221 	 * specified). The range of the id space will be from
222 	 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
223 	 * uses a 16-bit key.
224 	 */
225 	key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
226 	ASSERT(key_ids != NULL);
227 }
228 
229 void
230 aggr_grp_fini(void)
231 {
232 	id_space_destroy(key_ids);
233 	rw_destroy(&aggr_grp_lock);
234 	mod_hash_destroy_idhash(aggr_grp_hash);
235 	kmem_cache_destroy(aggr_grp_cache);
236 }
237 
238 uint_t
239 aggr_grp_count(void)
240 {
241 	uint_t	count;
242 
243 	rw_enter(&aggr_grp_lock, RW_READER);
244 	count = aggr_grp_cnt;
245 	rw_exit(&aggr_grp_lock);
246 	return (count);
247 }
248 
249 /*
250  * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
251  * requires the mac perimeter, this function holds a reference of the aggr
252  * and aggr won't call mac_unregister() until this reference drops to 0.
253  */
254 void
255 aggr_grp_port_hold(aggr_port_t *port)
256 {
257 	aggr_grp_t	*grp = port->lp_grp;
258 
259 	AGGR_PORT_REFHOLD(port);
260 	mutex_enter(&grp->lg_port_lock);
261 	grp->lg_port_ref++;
262 	mutex_exit(&grp->lg_port_lock);
263 }
264 
265 /*
266  * Release the reference of the grp and inform aggr_grp_delete() calling
267  * mac_unregister() is now safe.
268  */
269 void
270 aggr_grp_port_rele(aggr_port_t *port)
271 {
272 	aggr_grp_t	*grp = port->lp_grp;
273 
274 	mutex_enter(&grp->lg_port_lock);
275 	if (--grp->lg_port_ref == 0)
276 		cv_signal(&grp->lg_port_cv);
277 	mutex_exit(&grp->lg_port_lock);
278 	AGGR_PORT_REFRELE(port);
279 }
280 
281 /*
282  * Wait for the port's lacp timer thread and the port's notification callback
283  * to exit.
284  */
285 void
286 aggr_grp_port_wait(aggr_grp_t *grp)
287 {
288 	mutex_enter(&grp->lg_port_lock);
289 	if (grp->lg_port_ref != 0)
290 		cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
291 	mutex_exit(&grp->lg_port_lock);
292 }
293 
294 /*
295  * Attach a port to a link aggregation group.
296  *
297  * A port is attached to a link aggregation group once its speed
298  * and link state have been verified.
299  *
300  * Returns B_TRUE if the group link state or speed has changed. If
301  * it's the case, the caller must notify the MAC layer via a call
302  * to mac_link().
303  */
304 boolean_t
305 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
306 {
307 	boolean_t link_state_changed = B_FALSE;
308 
309 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
310 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
311 
312 	if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
313 		return (B_FALSE);
314 
315 	/*
316 	 * Validate the MAC port link speed and update the group
317 	 * link speed if needed.
318 	 */
319 	if (port->lp_ifspeed == 0 ||
320 	    port->lp_link_state != LINK_STATE_UP ||
321 	    port->lp_link_duplex != LINK_DUPLEX_FULL) {
322 		/*
323 		 * Can't attach a MAC port with unknown link speed,
324 		 * down link, or not in full duplex mode.
325 		 */
326 		return (B_FALSE);
327 	}
328 
329 	mutex_enter(&grp->lg_stat_lock);
330 	if (grp->lg_ifspeed == 0) {
331 		/*
332 		 * The group inherits the speed of the first link being
333 		 * attached.
334 		 */
335 		grp->lg_ifspeed = port->lp_ifspeed;
336 		link_state_changed = B_TRUE;
337 	} else if (grp->lg_ifspeed != port->lp_ifspeed) {
338 		/*
339 		 * The link speed of the MAC port must be the same as
340 		 * the group link speed, as per 802.3ad. Since it is
341 		 * not, the attach is cancelled.
342 		 */
343 		mutex_exit(&grp->lg_stat_lock);
344 		return (B_FALSE);
345 	}
346 	mutex_exit(&grp->lg_stat_lock);
347 
348 	grp->lg_nattached_ports++;
349 
350 	/*
351 	 * Update the group link state.
352 	 */
353 	if (grp->lg_link_state != LINK_STATE_UP) {
354 		grp->lg_link_state = LINK_STATE_UP;
355 		mutex_enter(&grp->lg_stat_lock);
356 		grp->lg_link_duplex = LINK_DUPLEX_FULL;
357 		mutex_exit(&grp->lg_stat_lock);
358 		link_state_changed = B_TRUE;
359 	}
360 
361 	/*
362 	 * Update port's state.
363 	 */
364 	port->lp_state = AGGR_PORT_STATE_ATTACHED;
365 
366 	aggr_grp_multicst_port(port, B_TRUE);
367 
368 	/*
369 	 * Set port's receive callback
370 	 */
371 	mac_rx_set(port->lp_mch, aggr_recv_cb, port);
372 
373 	/*
374 	 * If LACP is OFF, the port can be used to send data as soon
375 	 * as its link is up and verified to be compatible with the
376 	 * aggregation.
377 	 *
378 	 * If LACP is active or passive, notify the LACP subsystem, which
379 	 * will enable sending on the port following the LACP protocol.
380 	 */
381 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
382 		aggr_send_port_enable(port);
383 	else
384 		aggr_lacp_port_attached(port);
385 
386 	return (link_state_changed);
387 }
388 
389 boolean_t
390 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
391 {
392 	boolean_t link_state_changed = B_FALSE;
393 
394 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
395 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
396 
397 	/* update state */
398 	if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
399 		return (B_FALSE);
400 
401 	mac_rx_clear(port->lp_mch);
402 
403 	aggr_grp_multicst_port(port, B_FALSE);
404 
405 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
406 		aggr_send_port_disable(port);
407 	else
408 		aggr_lacp_port_detached(port);
409 
410 	port->lp_state = AGGR_PORT_STATE_STANDBY;
411 
412 	grp->lg_nattached_ports--;
413 	if (grp->lg_nattached_ports == 0) {
414 		/* the last attached MAC port of the group is being detached */
415 		grp->lg_link_state = LINK_STATE_DOWN;
416 		mutex_enter(&grp->lg_stat_lock);
417 		grp->lg_ifspeed = 0;
418 		grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
419 		mutex_exit(&grp->lg_stat_lock);
420 		link_state_changed = B_TRUE;
421 	}
422 
423 	return (link_state_changed);
424 }
425 
426 /*
427  * Update the MAC addresses of the constituent ports of the specified
428  * group. This function is invoked:
429  * - after creating a new aggregation group.
430  * - after adding new ports to an aggregation group.
431  * - after removing a port from a group when the MAC address of
432  *   that port was used for the MAC address of the group.
433  * - after the MAC address of a port changed when the MAC address
434  *   of that port was used for the MAC address of the group.
435  *
436  * Return true if the link state of the aggregation changed, for example
437  * as a result of a failure changing the MAC address of one of the
438  * constituent ports.
439  */
440 boolean_t
441 aggr_grp_update_ports_mac(aggr_grp_t *grp)
442 {
443 	aggr_port_t *cport;
444 	boolean_t link_state_changed = B_FALSE;
445 	mac_perim_handle_t mph;
446 
447 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
448 
449 	for (cport = grp->lg_ports; cport != NULL;
450 	    cport = cport->lp_next) {
451 		mac_perim_enter_by_mh(cport->lp_mh, &mph);
452 		if (aggr_port_unicst(cport) != 0) {
453 			if (aggr_grp_detach_port(grp, cport))
454 				link_state_changed = B_TRUE;
455 		} else {
456 			/*
457 			 * If a port was detached because of a previous
458 			 * failure changing the MAC address, the port is
459 			 * reattached when it successfully changes the MAC
460 			 * address now, and this might cause the link state
461 			 * of the aggregation to change.
462 			 */
463 			if (aggr_grp_attach_port(grp, cport))
464 				link_state_changed = B_TRUE;
465 		}
466 		mac_perim_exit(mph);
467 	}
468 	return (link_state_changed);
469 }
470 
471 /*
472  * Invoked when the MAC address of a port has changed. If the port's
473  * MAC address was used for the group MAC address, set mac_addr_changedp
474  * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
475  * notification. If the link state changes due to detach/attach of
476  * the constituent port, set link_state_changedp to B_TRUE to indicate
477  * to the caller that it should send a MAC_NOTE_LINK notification. In both
478  * cases, it is the responsibility of the caller to invoke notification
479  * functions after releasing the the port lock.
480  */
481 void
482 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
483     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
484 {
485 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
486 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
487 	ASSERT(mac_addr_changedp != NULL);
488 	ASSERT(link_state_changedp != NULL);
489 
490 	*mac_addr_changedp = B_FALSE;
491 	*link_state_changedp = B_FALSE;
492 
493 	if (grp->lg_addr_fixed) {
494 		/*
495 		 * The group is using a fixed MAC address or an automatic
496 		 * MAC address has not been set.
497 		 */
498 		return;
499 	}
500 
501 	if (grp->lg_mac_addr_port == port) {
502 		/*
503 		 * The MAC address of the port was assigned to the group
504 		 * MAC address. Update the group MAC address.
505 		 */
506 		bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
507 		*mac_addr_changedp = B_TRUE;
508 	} else {
509 		/*
510 		 * Update the actual port MAC address to the MAC address
511 		 * of the group.
512 		 */
513 		if (aggr_port_unicst(port) != 0) {
514 			*link_state_changedp = aggr_grp_detach_port(grp, port);
515 		} else {
516 			/*
517 			 * If a port was detached because of a previous
518 			 * failure changing the MAC address, the port is
519 			 * reattached when it successfully changes the MAC
520 			 * address now, and this might cause the link state
521 			 * of the aggregation to change.
522 			 */
523 			*link_state_changedp = aggr_grp_attach_port(grp, port);
524 		}
525 	}
526 }
527 
528 /*
529  * Add a port to a link aggregation group.
530  */
531 static int
532 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
533     aggr_port_t **pp)
534 {
535 	aggr_port_t *port, **cport;
536 	mac_perim_handle_t mph;
537 	zoneid_t port_zoneid = ALL_ZONES;
538 	int err;
539 
540 	/* The port must be int the same zone as the aggregation. */
541 	if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
542 		port_zoneid = GLOBAL_ZONEID;
543 	if (grp->lg_zoneid != port_zoneid)
544 		return (EBUSY);
545 
546 	/*
547 	 * lg_mh could be NULL when the function is called during the creation
548 	 * of the aggregation.
549 	 */
550 	ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
551 
552 	/* create new port */
553 	err = aggr_port_create(grp, port_linkid, force, &port);
554 	if (err != 0)
555 		return (err);
556 
557 	mac_perim_enter_by_mh(port->lp_mh, &mph);
558 
559 	/* add port to list of group constituent ports */
560 	cport = &grp->lg_ports;
561 	while (*cport != NULL)
562 		cport = &((*cport)->lp_next);
563 	*cport = port;
564 
565 	/*
566 	 * Back reference to the group it is member of. A port always
567 	 * holds a reference to its group to ensure that the back
568 	 * reference is always valid.
569 	 */
570 	port->lp_grp = grp;
571 	AGGR_GRP_REFHOLD(grp);
572 	grp->lg_nports++;
573 
574 	aggr_lacp_init_port(port);
575 	mac_perim_exit(mph);
576 
577 	if (pp != NULL)
578 		*pp = port;
579 
580 	return (0);
581 }
582 
583 /*
584  * This is called in response to either our LACP state machine or a MAC
585  * notification that the link has gone down via aggr_send_port_disable(). At
586  * this point, we may need to update our default ring. To that end, we go
587  * through the set of ports (underlying datalinks in an aggregation) that are
588  * currently enabled to transmit data. If all our links have been disabled for
589  * transmit, then we don't do anything.
590  *
591  * Note, because we only have a single TX group, we don't have to worry about
592  * the rings moving between groups and the chance that mac will reassign it
593  * unless someone removes a port, at which point, we play it safe and call this
594  * again.
595  */
596 void
597 aggr_grp_update_default(aggr_grp_t *grp)
598 {
599 	aggr_port_t *port;
600 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
601 
602 	rw_enter(&grp->lg_tx_lock, RW_WRITER);
603 
604 	if (grp->lg_ntx_ports == 0) {
605 		rw_exit(&grp->lg_tx_lock);
606 		return;
607 	}
608 
609 	port = grp->lg_tx_ports[0];
610 	ASSERT(port->lp_tx_ring_cnt > 0);
611 	mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
612 	rw_exit(&grp->lg_tx_lock);
613 }
614 
615 /*
616  * Add a pseudo RX ring for the given HW ring handle.
617  */
618 static int
619 aggr_add_pseudo_rx_ring(aggr_port_t *port,
620     aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
621 {
622 	aggr_pseudo_rx_ring_t	*ring;
623 	int			err;
624 	int			j;
625 
626 	for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
627 		ring = rx_grp->arg_rings + j;
628 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
629 			break;
630 	}
631 
632 	/*
633 	 * No slot for this new RX ring.
634 	 */
635 	if (j == MAX_RINGS_PER_GROUP)
636 		return (EIO);
637 
638 	ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
639 	ring->arr_hw_rh = hw_rh;
640 	ring->arr_port = port;
641 	rx_grp->arg_ring_cnt++;
642 
643 	/*
644 	 * The group is already registered, dynamically add a new ring to the
645 	 * mac group.
646 	 */
647 	if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
648 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
649 		ring->arr_hw_rh = NULL;
650 		ring->arr_port = NULL;
651 		rx_grp->arg_ring_cnt--;
652 	} else {
653 		mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
654 		    mac_find_ring(rx_grp->arg_gh, j));
655 	}
656 	return (err);
657 }
658 
659 /*
660  * Remove the pseudo RX ring of the given HW ring handle.
661  */
662 static void
663 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
664 {
665 	aggr_pseudo_rx_ring_t	*ring;
666 	int			j;
667 
668 	for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
669 		ring = rx_grp->arg_rings + j;
670 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
671 		    ring->arr_hw_rh != hw_rh) {
672 			continue;
673 		}
674 
675 		mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
676 
677 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
678 		ring->arr_hw_rh = NULL;
679 		ring->arr_port = NULL;
680 		rx_grp->arg_ring_cnt--;
681 		mac_hwring_teardown(hw_rh);
682 		break;
683 	}
684 }
685 
686 /*
687  * Create pseudo rings over the HW rings of the port.
688  *
689  * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
690  *
691  * o Program existing unicast filters on the pseudo group into the HW group.
692  *
693  * o Program existing VLAN filters on the pseudo group into the HW group.
694  */
695 static int
696 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
697 {
698 	aggr_grp_t		*grp = port->lp_grp;
699 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
700 	aggr_unicst_addr_t	*addr, *a;
701 	mac_perim_handle_t	pmph;
702 	aggr_vlan_t		*avp;
703 	int			hw_rh_cnt, i = 0, j;
704 	int			err = 0;
705 
706 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
707 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
708 
709 	/*
710 	 * This function must be called after the aggr registers its MAC
711 	 * and its Rx group has been initialized.
712 	 */
713 	ASSERT(rx_grp->arg_gh != NULL);
714 
715 	/*
716 	 * Get the list of the underlying HW rings.
717 	 */
718 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
719 	    &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
720 
721 	if (port->lp_hwgh != NULL) {
722 		/*
723 		 * Quiesce the HW ring and the MAC SRS on the ring. Note
724 		 * that the HW ring will be restarted when the pseudo ring
725 		 * is started. At that time all the packets will be
726 		 * directly passed up to the pseudo Rx ring and handled
727 		 * by MAC SRS created over the pseudo Rx ring.
728 		 */
729 		mac_rx_client_quiesce(port->lp_mch);
730 		mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
731 	}
732 
733 	/*
734 	 * Add existing VLAN and unicast address filters to the port.
735 	 */
736 	for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
737 	    avp = list_next(&rx_grp->arg_vlans, avp)) {
738 		if ((err = aggr_port_addvlan(port, avp->av_vid)) != 0)
739 			goto err;
740 	}
741 
742 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
743 		if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
744 			goto err;
745 	}
746 
747 	for (i = 0; i < hw_rh_cnt; i++) {
748 		err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
749 		if (err != 0)
750 			goto err;
751 	}
752 
753 	port->lp_rx_grp_added = B_TRUE;
754 	mac_perim_exit(pmph);
755 	return (0);
756 
757 err:
758 	ASSERT(err != 0);
759 
760 	for (j = 0; j < i; j++)
761 		aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
762 
763 	for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
764 		aggr_port_remmac(port, a->aua_addr);
765 
766 	if (avp != NULL)
767 		avp = list_prev(&rx_grp->arg_vlans, avp);
768 
769 	for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
770 		int err2;
771 
772 		if ((err2 = aggr_port_remvlan(port, avp->av_vid)) != 0) {
773 			cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
774 			    ": errno %d.", avp->av_vid,
775 			    mac_client_name(port->lp_mch), err2);
776 		}
777 	}
778 
779 	if (port->lp_hwgh != NULL) {
780 		mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
781 		mac_rx_client_restart(port->lp_mch);
782 		port->lp_hwgh = NULL;
783 	}
784 
785 	mac_perim_exit(pmph);
786 	return (err);
787 }
788 
789 /*
790  * Destroy the pseudo rings mapping to this port and remove all VLAN
791  * and unicast filters from this port. Even if there are no underlying
792  * HW rings we must still remove the unicast filters to take the port
793  * out of promisc mode.
794  */
795 static void
796 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
797 {
798 	aggr_grp_t		*grp = port->lp_grp;
799 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
800 	aggr_unicst_addr_t	*addr;
801 	mac_group_handle_t	hwgh;
802 	mac_perim_handle_t	pmph;
803 	int			hw_rh_cnt, i;
804 
805 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
806 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
807 
808 	if (!port->lp_rx_grp_added)
809 		goto done;
810 
811 	ASSERT(rx_grp->arg_gh != NULL);
812 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
813 	    &hwgh, hw_rh, MAC_RING_TYPE_RX);
814 
815 	for (i = 0; i < hw_rh_cnt; i++)
816 		aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
817 
818 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
819 		aggr_port_remmac(port, addr->aua_addr);
820 
821 	for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
822 	    avp = list_next(&rx_grp->arg_vlans, avp)) {
823 		int err;
824 
825 		if ((err = aggr_port_remvlan(port, avp->av_vid)) != 0) {
826 			cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
827 			    ": errno %d.", avp->av_vid,
828 			    mac_client_name(port->lp_mch), err);
829 		}
830 	}
831 
832 	if (port->lp_hwgh != NULL) {
833 		port->lp_hwgh = NULL;
834 
835 		/*
836 		 * First clear the permanent-quiesced flag of the RX srs then
837 		 * restart the HW ring and the mac srs on the ring. Note that
838 		 * the HW ring and associated SRS will soon been removed when
839 		 * the port is removed from the aggr.
840 		 */
841 		mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
842 		mac_rx_client_restart(port->lp_mch);
843 	}
844 
845 	port->lp_rx_grp_added = B_FALSE;
846 done:
847 	mac_perim_exit(pmph);
848 }
849 
850 /*
851  * Add a pseudo TX ring for the given HW ring handle.
852  */
853 static int
854 aggr_add_pseudo_tx_ring(aggr_port_t *port,
855     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
856     mac_ring_handle_t *pseudo_rh)
857 {
858 	aggr_pseudo_tx_ring_t	*ring;
859 	int			err;
860 	int			i;
861 
862 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
863 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
864 		ring = tx_grp->atg_rings + i;
865 		if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
866 			break;
867 	}
868 	/*
869 	 * No slot for this new TX ring.
870 	 */
871 	if (i == MAX_RINGS_PER_GROUP)
872 		return (EIO);
873 	/*
874 	 * The following 4 statements needs to be done before
875 	 * calling mac_group_add_ring(). Otherwise it will
876 	 * result in an assertion failure in mac_init_ring().
877 	 */
878 	ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
879 	ring->atr_hw_rh = hw_rh;
880 	ring->atr_port = port;
881 	tx_grp->atg_ring_cnt++;
882 
883 	/*
884 	 * The TX side has no concept of ring groups unlike RX groups.
885 	 * There is just a single group which stores all the TX rings.
886 	 * This group will be used to store aggr's pseudo TX rings.
887 	 */
888 	if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
889 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
890 		ring->atr_hw_rh = NULL;
891 		ring->atr_port = NULL;
892 		tx_grp->atg_ring_cnt--;
893 	} else {
894 		*pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
895 		if (hw_rh != NULL) {
896 			mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
897 			    mac_find_ring(tx_grp->atg_gh, i));
898 		}
899 	}
900 
901 	return (err);
902 }
903 
904 /*
905  * Remove the pseudo TX ring of the given HW ring handle.
906  */
907 static void
908 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
909     mac_ring_handle_t pseudo_hw_rh)
910 {
911 	aggr_pseudo_tx_ring_t	*ring;
912 	int			i;
913 
914 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
915 		ring = tx_grp->atg_rings + i;
916 		if (ring->atr_rh != pseudo_hw_rh)
917 			continue;
918 
919 		ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
920 		mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
921 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
922 		mac_hwring_teardown(ring->atr_hw_rh);
923 		ring->atr_hw_rh = NULL;
924 		ring->atr_port = NULL;
925 		tx_grp->atg_ring_cnt--;
926 		break;
927 	}
928 }
929 
930 /*
931  * This function is called to create pseudo rings over hardware rings of
932  * the underlying device. There is a 1:1 mapping between the pseudo TX
933  * rings of the aggr and the hardware rings of the underlying port.
934  */
935 static int
936 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
937 {
938 	aggr_grp_t		*grp = port->lp_grp;
939 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
940 	mac_perim_handle_t	pmph;
941 	int			hw_rh_cnt, i = 0, j;
942 	int			err = 0;
943 
944 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
945 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
946 
947 	/*
948 	 * Get the list the the underlying HW rings.
949 	 */
950 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
951 	    NULL, hw_rh, MAC_RING_TYPE_TX);
952 
953 	/*
954 	 * Even if the underlying NIC does not have TX rings, we
955 	 * still make a psuedo TX ring for that NIC with NULL as
956 	 * the ring handle.
957 	 */
958 	if (hw_rh_cnt == 0)
959 		port->lp_tx_ring_cnt = 1;
960 	else
961 		port->lp_tx_ring_cnt = hw_rh_cnt;
962 
963 	port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
964 	    port->lp_tx_ring_cnt), KM_SLEEP);
965 	port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
966 	    port->lp_tx_ring_cnt), KM_SLEEP);
967 
968 	if (hw_rh_cnt == 0) {
969 		if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
970 		    NULL, &pseudo_rh)) == 0) {
971 			port->lp_tx_rings[0] = NULL;
972 			port->lp_pseudo_tx_rings[0] = pseudo_rh;
973 		}
974 	} else {
975 		for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
976 			err = aggr_add_pseudo_tx_ring(port,
977 			    tx_grp, hw_rh[i], &pseudo_rh);
978 			if (err != 0)
979 				break;
980 			port->lp_tx_rings[i] = hw_rh[i];
981 			port->lp_pseudo_tx_rings[i] = pseudo_rh;
982 		}
983 	}
984 
985 	if (err != 0) {
986 		if (hw_rh_cnt != 0) {
987 			for (j = 0; j < i; j++) {
988 				aggr_rem_pseudo_tx_ring(tx_grp,
989 				    port->lp_pseudo_tx_rings[j]);
990 			}
991 		}
992 		kmem_free(port->lp_tx_rings,
993 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
994 		kmem_free(port->lp_pseudo_tx_rings,
995 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
996 		port->lp_tx_ring_cnt = 0;
997 	} else {
998 		port->lp_tx_grp_added = B_TRUE;
999 		port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
1000 		    aggr_tx_ring_update, port);
1001 	}
1002 	mac_perim_exit(pmph);
1003 	aggr_grp_update_default(grp);
1004 	return (err);
1005 }
1006 
1007 /*
1008  * This function is called by aggr to remove pseudo TX rings over the
1009  * HW rings of the underlying port.
1010  */
1011 static void
1012 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
1013 {
1014 	aggr_grp_t		*grp = port->lp_grp;
1015 	mac_perim_handle_t	pmph;
1016 	int			i;
1017 
1018 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1019 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
1020 
1021 	if (!port->lp_tx_grp_added)
1022 		goto done;
1023 
1024 	ASSERT(tx_grp->atg_gh != NULL);
1025 
1026 	for (i = 0; i < port->lp_tx_ring_cnt; i++)
1027 		aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
1028 
1029 	kmem_free(port->lp_tx_rings,
1030 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1031 	kmem_free(port->lp_pseudo_tx_rings,
1032 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1033 
1034 	port->lp_tx_ring_cnt = 0;
1035 	(void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
1036 	port->lp_tx_grp_added = B_FALSE;
1037 	aggr_grp_update_default(grp);
1038 done:
1039 	mac_perim_exit(pmph);
1040 }
1041 
1042 static int
1043 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1044 {
1045 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1046 	return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1047 }
1048 
1049 static int
1050 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1051 {
1052 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1053 	return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1054 }
1055 
1056 /*
1057  * Here we need to start the pseudo-ring. As MAC already ensures that the
1058  * underlying device is set up, all we need to do is save the ring generation.
1059  *
1060  * Note, we don't end up wanting to use the underlying mac_hwring_start/stop
1061  * functions here as those don't actually stop and start the ring, they just
1062  * quiesce the ring. Regardless of whether the aggr is logically up or not, we
1063  * want to make sure that we can receive traffic for LACP.
1064  */
1065 static int
1066 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1067 {
1068 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1069 
1070 	rr_ring->arr_gen = mr_gen;
1071 	return (0);
1072 }
1073 
1074 /*
1075  * Add one or more ports to an existing link aggregation group.
1076  */
1077 int
1078 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1079     laioc_port_t *ports)
1080 {
1081 	int rc, i, nadded = 0;
1082 	aggr_grp_t *grp = NULL;
1083 	aggr_port_t *port;
1084 	boolean_t link_state_changed = B_FALSE;
1085 	mac_perim_handle_t mph, pmph;
1086 
1087 	/* get group corresponding to linkid */
1088 	rw_enter(&aggr_grp_lock, RW_READER);
1089 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1090 	    (mod_hash_val_t *)&grp) != 0) {
1091 		rw_exit(&aggr_grp_lock);
1092 		return (ENOENT);
1093 	}
1094 	AGGR_GRP_REFHOLD(grp);
1095 
1096 	/*
1097 	 * Hold the perimeter so that the aggregation won't be destroyed.
1098 	 */
1099 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1100 	rw_exit(&aggr_grp_lock);
1101 
1102 	/* add the specified ports to group */
1103 	for (i = 0; i < nports; i++) {
1104 		/* add port to group */
1105 		if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1106 		    force, &port)) != 0) {
1107 			goto bail;
1108 		}
1109 		ASSERT(port != NULL);
1110 		nadded++;
1111 
1112 		/* check capabilities */
1113 		if (!aggr_grp_capab_check(grp, port) ||
1114 		    !aggr_grp_sdu_check(grp, port) ||
1115 		    !aggr_grp_margin_check(grp, port)) {
1116 			rc = ENOTSUP;
1117 			goto bail;
1118 		}
1119 
1120 		/*
1121 		 * Create the pseudo ring for each HW ring of the underlying
1122 		 * port.
1123 		 */
1124 		rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1125 		if (rc != 0)
1126 			goto bail;
1127 		rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1128 		if (rc != 0)
1129 			goto bail;
1130 
1131 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1132 
1133 		/* set LACP mode */
1134 		aggr_port_lacp_set_mode(grp, port);
1135 
1136 		/* start port if group has already been started */
1137 		if (grp->lg_started) {
1138 			rc = aggr_port_start(port);
1139 			if (rc != 0) {
1140 				mac_perim_exit(pmph);
1141 				goto bail;
1142 			}
1143 
1144 			/*
1145 			 * Turn on the promiscuous mode over the port when it
1146 			 * is requested to be turned on to receive the
1147 			 * non-primary address over a port, or the promiscous
1148 			 * mode is enabled over the aggr.
1149 			 */
1150 			if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1151 				rc = aggr_port_promisc(port, B_TRUE);
1152 				if (rc != 0) {
1153 					mac_perim_exit(pmph);
1154 					goto bail;
1155 				}
1156 			}
1157 		}
1158 		mac_perim_exit(pmph);
1159 
1160 		/*
1161 		 * Attach each port if necessary.
1162 		 */
1163 		if (aggr_port_notify_link(grp, port))
1164 			link_state_changed = B_TRUE;
1165 
1166 		/*
1167 		 * Initialize the callback functions for this port.
1168 		 */
1169 		aggr_port_init_callbacks(port);
1170 	}
1171 
1172 	/* update the MAC address of the constituent ports */
1173 	if (aggr_grp_update_ports_mac(grp))
1174 		link_state_changed = B_TRUE;
1175 
1176 	if (link_state_changed)
1177 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1178 
1179 bail:
1180 	if (rc != 0) {
1181 		/* stop and remove ports that have been added */
1182 		for (i = 0; i < nadded; i++) {
1183 			port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1184 			ASSERT(port != NULL);
1185 			if (grp->lg_started) {
1186 				mac_perim_enter_by_mh(port->lp_mh, &pmph);
1187 				(void) aggr_port_promisc(port, B_FALSE);
1188 				aggr_port_stop(port);
1189 				mac_perim_exit(pmph);
1190 			}
1191 			aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1192 			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1193 			(void) aggr_grp_rem_port(grp, port, NULL, NULL);
1194 		}
1195 	}
1196 
1197 	mac_perim_exit(mph);
1198 	AGGR_GRP_REFRELE(grp);
1199 	return (rc);
1200 }
1201 
1202 static int
1203 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1204     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1205     aggr_lacp_timer_t lacp_timer)
1206 {
1207 	boolean_t mac_addr_changed = B_FALSE;
1208 	boolean_t link_state_changed = B_FALSE;
1209 	mac_perim_handle_t pmph;
1210 
1211 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1212 
1213 	/* validate fixed address if specified */
1214 	if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1215 	    ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1216 	    (mac_addr[0] & 0x01))) {
1217 		return (EINVAL);
1218 	}
1219 
1220 	/* update policy if requested */
1221 	if (update_mask & AGGR_MODIFY_POLICY)
1222 		aggr_send_update_policy(grp, policy);
1223 
1224 	/* update unicast MAC address if requested */
1225 	if (update_mask & AGGR_MODIFY_MAC) {
1226 		if (mac_fixed) {
1227 			/* user-supplied MAC address */
1228 			grp->lg_mac_addr_port = NULL;
1229 			if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1230 				bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1231 				mac_addr_changed = B_TRUE;
1232 			}
1233 		} else if (grp->lg_addr_fixed) {
1234 			/* switch from user-supplied to automatic */
1235 			aggr_port_t *port = grp->lg_ports;
1236 
1237 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1238 			bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1239 			grp->lg_mac_addr_port = port;
1240 			mac_addr_changed = B_TRUE;
1241 			mac_perim_exit(pmph);
1242 		}
1243 		grp->lg_addr_fixed = mac_fixed;
1244 	}
1245 
1246 	if (mac_addr_changed)
1247 		link_state_changed = aggr_grp_update_ports_mac(grp);
1248 
1249 	if (update_mask & AGGR_MODIFY_LACP_MODE)
1250 		aggr_lacp_update_mode(grp, lacp_mode);
1251 
1252 	if (update_mask & AGGR_MODIFY_LACP_TIMER)
1253 		aggr_lacp_update_timer(grp, lacp_timer);
1254 
1255 	if (link_state_changed)
1256 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1257 
1258 	if (mac_addr_changed)
1259 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1260 
1261 	return (0);
1262 }
1263 
1264 /*
1265  * Update properties of an existing link aggregation group.
1266  */
1267 int
1268 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1269     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1270     aggr_lacp_timer_t lacp_timer)
1271 {
1272 	aggr_grp_t *grp = NULL;
1273 	mac_perim_handle_t mph;
1274 	int err;
1275 
1276 	/* get group corresponding to linkid */
1277 	rw_enter(&aggr_grp_lock, RW_READER);
1278 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1279 	    (mod_hash_val_t *)&grp) != 0) {
1280 		rw_exit(&aggr_grp_lock);
1281 		return (ENOENT);
1282 	}
1283 	AGGR_GRP_REFHOLD(grp);
1284 
1285 	/*
1286 	 * Hold the perimeter so that the aggregation won't be destroyed.
1287 	 */
1288 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1289 	rw_exit(&aggr_grp_lock);
1290 
1291 	err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1292 	    mac_addr, lacp_mode, lacp_timer);
1293 
1294 	mac_perim_exit(mph);
1295 	AGGR_GRP_REFRELE(grp);
1296 	return (err);
1297 }
1298 
1299 /*
1300  * Create a new link aggregation group upon request from administrator.
1301  * Returns 0 on success, an errno on failure.
1302  */
1303 int
1304 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1305     laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1306     uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1307     cred_t *credp)
1308 {
1309 	aggr_grp_t *grp = NULL;
1310 	aggr_port_t *port;
1311 	mac_register_t *mac;
1312 	boolean_t link_state_changed;
1313 	mac_perim_handle_t mph;
1314 	int err;
1315 	int i;
1316 	kt_did_t tid = 0;
1317 
1318 	/* need at least one port */
1319 	if (nports == 0)
1320 		return (EINVAL);
1321 
1322 	rw_enter(&aggr_grp_lock, RW_WRITER);
1323 
1324 	/* does a group with the same linkid already exist? */
1325 	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1326 	    (mod_hash_val_t *)&grp);
1327 	if (err == 0) {
1328 		rw_exit(&aggr_grp_lock);
1329 		return (EEXIST);
1330 	}
1331 
1332 	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1333 
1334 	grp->lg_refs = 1;
1335 	grp->lg_closing = B_FALSE;
1336 	grp->lg_force = force;
1337 	grp->lg_linkid = linkid;
1338 	grp->lg_zoneid = crgetzoneid(credp);
1339 	grp->lg_ifspeed = 0;
1340 	grp->lg_link_state = LINK_STATE_UNKNOWN;
1341 	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1342 	grp->lg_started = B_FALSE;
1343 	grp->lg_promisc = B_FALSE;
1344 	grp->lg_lacp_done = B_FALSE;
1345 	grp->lg_tx_notify_done = B_FALSE;
1346 	grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1347 	grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1348 	    aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1349 	grp->lg_tx_notify_thread = thread_create(NULL, 0,
1350 	    aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1351 	grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1352 	    MAX_RINGS_PER_GROUP), KM_SLEEP);
1353 	grp->lg_tx_blocked_cnt = 0;
1354 	bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
1355 	bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1356 	aggr_lacp_init_grp(grp);
1357 
1358 	grp->lg_rx_group.arg_untagged = 0;
1359 	list_create(&(grp->lg_rx_group.arg_vlans), sizeof (aggr_vlan_t),
1360 	    offsetof(aggr_vlan_t, av_link));
1361 
1362 	/* add MAC ports to group */
1363 	grp->lg_ports = NULL;
1364 	grp->lg_nports = 0;
1365 	grp->lg_nattached_ports = 0;
1366 	grp->lg_ntx_ports = 0;
1367 
1368 	/*
1369 	 * If key is not specified by the user, allocate the key.
1370 	 */
1371 	if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1372 		err = ENOMEM;
1373 		goto bail;
1374 	}
1375 	grp->lg_key = key;
1376 
1377 	for (i = 0; i < nports; i++) {
1378 		err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
1379 		if (err != 0)
1380 			goto bail;
1381 	}
1382 
1383 	/*
1384 	 * If no explicit MAC address was specified by the administrator,
1385 	 * set it to the MAC address of the first port.
1386 	 */
1387 	grp->lg_addr_fixed = mac_fixed;
1388 	if (grp->lg_addr_fixed) {
1389 		/* validate specified address */
1390 		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1391 			err = EINVAL;
1392 			goto bail;
1393 		}
1394 		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1395 	} else {
1396 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1397 		grp->lg_mac_addr_port = grp->lg_ports;
1398 	}
1399 
1400 	/* set the initial group capabilities */
1401 	aggr_grp_capab_set(grp);
1402 
1403 	if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1404 		err = ENOMEM;
1405 		goto bail;
1406 	}
1407 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1408 	mac->m_driver = grp;
1409 	mac->m_dip = aggr_dip;
1410 	mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1411 	mac->m_src_addr = grp->lg_addr;
1412 	mac->m_callbacks = &aggr_m_callbacks;
1413 	mac->m_min_sdu = 0;
1414 	mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1415 	mac->m_margin = aggr_grp_max_margin(grp);
1416 	mac->m_v12n = MAC_VIRT_LEVEL1;
1417 	err = mac_register(mac, &grp->lg_mh);
1418 	mac_free(mac);
1419 	if (err != 0)
1420 		goto bail;
1421 
1422 	err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1423 	if (err != 0) {
1424 		(void) mac_unregister(grp->lg_mh);
1425 		grp->lg_mh = NULL;
1426 		goto bail;
1427 	}
1428 
1429 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1430 
1431 	/*
1432 	 * Update the MAC address of the constituent ports.
1433 	 * None of the port is attached at this time, the link state of the
1434 	 * aggregation will not change.
1435 	 */
1436 	link_state_changed = aggr_grp_update_ports_mac(grp);
1437 	ASSERT(!link_state_changed);
1438 
1439 	/* update outbound load balancing policy */
1440 	aggr_send_update_policy(grp, policy);
1441 
1442 	/* set LACP mode */
1443 	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1444 
1445 	/*
1446 	 * Attach each port if necessary.
1447 	 */
1448 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1449 		/*
1450 		 * Create the pseudo ring for each HW ring of the underlying
1451 		 * port. Note that this is done after the aggr registers the
1452 		 * mac.
1453 		 */
1454 		VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
1455 		VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
1456 		if (aggr_port_notify_link(grp, port))
1457 			link_state_changed = B_TRUE;
1458 
1459 		/*
1460 		 * Initialize the callback functions for this port.
1461 		 */
1462 		aggr_port_init_callbacks(port);
1463 	}
1464 
1465 	if (link_state_changed)
1466 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1467 
1468 	/* add new group to hash table */
1469 	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1470 	    (mod_hash_val_t)grp);
1471 	ASSERT(err == 0);
1472 	aggr_grp_cnt++;
1473 
1474 	mac_perim_exit(mph);
1475 	rw_exit(&aggr_grp_lock);
1476 	return (0);
1477 
1478 bail:
1479 
1480 	grp->lg_closing = B_TRUE;
1481 
1482 	port = grp->lg_ports;
1483 	while (port != NULL) {
1484 		aggr_port_t *cport;
1485 
1486 		cport = port->lp_next;
1487 		aggr_port_delete(port);
1488 		port = cport;
1489 	}
1490 
1491 	/*
1492 	 * Inform the lacp_rx thread to exit.
1493 	 */
1494 	mutex_enter(&grp->lg_lacp_lock);
1495 	grp->lg_lacp_done = B_TRUE;
1496 	cv_signal(&grp->lg_lacp_cv);
1497 	while (grp->lg_lacp_rx_thread != NULL)
1498 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1499 	mutex_exit(&grp->lg_lacp_lock);
1500 	/*
1501 	 * Inform the tx_notify thread to exit.
1502 	 */
1503 	mutex_enter(&grp->lg_tx_flowctl_lock);
1504 	if (grp->lg_tx_notify_thread != NULL) {
1505 		tid = grp->lg_tx_notify_thread->t_did;
1506 		grp->lg_tx_notify_done = B_TRUE;
1507 		cv_signal(&grp->lg_tx_flowctl_cv);
1508 	}
1509 	mutex_exit(&grp->lg_tx_flowctl_lock);
1510 	if (tid != 0)
1511 		thread_join(tid);
1512 
1513 	kmem_free(grp->lg_tx_blocked_rings,
1514 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1515 	rw_exit(&aggr_grp_lock);
1516 	AGGR_GRP_REFRELE(grp);
1517 	return (err);
1518 }
1519 
1520 /*
1521  * Return a pointer to the member of a group with specified linkid.
1522  */
1523 static aggr_port_t *
1524 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1525 {
1526 	aggr_port_t *port;
1527 
1528 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1529 
1530 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1531 		if (port->lp_linkid == linkid)
1532 			break;
1533 	}
1534 
1535 	return (port);
1536 }
1537 
1538 /*
1539  * Stop, detach and remove a port from a link aggregation group.
1540  */
1541 static int
1542 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1543     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1544 {
1545 	int rc = 0;
1546 	aggr_port_t **pport;
1547 	boolean_t mac_addr_changed = B_FALSE;
1548 	boolean_t link_state_changed = B_FALSE;
1549 	mac_perim_handle_t mph;
1550 	uint64_t val;
1551 	uint_t i;
1552 	uint_t stat;
1553 
1554 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1555 	ASSERT(grp->lg_nports > 1);
1556 	ASSERT(!grp->lg_closing);
1557 
1558 	/* unlink port */
1559 	for (pport = &grp->lg_ports; *pport != port;
1560 	    pport = &(*pport)->lp_next) {
1561 		if (*pport == NULL) {
1562 			rc = ENOENT;
1563 			goto done;
1564 		}
1565 	}
1566 	*pport = port->lp_next;
1567 
1568 	mac_perim_enter_by_mh(port->lp_mh, &mph);
1569 
1570 	/*
1571 	 * If the MAC address of the port being removed was assigned
1572 	 * to the group, update the group MAC address
1573 	 * using the MAC address of a different port.
1574 	 */
1575 	if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1576 		/*
1577 		 * Set the MAC address of the group to the
1578 		 * MAC address of its first port.
1579 		 */
1580 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1581 		grp->lg_mac_addr_port = grp->lg_ports;
1582 		mac_addr_changed = B_TRUE;
1583 	}
1584 
1585 	link_state_changed = aggr_grp_detach_port(grp, port);
1586 
1587 	/*
1588 	 * Add the counter statistics of the ports while it was aggregated
1589 	 * to the group's residual statistics.  This is done by obtaining
1590 	 * the current counter from the underlying MAC then subtracting the
1591 	 * value of the counter at the moment it was added to the
1592 	 * aggregation.
1593 	 */
1594 	for (i = 0; i < MAC_NSTAT; i++) {
1595 		stat = i + MAC_STAT_MIN;
1596 		if (!MAC_STAT_ISACOUNTER(stat))
1597 			continue;
1598 		val = aggr_port_stat(port, stat);
1599 		val -= port->lp_stat[i];
1600 		mutex_enter(&grp->lg_stat_lock);
1601 		grp->lg_stat[i] += val;
1602 		mutex_exit(&grp->lg_stat_lock);
1603 	}
1604 	for (i = 0; i < ETHER_NSTAT; i++) {
1605 		stat = i + MACTYPE_STAT_MIN;
1606 		if (!ETHER_STAT_ISACOUNTER(stat))
1607 			continue;
1608 		val = aggr_port_stat(port, stat);
1609 		val -= port->lp_ether_stat[i];
1610 		mutex_enter(&grp->lg_stat_lock);
1611 		grp->lg_ether_stat[i] += val;
1612 		mutex_exit(&grp->lg_stat_lock);
1613 	}
1614 
1615 	grp->lg_nports--;
1616 	mac_perim_exit(mph);
1617 
1618 	aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1619 	aggr_port_delete(port);
1620 
1621 	/*
1622 	 * If the group MAC address has changed, update the MAC address of
1623 	 * the remaining constituent ports according to the new MAC
1624 	 * address of the group.
1625 	 */
1626 	if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1627 		link_state_changed = B_TRUE;
1628 
1629 done:
1630 	if (mac_addr_changedp != NULL)
1631 		*mac_addr_changedp = mac_addr_changed;
1632 	if (link_state_changedp != NULL)
1633 		*link_state_changedp = link_state_changed;
1634 
1635 	return (rc);
1636 }
1637 
1638 /*
1639  * Remove one or more ports from an existing link aggregation group.
1640  */
1641 int
1642 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1643 {
1644 	int rc = 0, i;
1645 	aggr_grp_t *grp = NULL;
1646 	aggr_port_t *port;
1647 	boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1648 	boolean_t link_state_update = B_FALSE, link_state_changed;
1649 	mac_perim_handle_t mph, pmph;
1650 
1651 	/* get group corresponding to linkid */
1652 	rw_enter(&aggr_grp_lock, RW_READER);
1653 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1654 	    (mod_hash_val_t *)&grp) != 0) {
1655 		rw_exit(&aggr_grp_lock);
1656 		return (ENOENT);
1657 	}
1658 	AGGR_GRP_REFHOLD(grp);
1659 
1660 	/*
1661 	 * Hold the perimeter so that the aggregation won't be destroyed.
1662 	 */
1663 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1664 	rw_exit(&aggr_grp_lock);
1665 
1666 	/* we need to keep at least one port per group */
1667 	if (nports >= grp->lg_nports) {
1668 		rc = EINVAL;
1669 		goto bail;
1670 	}
1671 
1672 	/* first verify that all the groups are valid */
1673 	for (i = 0; i < nports; i++) {
1674 		if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1675 			/* port not found */
1676 			rc = ENOENT;
1677 			goto bail;
1678 		}
1679 	}
1680 
1681 	/* clear the promiscous mode for the specified ports */
1682 	for (i = 0; i < nports && rc == 0; i++) {
1683 		/* lookup port */
1684 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1685 		ASSERT(port != NULL);
1686 
1687 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1688 		rc = aggr_port_promisc(port, B_FALSE);
1689 		mac_perim_exit(pmph);
1690 	}
1691 	if (rc != 0) {
1692 		for (i = 0; i < nports; i++) {
1693 			port = aggr_grp_port_lookup(grp,
1694 			    ports[i].lp_linkid);
1695 			ASSERT(port != NULL);
1696 
1697 			/*
1698 			 * Turn the promiscuous mode back on if it is required
1699 			 * to receive the non-primary address over a port, or
1700 			 * the promiscous mode is enabled over the aggr.
1701 			 */
1702 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1703 			if (port->lp_started && (grp->lg_promisc ||
1704 			    port->lp_prom_addr != NULL)) {
1705 				(void) aggr_port_promisc(port, B_TRUE);
1706 			}
1707 			mac_perim_exit(pmph);
1708 		}
1709 		goto bail;
1710 	}
1711 
1712 	/* remove the specified ports from group */
1713 	for (i = 0; i < nports; i++) {
1714 		/* lookup port */
1715 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1716 		ASSERT(port != NULL);
1717 
1718 		/* stop port if group has already been started */
1719 		if (grp->lg_started) {
1720 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1721 			aggr_port_stop(port);
1722 			mac_perim_exit(pmph);
1723 		}
1724 
1725 		/*
1726 		 * aggr_rem_pseudo_tx_group() is not called here. Instead
1727 		 * it is called from inside aggr_grp_rem_port() after the
1728 		 * port has been detached. The reason is that
1729 		 * aggr_rem_pseudo_tx_group() removes one ring at a time
1730 		 * and if there is still traffic going on, then there
1731 		 * is the possibility of aggr_find_tx_ring() returning a
1732 		 * removed ring for transmission. Once the port has been
1733 		 * detached, that port will not be used and
1734 		 * aggr_find_tx_ring() will not return any rings
1735 		 * belonging to it.
1736 		 */
1737 		aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1738 
1739 		/* remove port from group */
1740 		rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1741 		    &link_state_changed);
1742 		ASSERT(rc == 0);
1743 		mac_addr_update = mac_addr_update || mac_addr_changed;
1744 		link_state_update = link_state_update || link_state_changed;
1745 	}
1746 
1747 bail:
1748 	if (mac_addr_update)
1749 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1750 	if (link_state_update)
1751 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1752 
1753 	mac_perim_exit(mph);
1754 	AGGR_GRP_REFRELE(grp);
1755 
1756 	return (rc);
1757 }
1758 
1759 int
1760 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1761 {
1762 	aggr_grp_t *grp = NULL;
1763 	aggr_port_t *port, *cport;
1764 	datalink_id_t tmpid;
1765 	mod_hash_val_t val;
1766 	mac_perim_handle_t mph, pmph;
1767 	int err;
1768 	kt_did_t tid = 0;
1769 
1770 	rw_enter(&aggr_grp_lock, RW_WRITER);
1771 
1772 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1773 	    (mod_hash_val_t *)&grp) != 0) {
1774 		rw_exit(&aggr_grp_lock);
1775 		return (ENOENT);
1776 	}
1777 
1778 	/*
1779 	 * Note that dls_devnet_destroy() must be called before lg_lock is
1780 	 * held. Otherwise, it will deadlock if another thread is in
1781 	 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1782 	 * dls_devnet_destroy() needs to delete.
1783 	 */
1784 	if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1785 		rw_exit(&aggr_grp_lock);
1786 		return (err);
1787 	}
1788 	ASSERT(linkid == tmpid);
1789 
1790 	/*
1791 	 * Unregister from the MAC service module. Since this can
1792 	 * fail if a client hasn't closed the MAC port, we gracefully
1793 	 * fail the operation.
1794 	 */
1795 	if ((err = mac_disable(grp->lg_mh)) != 0) {
1796 		(void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1797 		rw_exit(&aggr_grp_lock);
1798 		return (err);
1799 	}
1800 	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1801 	ASSERT(grp == (aggr_grp_t *)val);
1802 
1803 	ASSERT(aggr_grp_cnt > 0);
1804 	aggr_grp_cnt--;
1805 	rw_exit(&aggr_grp_lock);
1806 
1807 	/*
1808 	 * Inform the lacp_rx thread to exit.
1809 	 */
1810 	mutex_enter(&grp->lg_lacp_lock);
1811 	grp->lg_lacp_done = B_TRUE;
1812 	cv_signal(&grp->lg_lacp_cv);
1813 	while (grp->lg_lacp_rx_thread != NULL)
1814 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1815 	mutex_exit(&grp->lg_lacp_lock);
1816 	/*
1817 	 * Inform the tx_notify_thread to exit.
1818 	 */
1819 	mutex_enter(&grp->lg_tx_flowctl_lock);
1820 	if (grp->lg_tx_notify_thread != NULL) {
1821 		tid = grp->lg_tx_notify_thread->t_did;
1822 		grp->lg_tx_notify_done = B_TRUE;
1823 		cv_signal(&grp->lg_tx_flowctl_cv);
1824 	}
1825 	mutex_exit(&grp->lg_tx_flowctl_lock);
1826 	if (tid != 0)
1827 		thread_join(tid);
1828 
1829 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1830 
1831 	grp->lg_closing = B_TRUE;
1832 	/* detach and free MAC ports associated with group */
1833 	port = grp->lg_ports;
1834 	while (port != NULL) {
1835 		cport = port->lp_next;
1836 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1837 		if (grp->lg_started)
1838 			aggr_port_stop(port);
1839 		(void) aggr_grp_detach_port(grp, port);
1840 		mac_perim_exit(pmph);
1841 		aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1842 		aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1843 		aggr_port_delete(port);
1844 		port = cport;
1845 	}
1846 
1847 	mac_perim_exit(mph);
1848 
1849 	kmem_free(grp->lg_tx_blocked_rings,
1850 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1851 	/*
1852 	 * Wait for the port's lacp timer thread and its notification callback
1853 	 * to exit before calling mac_unregister() since both needs to access
1854 	 * the mac perimeter of the grp.
1855 	 */
1856 	aggr_grp_port_wait(grp);
1857 
1858 	VERIFY(mac_unregister(grp->lg_mh) == 0);
1859 	grp->lg_mh = NULL;
1860 
1861 	list_destroy(&(grp->lg_rx_group.arg_vlans));
1862 
1863 	AGGR_GRP_REFRELE(grp);
1864 	return (0);
1865 }
1866 
1867 void
1868 aggr_grp_free(aggr_grp_t *grp)
1869 {
1870 	ASSERT(grp->lg_refs == 0);
1871 	ASSERT(grp->lg_port_ref == 0);
1872 	if (grp->lg_key > AGGR_MAX_KEY) {
1873 		id_free(key_ids, grp->lg_key);
1874 		grp->lg_key = 0;
1875 	}
1876 	kmem_cache_free(aggr_grp_cache, grp);
1877 }
1878 
1879 int
1880 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1881     aggr_grp_info_new_grp_fn_t new_grp_fn,
1882     aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1883 {
1884 	aggr_grp_t	*grp;
1885 	aggr_port_t	*port;
1886 	mac_perim_handle_t mph, pmph;
1887 	int		rc = 0;
1888 
1889 	/*
1890 	 * Make sure that the aggregation link is visible from the caller's
1891 	 * zone.
1892 	 */
1893 	if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
1894 		return (ENOENT);
1895 
1896 	rw_enter(&aggr_grp_lock, RW_READER);
1897 
1898 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1899 	    (mod_hash_val_t *)&grp) != 0) {
1900 		rw_exit(&aggr_grp_lock);
1901 		return (ENOENT);
1902 	}
1903 	AGGR_GRP_REFHOLD(grp);
1904 
1905 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1906 	rw_exit(&aggr_grp_lock);
1907 
1908 	rc = new_grp_fn(fn_arg, grp->lg_linkid,
1909 	    (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
1910 	    grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
1911 	    grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1912 
1913 	if (rc != 0)
1914 		goto bail;
1915 
1916 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1917 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1918 		rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
1919 		    port->lp_state, &port->lp_lacp.ActorOperPortState);
1920 		mac_perim_exit(pmph);
1921 
1922 		if (rc != 0)
1923 			goto bail;
1924 	}
1925 
1926 bail:
1927 	mac_perim_exit(mph);
1928 	AGGR_GRP_REFRELE(grp);
1929 	return (rc);
1930 }
1931 
1932 /*ARGSUSED*/
1933 static void
1934 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1935 {
1936 	miocnak(q, mp, 0, ENOTSUP);
1937 }
1938 
1939 static int
1940 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
1941 {
1942 	aggr_port_t	*port;
1943 	uint_t		stat_index;
1944 
1945 	ASSERT(MUTEX_HELD(&grp->lg_stat_lock));
1946 
1947 	/* We only aggregate counter statistics. */
1948 	if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
1949 	    IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
1950 		return (ENOTSUP);
1951 	}
1952 
1953 	/*
1954 	 * Counter statistics for a group are computed by aggregating the
1955 	 * counters of the members MACs while they were aggregated, plus
1956 	 * the residual counter of the group itself, which is updated each
1957 	 * time a MAC is removed from the group.
1958 	 */
1959 	*val = 0;
1960 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1961 		/* actual port statistic */
1962 		*val += aggr_port_stat(port, stat);
1963 		/*
1964 		 * minus the port stat when it was added, plus any residual
1965 		 * amount for the group.
1966 		 */
1967 		if (IS_MAC_STAT(stat)) {
1968 			stat_index = stat - MAC_STAT_MIN;
1969 			*val -= port->lp_stat[stat_index];
1970 			*val += grp->lg_stat[stat_index];
1971 		} else if (IS_MACTYPE_STAT(stat)) {
1972 			stat_index = stat - MACTYPE_STAT_MIN;
1973 			*val -= port->lp_ether_stat[stat_index];
1974 			*val += grp->lg_ether_stat[stat_index];
1975 		}
1976 	}
1977 	return (0);
1978 }
1979 
1980 int
1981 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1982 {
1983 	aggr_pseudo_rx_ring_t   *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
1984 
1985 	if (rx_ring->arr_hw_rh != NULL) {
1986 		*val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
1987 	} else {
1988 		aggr_port_t	*port = rx_ring->arr_port;
1989 
1990 		*val = mac_stat_get(port->lp_mh, stat);
1991 
1992 	}
1993 	return (0);
1994 }
1995 
1996 int
1997 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1998 {
1999 	aggr_pseudo_tx_ring_t   *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
2000 
2001 	if (tx_ring->atr_hw_rh != NULL) {
2002 		*val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
2003 	} else {
2004 		aggr_port_t	*port = tx_ring->atr_port;
2005 
2006 		*val = mac_stat_get(port->lp_mh, stat);
2007 	}
2008 	return (0);
2009 }
2010 
2011 static int
2012 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
2013 {
2014 	aggr_grp_t		*grp = arg;
2015 	int			rval = 0;
2016 
2017 	mutex_enter(&grp->lg_stat_lock);
2018 
2019 	switch (stat) {
2020 	case MAC_STAT_IFSPEED:
2021 		*val = grp->lg_ifspeed;
2022 		break;
2023 
2024 	case ETHER_STAT_LINK_DUPLEX:
2025 		*val = grp->lg_link_duplex;
2026 		break;
2027 
2028 	default:
2029 		/*
2030 		 * For all other statistics, we return the aggregated stat
2031 		 * from the underlying ports.  aggr_grp_stat() will set
2032 		 * rval appropriately if the statistic isn't a counter.
2033 		 */
2034 		rval = aggr_grp_stat(grp, stat, val);
2035 	}
2036 
2037 	mutex_exit(&grp->lg_stat_lock);
2038 	return (rval);
2039 }
2040 
2041 static int
2042 aggr_m_start(void *arg)
2043 {
2044 	aggr_grp_t *grp = arg;
2045 	aggr_port_t *port;
2046 	mac_perim_handle_t mph, pmph;
2047 
2048 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2049 
2050 	/*
2051 	 * Attempts to start all configured members of the group.
2052 	 * Group members will be attached when their link-up notification
2053 	 * is received.
2054 	 */
2055 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2056 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2057 		if (aggr_port_start(port) != 0) {
2058 			mac_perim_exit(pmph);
2059 			continue;
2060 		}
2061 
2062 		/*
2063 		 * Turn on the promiscuous mode if it is required to receive
2064 		 * the non-primary address over a port, or the promiscous
2065 		 * mode is enabled over the aggr.
2066 		 */
2067 		if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2068 			if (aggr_port_promisc(port, B_TRUE) != 0)
2069 				aggr_port_stop(port);
2070 		}
2071 		mac_perim_exit(pmph);
2072 	}
2073 
2074 	grp->lg_started = B_TRUE;
2075 
2076 	mac_perim_exit(mph);
2077 	return (0);
2078 }
2079 
2080 static void
2081 aggr_m_stop(void *arg)
2082 {
2083 	aggr_grp_t *grp = arg;
2084 	aggr_port_t *port;
2085 	mac_perim_handle_t mph, pmph;
2086 
2087 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2088 
2089 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2090 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2091 
2092 		/* reset port promiscuous mode */
2093 		(void) aggr_port_promisc(port, B_FALSE);
2094 
2095 		aggr_port_stop(port);
2096 		mac_perim_exit(pmph);
2097 	}
2098 
2099 	grp->lg_started = B_FALSE;
2100 	mac_perim_exit(mph);
2101 }
2102 
2103 static int
2104 aggr_m_promisc(void *arg, boolean_t on)
2105 {
2106 	aggr_grp_t *grp = arg;
2107 	aggr_port_t *port;
2108 	boolean_t link_state_changed = B_FALSE;
2109 	mac_perim_handle_t mph, pmph;
2110 
2111 	AGGR_GRP_REFHOLD(grp);
2112 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2113 
2114 	ASSERT(!grp->lg_closing);
2115 
2116 	if (on == grp->lg_promisc)
2117 		goto bail;
2118 
2119 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2120 		int	err = 0;
2121 
2122 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2123 		AGGR_PORT_REFHOLD(port);
2124 		if (!on && (port->lp_prom_addr == NULL))
2125 			err = aggr_port_promisc(port, B_FALSE);
2126 		else if (on && port->lp_started)
2127 			err = aggr_port_promisc(port, B_TRUE);
2128 
2129 		if (err != 0) {
2130 			if (aggr_grp_detach_port(grp, port))
2131 				link_state_changed = B_TRUE;
2132 		} else {
2133 			/*
2134 			 * If a port was detached because of a previous
2135 			 * failure changing the promiscuity, the port
2136 			 * is reattached when it successfully changes
2137 			 * the promiscuity now, and this might cause
2138 			 * the link state of the aggregation to change.
2139 			 */
2140 			if (aggr_grp_attach_port(grp, port))
2141 				link_state_changed = B_TRUE;
2142 		}
2143 		mac_perim_exit(pmph);
2144 		AGGR_PORT_REFRELE(port);
2145 	}
2146 
2147 	grp->lg_promisc = on;
2148 
2149 	if (link_state_changed)
2150 		mac_link_update(grp->lg_mh, grp->lg_link_state);
2151 
2152 bail:
2153 	mac_perim_exit(mph);
2154 	AGGR_GRP_REFRELE(grp);
2155 
2156 	return (0);
2157 }
2158 
2159 static void
2160 aggr_grp_port_rename(const char *new_name, void *arg)
2161 {
2162 	/*
2163 	 * aggr port's mac client name is the format of "aggr link name" plus
2164 	 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2165 	 */
2166 	int aggr_len, link_len, clnt_name_len, i;
2167 	char *str_end, *str_st, *str_del;
2168 	char aggr_name[MAXNAMELEN];
2169 	char link_name[MAXNAMELEN];
2170 	char *clnt_name;
2171 	aggr_grp_t *aggr_grp = arg;
2172 	aggr_port_t *aggr_port = aggr_grp->lg_ports;
2173 
2174 	for (i = 0; i < aggr_grp->lg_nports; i++) {
2175 		clnt_name = mac_client_name(aggr_port->lp_mch);
2176 		clnt_name_len = strlen(clnt_name);
2177 		str_st = clnt_name;
2178 		str_end = &(clnt_name[clnt_name_len]);
2179 		str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2180 		ASSERT(str_del != NULL);
2181 		aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2182 		link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2183 		bzero(aggr_name, MAXNAMELEN);
2184 		bzero(link_name, MAXNAMELEN);
2185 		bcopy(clnt_name, aggr_name, aggr_len);
2186 		bcopy(str_del, link_name, link_len + 1);
2187 		bzero(clnt_name, MAXNAMELEN);
2188 		(void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2189 		    link_name);
2190 
2191 		(void) mac_rename_primary(aggr_port->lp_mh, NULL);
2192 		aggr_port = aggr_port->lp_next;
2193 	}
2194 }
2195 
2196 /*
2197  * Initialize the capabilities that are advertised for the group
2198  * according to the capabilities of the constituent ports.
2199  */
2200 static boolean_t
2201 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2202 {
2203 	aggr_grp_t *grp = arg;
2204 
2205 	switch (cap) {
2206 	case MAC_CAPAB_HCKSUM: {
2207 		uint32_t *hcksum_txflags = cap_data;
2208 		*hcksum_txflags = grp->lg_hcksum_txflags;
2209 		break;
2210 	}
2211 	case MAC_CAPAB_LSO: {
2212 		mac_capab_lso_t *cap_lso = cap_data;
2213 
2214 		if (grp->lg_lso) {
2215 			*cap_lso = grp->lg_cap_lso;
2216 			break;
2217 		} else {
2218 			return (B_FALSE);
2219 		}
2220 	}
2221 	case MAC_CAPAB_NO_NATIVEVLAN:
2222 		return (!grp->lg_vlan);
2223 	case MAC_CAPAB_NO_ZCOPY:
2224 		return (!grp->lg_zcopy);
2225 	case MAC_CAPAB_RINGS: {
2226 		mac_capab_rings_t *cap_rings = cap_data;
2227 
2228 		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2229 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2230 			cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2231 
2232 			/*
2233 			 * An aggregation advertises only one (pseudo) RX
2234 			 * group, which virtualizes the main/primary group of
2235 			 * the underlying devices.
2236 			 */
2237 			cap_rings->mr_gnum = 1;
2238 			cap_rings->mr_gaddring = NULL;
2239 			cap_rings->mr_gremring = NULL;
2240 		} else {
2241 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2242 			cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2243 			cap_rings->mr_gnum = 0;
2244 		}
2245 		cap_rings->mr_rget = aggr_fill_ring;
2246 		cap_rings->mr_gget = aggr_fill_group;
2247 		break;
2248 	}
2249 	case MAC_CAPAB_AGGR:
2250 	{
2251 		mac_capab_aggr_t *aggr_cap;
2252 
2253 		if (cap_data != NULL) {
2254 			aggr_cap = cap_data;
2255 			aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2256 			aggr_cap->mca_unicst = aggr_m_unicst;
2257 			aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2258 			aggr_cap->mca_arg = arg;
2259 		}
2260 		return (B_TRUE);
2261 	}
2262 	default:
2263 		return (B_FALSE);
2264 	}
2265 	return (B_TRUE);
2266 }
2267 
2268 /*
2269  * Callback function for MAC layer to register groups.
2270  */
2271 static void
2272 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2273     mac_group_info_t *infop, mac_group_handle_t gh)
2274 {
2275 	aggr_grp_t *grp = arg;
2276 	aggr_pseudo_rx_group_t *rx_group;
2277 	aggr_pseudo_tx_group_t *tx_group;
2278 
2279 	ASSERT(index == 0);
2280 	if (rtype == MAC_RING_TYPE_RX) {
2281 		rx_group = &grp->lg_rx_group;
2282 		rx_group->arg_gh = gh;
2283 		rx_group->arg_grp = grp;
2284 
2285 		infop->mgi_driver = (mac_group_driver_t)rx_group;
2286 		infop->mgi_start = NULL;
2287 		infop->mgi_stop = NULL;
2288 		infop->mgi_addmac = aggr_addmac;
2289 		infop->mgi_remmac = aggr_remmac;
2290 		infop->mgi_count = rx_group->arg_ring_cnt;
2291 
2292 		/*
2293 		 * Always set the HW VLAN callbacks. They are smart
2294 		 * enough to know when a port has HW VLAN filters to
2295 		 * program and when it doesn't.
2296 		 */
2297 		infop->mgi_addvlan = aggr_addvlan;
2298 		infop->mgi_remvlan = aggr_remvlan;
2299 	} else {
2300 		tx_group = &grp->lg_tx_group;
2301 		tx_group->atg_gh = gh;
2302 	}
2303 }
2304 
2305 /*
2306  * Callback funtion for MAC layer to register all rings.
2307  */
2308 static void
2309 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2310     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2311 {
2312 	aggr_grp_t	*grp = arg;
2313 
2314 	switch (rtype) {
2315 	case MAC_RING_TYPE_RX: {
2316 		aggr_pseudo_rx_group_t	*rx_group = &grp->lg_rx_group;
2317 		aggr_pseudo_rx_ring_t	*rx_ring;
2318 		mac_intr_t		aggr_mac_intr;
2319 
2320 		ASSERT(rg_index == 0);
2321 
2322 		ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2323 		rx_ring = rx_group->arg_rings + index;
2324 		rx_ring->arr_rh = rh;
2325 
2326 		/*
2327 		 * Entrypoint to enable interrupt (disable poll) and
2328 		 * disable interrupt (enable poll).
2329 		 */
2330 		aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2331 		aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2332 		aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2333 		aggr_mac_intr.mi_ddi_handle = NULL;
2334 
2335 		infop->mri_driver = (mac_ring_driver_t)rx_ring;
2336 		infop->mri_start = aggr_pseudo_start_ring;
2337 		infop->mri_stop = NULL;
2338 
2339 		infop->mri_intr = aggr_mac_intr;
2340 		infop->mri_poll = aggr_rx_poll;
2341 
2342 		infop->mri_stat = aggr_rx_ring_stat;
2343 		break;
2344 	}
2345 	case MAC_RING_TYPE_TX: {
2346 		aggr_pseudo_tx_group_t	*tx_group = &grp->lg_tx_group;
2347 		aggr_pseudo_tx_ring_t	*tx_ring;
2348 
2349 		ASSERT(rg_index == -1);
2350 		ASSERT(index < tx_group->atg_ring_cnt);
2351 
2352 		tx_ring = &tx_group->atg_rings[index];
2353 		tx_ring->atr_rh = rh;
2354 
2355 		infop->mri_driver = (mac_ring_driver_t)tx_ring;
2356 		infop->mri_start = NULL;
2357 		infop->mri_stop = NULL;
2358 		infop->mri_tx = aggr_ring_tx;
2359 		infop->mri_stat = aggr_tx_ring_stat;
2360 		/*
2361 		 * Use the hw TX ring handle to find if the ring needs
2362 		 * serialization or not. For NICs that do not expose
2363 		 * Tx rings, atr_hw_rh will be NULL.
2364 		 */
2365 		if (tx_ring->atr_hw_rh != NULL) {
2366 			infop->mri_flags =
2367 			    mac_hwring_getinfo(tx_ring->atr_hw_rh);
2368 		}
2369 		break;
2370 	}
2371 	default:
2372 		break;
2373 	}
2374 }
2375 
2376 static mblk_t *
2377 aggr_rx_poll(void *arg, int bytes_to_pickup)
2378 {
2379 	aggr_pseudo_rx_ring_t *rr_ring = arg;
2380 	aggr_port_t *port = rr_ring->arr_port;
2381 	aggr_grp_t *grp = port->lp_grp;
2382 	mblk_t *mp_chain, *mp, **mpp;
2383 
2384 	mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2385 
2386 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2387 		return (mp_chain);
2388 
2389 	mpp = &mp_chain;
2390 	while ((mp = *mpp) != NULL) {
2391 		if (MBLKL(mp) >= sizeof (struct ether_header)) {
2392 			struct ether_header *ehp;
2393 
2394 			ehp = (struct ether_header *)mp->b_rptr;
2395 			if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2396 				*mpp = mp->b_next;
2397 				mp->b_next = NULL;
2398 				aggr_recv_lacp(port,
2399 				    (mac_resource_handle_t)rr_ring, mp);
2400 				continue;
2401 			}
2402 		}
2403 
2404 		if (!port->lp_collector_enabled) {
2405 			*mpp = mp->b_next;
2406 			mp->b_next = NULL;
2407 			freemsg(mp);
2408 			continue;
2409 		}
2410 		mpp = &mp->b_next;
2411 	}
2412 	return (mp_chain);
2413 }
2414 
2415 static int
2416 aggr_addmac(void *arg, const uint8_t *mac_addr)
2417 {
2418 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2419 	aggr_unicst_addr_t	*addr, **pprev;
2420 	aggr_grp_t		*grp = rx_group->arg_grp;
2421 	aggr_port_t		*port, *p;
2422 	mac_perim_handle_t	mph;
2423 	int			err = 0;
2424 
2425 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2426 
2427 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2428 		mac_perim_exit(mph);
2429 		return (0);
2430 	}
2431 
2432 	/*
2433 	 * Insert this mac address into the list of mac addresses owned by
2434 	 * the aggregation pseudo group.
2435 	 */
2436 	pprev = &rx_group->arg_macaddr;
2437 	while ((addr = *pprev) != NULL) {
2438 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2439 			mac_perim_exit(mph);
2440 			return (EEXIST);
2441 		}
2442 		pprev = &addr->aua_next;
2443 	}
2444 	addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2445 	bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2446 	addr->aua_next = NULL;
2447 	*pprev = addr;
2448 
2449 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2450 		if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2451 			break;
2452 
2453 	if (err != 0) {
2454 		for (p = grp->lg_ports; p != port; p = p->lp_next)
2455 			aggr_port_remmac(p, mac_addr);
2456 
2457 		*pprev = NULL;
2458 		kmem_free(addr, sizeof (aggr_unicst_addr_t));
2459 	}
2460 
2461 	mac_perim_exit(mph);
2462 	return (err);
2463 }
2464 
2465 static int
2466 aggr_remmac(void *arg, const uint8_t *mac_addr)
2467 {
2468 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2469 	aggr_unicst_addr_t	*addr, **pprev;
2470 	aggr_grp_t		*grp = rx_group->arg_grp;
2471 	aggr_port_t		*port;
2472 	mac_perim_handle_t	mph;
2473 	int			err = 0;
2474 
2475 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2476 
2477 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2478 		mac_perim_exit(mph);
2479 		return (0);
2480 	}
2481 
2482 	/*
2483 	 * Insert this mac address into the list of mac addresses owned by
2484 	 * the aggregation pseudo group.
2485 	 */
2486 	pprev = &rx_group->arg_macaddr;
2487 	while ((addr = *pprev) != NULL) {
2488 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2489 			pprev = &addr->aua_next;
2490 			continue;
2491 		}
2492 		break;
2493 	}
2494 	if (addr == NULL) {
2495 		mac_perim_exit(mph);
2496 		return (EINVAL);
2497 	}
2498 
2499 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2500 		aggr_port_remmac(port, mac_addr);
2501 
2502 	*pprev = addr->aua_next;
2503 	kmem_free(addr, sizeof (aggr_unicst_addr_t));
2504 
2505 	mac_perim_exit(mph);
2506 	return (err);
2507 }
2508 
2509 /*
2510  * Search for VID in the Rx group's list and return a pointer if
2511  * found. Otherwise return NULL.
2512  */
2513 static aggr_vlan_t *
2514 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2515 {
2516 	ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2517 	for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2518 	    avp = list_next(&rx_group->arg_vlans, avp)) {
2519 		if (avp->av_vid == vid)
2520 			return (avp);
2521 	}
2522 
2523 	return (NULL);
2524 }
2525 
2526 /*
2527  * Accept traffic on the specified VID.
2528  *
2529  * Persist VLAN state in the aggr so that ports added later will
2530  * receive the correct filters. In the future it would be nice to
2531  * allow aggr to iterate its clients instead of duplicating state.
2532  */
2533 static int
2534 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
2535 {
2536 	aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2537 	aggr_grp_t		*aggr = rx_group->arg_grp;
2538 	aggr_port_t		*port, *p;
2539 	mac_perim_handle_t	mph;
2540 	int			err = 0;
2541 	aggr_vlan_t		*avp = NULL;
2542 
2543 	mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2544 
2545 	if (vid == MAC_VLAN_UNTAGGED) {
2546 		/*
2547 		 * Aggr is both a MAC provider and MAC client. As a
2548 		 * MAC provider it is passed MAC_VLAN_UNTAGGED by its
2549 		 * client. As a client itself, it should pass
2550 		 * VLAN_ID_NONE to its ports.
2551 		 */
2552 		vid = VLAN_ID_NONE;
2553 		rx_group->arg_untagged++;
2554 		goto update_ports;
2555 	}
2556 
2557 	avp = aggr_find_vlan(rx_group, vid);
2558 
2559 	if (avp != NULL) {
2560 		avp->av_refs++;
2561 		mac_perim_exit(mph);
2562 		return (0);
2563 	}
2564 
2565 	avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
2566 	avp->av_vid = vid;
2567 	avp->av_refs = 1;
2568 
2569 update_ports:
2570 	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2571 		if ((err = aggr_port_addvlan(port, vid)) != 0)
2572 			break;
2573 
2574 	if (err != 0) {
2575 		/*
2576 		 * If any of these calls fail then we are in a
2577 		 * situation where the ports have different HW state.
2578 		 * There's no reasonable action the MAC client can
2579 		 * take in this scenario to rectify the situation.
2580 		 */
2581 		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2582 			int err2;
2583 
2584 			if ((err2 = aggr_port_remvlan(p, vid)) != 0) {
2585 				cmn_err(CE_WARN, "Failed to remove VLAN %u"
2586 				    " from port %s: errno %d.", vid,
2587 				    mac_client_name(p->lp_mch), err2);
2588 			}
2589 
2590 		}
2591 
2592 		if (vid == VLAN_ID_NONE)
2593 			rx_group->arg_untagged--;
2594 
2595 		if (avp != NULL) {
2596 			kmem_free(avp, sizeof (aggr_vlan_t));
2597 			avp = NULL;
2598 		}
2599 	}
2600 
2601 	if (avp != NULL)
2602 		list_insert_tail(&rx_group->arg_vlans, avp);
2603 
2604 done:
2605 	mac_perim_exit(mph);
2606 	return (err);
2607 }
2608 
2609 /*
2610  * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
2611  */
2612 static int
2613 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
2614 {
2615 	aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2616 	aggr_grp_t		*aggr = rx_group->arg_grp;
2617 	aggr_port_t		*port, *p;
2618 	mac_perim_handle_t	mph;
2619 	int			err = 0;
2620 	aggr_vlan_t		*avp = NULL;
2621 
2622 	mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2623 
2624 	/*
2625 	 * See the comment in aggr_addvlan().
2626 	 */
2627 	if (vid == MAC_VLAN_UNTAGGED) {
2628 		vid = VLAN_ID_NONE;
2629 		rx_group->arg_untagged--;
2630 
2631 		if (rx_group->arg_untagged > 0)
2632 			goto done;
2633 
2634 		goto update_ports;
2635 	}
2636 
2637 	avp = aggr_find_vlan(rx_group, vid);
2638 
2639 	if (avp == NULL) {
2640 		err = ENOENT;
2641 		goto done;
2642 	}
2643 
2644 	avp->av_refs--;
2645 
2646 	if (avp->av_refs > 0)
2647 		goto done;
2648 
2649 update_ports:
2650 	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2651 		if ((err = aggr_port_remvlan(port, vid)) != 0)
2652 			break;
2653 
2654 	/*
2655 	 * See the comment in aggr_addvlan() for justification of the
2656 	 * use of VERIFY here.
2657 	 */
2658 	if (err != 0) {
2659 		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2660 			int err2;
2661 
2662 			if ((err2 = aggr_port_addvlan(p, vid)) != 0) {
2663 				cmn_err(CE_WARN, "Failed to add VLAN %u"
2664 				    " to port %s: errno %d.", vid,
2665 				    mac_client_name(p->lp_mch), err2);
2666 			}
2667 		}
2668 
2669 		if (avp != NULL)
2670 			avp->av_refs++;
2671 
2672 		if (vid == VLAN_ID_NONE)
2673 			rx_group->arg_untagged++;
2674 
2675 		goto done;
2676 	}
2677 
2678 	if (err == 0 && avp != NULL) {
2679 		VERIFY3U(avp->av_refs, ==, 0);
2680 		list_remove(&rx_group->arg_vlans, avp);
2681 		kmem_free(avp, sizeof (aggr_vlan_t));
2682 	}
2683 
2684 done:
2685 	mac_perim_exit(mph);
2686 	return (err);
2687 }
2688 
2689 /*
2690  * Add or remove the multicast addresses that are defined for the group
2691  * to or from the specified port.
2692  *
2693  * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2694  * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2695  * called when the port is either stopped or detached.
2696  */
2697 void
2698 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2699 {
2700 	aggr_grp_t *grp = port->lp_grp;
2701 
2702 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
2703 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2704 
2705 	if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2706 		return;
2707 
2708 	mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2709 }
2710 
2711 static int
2712 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2713 {
2714 	aggr_grp_t *grp = arg;
2715 	aggr_port_t *port = NULL, *errport = NULL;
2716 	mac_perim_handle_t mph;
2717 	int err = 0;
2718 
2719 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2720 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2721 		if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2722 		    !port->lp_started) {
2723 			continue;
2724 		}
2725 		err = aggr_port_multicst(port, add, addrp);
2726 		if (err != 0) {
2727 			errport = port;
2728 			break;
2729 		}
2730 	}
2731 
2732 	/*
2733 	 * At least one port caused error return and this error is returned to
2734 	 * mac, eventually a NAK would be sent upwards.
2735 	 * Some ports have this multicast address listed now, and some don't.
2736 	 * Treat this error as a whole aggr failure not individual port failure.
2737 	 * Therefore remove this multicast address from other ports.
2738 	 */
2739 	if ((err != 0) && add) {
2740 		for (port = grp->lg_ports; port != errport;
2741 		    port = port->lp_next) {
2742 			if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2743 			    !port->lp_started) {
2744 				continue;
2745 			}
2746 			(void) aggr_port_multicst(port, B_FALSE, addrp);
2747 		}
2748 	}
2749 	mac_perim_exit(mph);
2750 	return (err);
2751 }
2752 
2753 static int
2754 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2755 {
2756 	aggr_grp_t *grp = arg;
2757 	mac_perim_handle_t mph;
2758 	int err;
2759 
2760 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2761 	err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2762 	    0, 0);
2763 	mac_perim_exit(mph);
2764 	return (err);
2765 }
2766 
2767 /*
2768  * Initialize the capabilities that are advertised for the group
2769  * according to the capabilities of the constituent ports.
2770  */
2771 static void
2772 aggr_grp_capab_set(aggr_grp_t *grp)
2773 {
2774 	uint32_t cksum;
2775 	aggr_port_t *port;
2776 	mac_capab_lso_t cap_lso;
2777 
2778 	ASSERT(grp->lg_mh == NULL);
2779 	ASSERT(grp->lg_ports != NULL);
2780 
2781 	grp->lg_hcksum_txflags = (uint32_t)-1;
2782 	grp->lg_zcopy = B_TRUE;
2783 	grp->lg_vlan = B_TRUE;
2784 
2785 	grp->lg_lso = B_TRUE;
2786 	grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2787 	grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2788 
2789 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2790 		if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2791 			cksum = 0;
2792 		grp->lg_hcksum_txflags &= cksum;
2793 
2794 		grp->lg_vlan &=
2795 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2796 
2797 		grp->lg_zcopy &=
2798 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2799 
2800 		grp->lg_lso &=
2801 		    mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2802 		if (grp->lg_lso) {
2803 			grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2804 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2805 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2806 				grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2807 				    cap_lso.lso_basic_tcp_ipv4.lso_max;
2808 		}
2809 	}
2810 }
2811 
2812 /*
2813  * Checks whether the capabilities of the port being added are compatible
2814  * with the current capabilities of the aggregation.
2815  */
2816 static boolean_t
2817 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2818 {
2819 	uint32_t hcksum_txflags;
2820 
2821 	ASSERT(grp->lg_ports != NULL);
2822 
2823 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2824 	    grp->lg_vlan) != grp->lg_vlan) {
2825 		return (B_FALSE);
2826 	}
2827 
2828 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2829 	    grp->lg_zcopy) != grp->lg_zcopy) {
2830 		return (B_FALSE);
2831 	}
2832 
2833 	if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2834 		if (grp->lg_hcksum_txflags != 0)
2835 			return (B_FALSE);
2836 	} else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2837 	    grp->lg_hcksum_txflags) {
2838 		return (B_FALSE);
2839 	}
2840 
2841 	if (grp->lg_lso) {
2842 		mac_capab_lso_t cap_lso;
2843 
2844 		if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2845 			if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2846 			    grp->lg_cap_lso.lso_flags)
2847 				return (B_FALSE);
2848 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2849 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2850 				return (B_FALSE);
2851 		} else {
2852 			return (B_FALSE);
2853 		}
2854 	}
2855 
2856 	return (B_TRUE);
2857 }
2858 
2859 /*
2860  * Returns the maximum SDU according to the SDU of the constituent ports.
2861  */
2862 static uint_t
2863 aggr_grp_max_sdu(aggr_grp_t *grp)
2864 {
2865 	uint_t max_sdu = (uint_t)-1;
2866 	aggr_port_t *port;
2867 
2868 	ASSERT(grp->lg_ports != NULL);
2869 
2870 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2871 		uint_t port_sdu_max;
2872 
2873 		mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2874 		if (max_sdu > port_sdu_max)
2875 			max_sdu = port_sdu_max;
2876 	}
2877 
2878 	return (max_sdu);
2879 }
2880 
2881 /*
2882  * Checks if the maximum SDU of the specified port is compatible
2883  * with the maximum SDU of the specified aggregation group, returns
2884  * B_TRUE if it is, B_FALSE otherwise.
2885  */
2886 static boolean_t
2887 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2888 {
2889 	uint_t port_sdu_max;
2890 
2891 	mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2892 	return (port_sdu_max >= grp->lg_max_sdu);
2893 }
2894 
2895 /*
2896  * Returns the maximum margin according to the margin of the constituent ports.
2897  */
2898 static uint32_t
2899 aggr_grp_max_margin(aggr_grp_t *grp)
2900 {
2901 	uint32_t margin = UINT32_MAX;
2902 	aggr_port_t *port;
2903 
2904 	ASSERT(grp->lg_mh == NULL);
2905 	ASSERT(grp->lg_ports != NULL);
2906 
2907 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2908 		if (margin > port->lp_margin)
2909 			margin = port->lp_margin;
2910 	}
2911 
2912 	grp->lg_margin = margin;
2913 	return (margin);
2914 }
2915 
2916 /*
2917  * Checks if the maximum margin of the specified port is compatible
2918  * with the maximum margin of the specified aggregation group, returns
2919  * B_TRUE if it is, B_FALSE otherwise.
2920  */
2921 static boolean_t
2922 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
2923 {
2924 	if (port->lp_margin >= grp->lg_margin)
2925 		return (B_TRUE);
2926 
2927 	/*
2928 	 * See whether the current margin value is allowed to be changed to
2929 	 * the new value.
2930 	 */
2931 	if (!mac_margin_update(grp->lg_mh, port->lp_margin))
2932 		return (B_FALSE);
2933 
2934 	grp->lg_margin = port->lp_margin;
2935 	return (B_TRUE);
2936 }
2937 
2938 /*
2939  * Set MTU on individual ports of an aggregation group
2940  */
2941 static int
2942 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
2943     uint32_t *old_mtu)
2944 {
2945 	boolean_t		removed = B_FALSE;
2946 	mac_perim_handle_t	mph;
2947 	mac_diag_t		diag;
2948 	int			err, rv, retry = 0;
2949 
2950 	if (port->lp_mah != NULL) {
2951 		(void) mac_unicast_remove(port->lp_mch, port->lp_mah);
2952 		port->lp_mah = NULL;
2953 		removed = B_TRUE;
2954 	}
2955 	err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
2956 try_again:
2957 	if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
2958 	    MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
2959 	    &port->lp_mah, 0, &diag)) != 0) {
2960 		/*
2961 		 * following is a workaround for a bug in 'bge' driver.
2962 		 * See CR 6794654 for more information and this work around
2963 		 * will be removed once the CR is fixed.
2964 		 */
2965 		if (rv == EIO && retry++ < 3) {
2966 			delay(2 * hz);
2967 			goto try_again;
2968 		}
2969 		/*
2970 		 * if mac_unicast_add() failed while setting the MTU,
2971 		 * detach the port from the group.
2972 		 */
2973 		mac_perim_enter_by_mh(port->lp_mh, &mph);
2974 		(void) aggr_grp_detach_port(grp, port);
2975 		mac_perim_exit(mph);
2976 		cmn_err(CE_WARN, "Unable to restart the port %s while "
2977 		    "setting MTU. Detaching the port from the aggregation.",
2978 		    mac_client_name(port->lp_mch));
2979 	}
2980 	return (err);
2981 }
2982 
2983 static int
2984 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
2985 {
2986 	int			err = 0, i, rv;
2987 	aggr_port_t		*port;
2988 	uint32_t		*mtu;
2989 
2990 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2991 
2992 	/*
2993 	 * If the MTU being set is equal to aggr group's maximum
2994 	 * allowable value, then there is nothing to change
2995 	 */
2996 	if (sdu == grp->lg_max_sdu)
2997 		return (0);
2998 
2999 	/* 0 is aggr group's min sdu */
3000 	if (sdu == 0)
3001 		return (EINVAL);
3002 
3003 	mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
3004 	for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
3005 	    port = port->lp_next, i++) {
3006 		err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
3007 	}
3008 	if (err != 0) {
3009 		/* recover from error: reset the mtus of the ports */
3010 		aggr_port_t *tmp;
3011 
3012 		for (tmp = grp->lg_ports, i = 0; tmp != port;
3013 		    tmp = tmp->lp_next, i++) {
3014 			(void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
3015 		}
3016 		goto bail;
3017 	}
3018 	grp->lg_max_sdu = aggr_grp_max_sdu(grp);
3019 	rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
3020 	ASSERT(rv == 0);
3021 bail:
3022 	kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
3023 	return (err);
3024 }
3025 
3026 /*
3027  * Callback functions for set/get of properties
3028  */
3029 /*ARGSUSED*/
3030 static int
3031 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3032     uint_t pr_valsize, const void *pr_val)
3033 {
3034 	int		err = ENOTSUP;
3035 	aggr_grp_t	*grp = m_driver;
3036 
3037 	switch (pr_num) {
3038 	case MAC_PROP_MTU: {
3039 		uint32_t	mtu;
3040 
3041 		if (pr_valsize < sizeof (mtu)) {
3042 			err = EINVAL;
3043 			break;
3044 		}
3045 		bcopy(pr_val, &mtu, sizeof (mtu));
3046 		err = aggr_sdu_update(grp, mtu);
3047 		break;
3048 	}
3049 	default:
3050 		break;
3051 	}
3052 	return (err);
3053 }
3054 
/*
 * One boundary of an MTU range, as used by aggr_mtu_range_intersection()
 * while it sorts and sweeps all range boundaries.
 */
typedef struct rboundary {
	uint32_t	bval;	/* boundary value (a range's min or max) */
	int		btype;	/* +1 for a min boundary, -1 for a max */
} rboundary_t;
3059 
3060 /*
3061  * This function finds the intersection of mtu ranges stored in arrays -
3062  * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
3063  * Individual arrays are assumed to contain non-overlapping ranges.
3064  * Algorithm:
3065  *   A range has two boundaries - min and max. We scan all arrays and store
3066  * each boundary as a separate element in a temporary array. We also store
3067  * the boundary types, min or max, as +1 or -1 respectively in the temporary
3068  * array. Then we sort the temporary array in ascending order. We scan the
3069  * sorted array from lower to higher values and keep a cumulative sum of
3070  * boundary types. Element in the temporary array for which the sum reaches
3071  * mcount is a min boundary of a range in the result and next element will be
3072  * max boundary.
3073  *
3074  * Example for mcount = 3,
3075  *
3076  *  ----|_________|-------|_______|----|__|------ mrange[0]
3077  *
3078  *  -------|________|--|____________|-----|___|-- mrange[1]
3079  *
3080  *  --------|________________|-------|____|------ mrange[2]
3081  *
3082  *                                      3 2 1
3083  *                                       \|/
3084  *      1  23     2 1  2  3  2    1 01 2  V   0  <- the sum
3085  *  ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
3086  *
3087  *                                 same min and max
3088  *                                        V
3089  *  --------|_____|-------|__|------------|------ intersecting ranges
3090  */
3091 void
3092 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
3093     mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
3094 {
3095 	mac_propval_uint32_range_t	*rval, *ur;
3096 	int				rmaxcnt, rcount;
3097 	size_t				sz_range32;
3098 	rboundary_t			*ta; /* temporary array */
3099 	rboundary_t			temp;
3100 	boolean_t			range_started = B_FALSE;
3101 	int				i, j, m, sum;
3102 
3103 	sz_range32 = sizeof (mac_propval_uint32_range_t);
3104 
3105 	for (i = 0, rmaxcnt = 0; i < mcount; i++)
3106 		rmaxcnt += mrange[i]->mpr_count;
3107 
3108 	/* Allocate enough space to store the results */
3109 	rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
3110 
3111 	/* Number of boundaries are twice as many as ranges */
3112 	ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
3113 
3114 	for (i = 0, m = 0; i < mcount; i++) {
3115 		ur = &(mrange[i]->mpr_range_uint32[0]);
3116 		for (j = 0; j < mrange[i]->mpr_count; j++) {
3117 			ta[m].bval = ur[j].mpur_min;
3118 			ta[m++].btype = 1;
3119 			ta[m].bval = ur[j].mpur_max;
3120 			ta[m++].btype = -1;
3121 		}
3122 	}
3123 
3124 	/*
3125 	 * Sort the temporary array in ascending order of bval;
3126 	 * if boundary values are same then sort on btype.
3127 	 */
3128 	for (i = 0; i < m-1; i++) {
3129 		for (j = i+1; j < m; j++) {
3130 			if ((ta[i].bval > ta[j].bval) ||
3131 			    ((ta[i].bval == ta[j].bval) &&
3132 			    (ta[i].btype < ta[j].btype))) {
3133 				temp = ta[i];
3134 				ta[i] = ta[j];
3135 				ta[j] = temp;
3136 			}
3137 		}
3138 	}
3139 
3140 	/* Walk through temporary array to find all ranges in the results */
3141 	for (i = 0, sum = 0, rcount = 0; i < m; i++) {
3142 		sum += ta[i].btype;
3143 		if (sum == mcount) {
3144 			rval[rcount].mpur_min = ta[i].bval;
3145 			range_started = B_TRUE;
3146 		} else if (sum < mcount && range_started) {
3147 			rval[rcount++].mpur_max = ta[i].bval;
3148 			range_started = B_FALSE;
3149 		}
3150 	}
3151 
3152 	*prval = rval;
3153 	*prmaxcnt = rmaxcnt;
3154 	*prcount = rcount;
3155 
3156 	kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
3157 }
3158 
3159 /*
3160  * Returns the mtu ranges which could be supported by aggr group.
3161  * prmaxcnt returns the size of the buffer prval, prcount returns
3162  * the number of valid entries in prval. Caller is responsible
3163  * for freeing up prval.
3164  */
3165 int
3166 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
3167     int *prmaxcnt, int *prcount)
3168 {
3169 	mac_propval_range_t		**vals;
3170 	aggr_port_t			*port;
3171 	mac_perim_handle_t		mph;
3172 	uint_t				i, numr;
3173 	int				err = 0;
3174 	size_t				sz_propval, sz_range32;
3175 	size_t				size;
3176 
3177 	sz_propval = sizeof (mac_propval_range_t);
3178 	sz_range32 = sizeof (mac_propval_uint32_range_t);
3179 
3180 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3181 
3182 	vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
3183 	    KM_SLEEP);
3184 
3185 	for (port = grp->lg_ports, i = 0; port != NULL;
3186 	    port = port->lp_next, i++) {
3187 
3188 		size = sz_propval;
3189 		vals[i] = kmem_alloc(size, KM_SLEEP);
3190 		vals[i]->mpr_count = 1;
3191 
3192 		mac_perim_enter_by_mh(port->lp_mh, &mph);
3193 
3194 		err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3195 		    NULL, 0, vals[i], NULL);
3196 		if (err == ENOSPC) {
3197 			/*
3198 			 * Not enough space to hold all ranges.
3199 			 * Allocate extra space as indicated and retry.
3200 			 */
3201 			numr = vals[i]->mpr_count;
3202 			kmem_free(vals[i], sz_propval);
3203 			size = sz_propval + (numr - 1) * sz_range32;
3204 			vals[i] = kmem_alloc(size, KM_SLEEP);
3205 			vals[i]->mpr_count = numr;
3206 			err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3207 			    NULL, 0, vals[i], NULL);
3208 			ASSERT(err != ENOSPC);
3209 		}
3210 		mac_perim_exit(mph);
3211 		if (err != 0) {
3212 			kmem_free(vals[i], size);
3213 			vals[i] = NULL;
3214 			break;
3215 		}
3216 	}
3217 
3218 	/*
3219 	 * if any of the underlying ports does not support changing MTU then
3220 	 * just return ENOTSUP
3221 	 */
3222 	if (port != NULL) {
3223 		ASSERT(err != 0);
3224 		goto done;
3225 	}
3226 
3227 	aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
3228 	    prcount);
3229 
3230 done:
3231 	for (i = 0; i < grp->lg_nports; i++) {
3232 		if (vals[i] != NULL) {
3233 			numr = vals[i]->mpr_count;
3234 			size = sz_propval + (numr - 1) * sz_range32;
3235 			kmem_free(vals[i], size);
3236 		}
3237 	}
3238 
3239 	kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
3240 	return (err);
3241 }
3242 
3243 static void
3244 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3245     mac_prop_info_handle_t prh)
3246 {
3247 	aggr_grp_t			*grp = m_driver;
3248 	mac_propval_uint32_range_t	*rval = NULL;
3249 	int				i, rcount, rmaxcnt;
3250 	int				err = 0;
3251 
3252 	_NOTE(ARGUNUSED(pr_name));
3253 
3254 	switch (pr_num) {
3255 	case MAC_PROP_MTU:
3256 
3257 		err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
3258 		    &rcount);
3259 		if (err != 0) {
3260 			ASSERT(rval == NULL);
3261 			return;
3262 		}
3263 		for (i = 0; i < rcount; i++) {
3264 			mac_prop_info_set_range_uint32(prh,
3265 			    rval[i].mpur_min, rval[i].mpur_max);
3266 		}
3267 		kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3268 		break;
3269 	}
3270 }
3271