1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2020 Joyent, Inc.
24 * Copyright 2020 RackTop Systems, Inc.
25 */
26
27 /*
28 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
29 *
30 * An instance of the structure aggr_grp_t is allocated for each
31 * link aggregation group. When created, aggr_grp_t objects are
32 * entered into the aggr_grp_hash hash table maintained by the modhash
33 * module. The hash key is the linkid associated with the link
34 * aggregation group.
35 *
36 * Each aggregation contains a set of ports. The port is represented
37 * by the aggr_port_t structure. A port consists of a single MAC
38 * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying
39 * MAC. This client is used by the aggr to send and receive LACP
40 * traffic. Each port client takes on the same MAC unicast address --
41 * the address of the aggregation itself (taken from the first port by
42 * default).
43 *
44 * The MAC client that hangs off each aggr port is not your typical
45 * MAC client. Not only does it have exclusive control of the MAC, but
46 * it also has no Tx or Rx SRSes. An SRS is designed to queue and
47 * fanout traffic among L4 protocols; but the aggr is an intermediary,
48 * not a consumer. Instead of using SRSes, the aggr puts the
49 * underlying hardware rings into passthru mode and ships packets up
50 * via a direct call to aggr_recv_cb(). This allows aggr to enforce
51 * LACP while passing all other traffic up to clients of the aggr.
52 *
53 * Pseudo Rx Groups and Rings
54 * --------------------------
55 *
56 * It is imperative for client performance that the aggr provide as
57 * many MAC groups as possible. In order to use the underlying HW
58 * resources, aggr creates pseudo groups to aggregate the underlying
59 * HW groups. Every HW group gets mapped to a pseudo group; and every
60 * HW ring in that group gets mapped to a pseudo ring. The pseudo
61 * group at index 0 combines all the HW groups at index 0 from each
62 * port, etc. The aggr's MAC then creates normal MAC groups and rings
63 * out of these pseudo groups and rings to present to the aggr's
64 * clients. To the clients, the aggr's groups and rings are absolutely
65 * no different than a NIC's groups or rings.
66 *
67 * Pseudo Tx Rings
68 * ---------------
69 *
70 * The underlying ports (NICs) in an aggregation can have Tx rings. To
71 * enhance aggr's performance, these Tx rings are made available to
 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
 * not new. They are already present and implemented on the Rx side.
74 * The same concept is extended to the Tx side where each Tx ring of
75 * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
76 * each pseudo Tx ring will map to a specific hardware Tx ring. Even
77 * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
78 * is given to the aggregation layer.
79 *
80 * With this change, the outgoing stack depth looks much better:
81 *
82 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
84 *
85 * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings:
86 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
87 *
88 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
89 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx
90 * ring belonging to a port on which the packet has to be sent.
91 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
92 * policy and then uses the fanout_hint passed to it to pick a Tx ring from
93 * the selected port.
94 *
95 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
96 * bandwidth limit is applied first on the outgoing packet and the packets
97 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
98 * particular Tx ring.
99 */
100
101 #include <sys/types.h>
102 #include <sys/sysmacros.h>
103 #include <sys/conf.h>
104 #include <sys/cmn_err.h>
105 #include <sys/disp.h>
106 #include <sys/list.h>
107 #include <sys/ksynch.h>
108 #include <sys/kmem.h>
109 #include <sys/stream.h>
110 #include <sys/modctl.h>
111 #include <sys/ddi.h>
112 #include <sys/sunddi.h>
113 #include <sys/atomic.h>
114 #include <sys/stat.h>
115 #include <sys/modhash.h>
116 #include <sys/id_space.h>
117 #include <sys/strsun.h>
118 #include <sys/cred.h>
119 #include <sys/dlpi.h>
120 #include <sys/zone.h>
121 #include <sys/mac_provider.h>
122 #include <sys/dls.h>
123 #include <sys/vlan.h>
124 #include <sys/aggr.h>
125 #include <sys/aggr_impl.h>
126
/*
 * MAC entry points. All of these except aggr_m_unicst() appear in the
 * aggr_m_callbacks table defined later in this file.
 */
static int aggr_m_start(void *);
static void aggr_m_stop(void *);
static int aggr_m_promisc(void *, boolean_t);
static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
static int aggr_m_unicst(void *, const uint8_t *);
static int aggr_m_stat(void *, uint_t, uint64_t *);
static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Port lookup and removal helpers for a group. */
static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
    boolean_t *);

/* Group capability, SDU and margin helpers. */
static void aggr_grp_capab_set(aggr_grp_t *);
static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
static uint_t aggr_grp_max_sdu(aggr_grp_t *);
static uint32_t aggr_grp_max_margin(aggr_grp_t *);
static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);

/* Pseudo Rx group/ring plumbing and the group/ring fill routines. */
static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static int aggr_pseudo_disable_intr(mac_intr_handle_t);
static int aggr_pseudo_enable_intr(mac_intr_handle_t);
static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t);
static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t);
static int aggr_addmac(void *, const uint8_t *);
static int aggr_remmac(void *, const uint8_t *);
static int aggr_addvlan(mac_group_driver_t, uint16_t);
static int aggr_remvlan(mac_group_driver_t, uint16_t);
static mblk_t *aggr_rx_poll(void *, int);
static void aggr_fill_ring(void *, mac_ring_type_t, const int,
    const int, mac_ring_info_t *, mac_ring_handle_t);
static void aggr_fill_group(void *, mac_ring_type_t, const int,
    mac_group_info_t *, mac_group_handle_t);
166
/* kmem cache of aggr_grp_t objects, created in aggr_grp_init(). */
static kmem_cache_t *aggr_grp_cache;
/* id hash of the link aggregation groups, keyed by linkid. */
static mod_hash_t *aggr_grp_hash;
/* Held as reader by aggr_grp_count() while it samples aggr_grp_cnt. */
static krwlock_t aggr_grp_lock;
/* Value returned by aggr_grp_count(); reset to 0 in aggr_grp_init(). */
static uint_t aggr_grp_cnt;
/* id space for key values, from AGGR_MAX_KEY + 1 to UINT16_MAX. */
static id_space_t *key_ids;
172
/* Size handed to mod_hash_create_idhash() when aggr_grp_hash is created. */
#define	GRP_HASHSZ		64
/*
 * Convert a linkid into a mod_hash key. The argument is parenthesized so
 * that the casts apply to the whole expression the caller passes, not
 * just to its first operand.
 */
#define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)(linkid))
/* Delimiter character for port names; its users are outside this section. */
#define	AGGR_PORT_NAME_DELIMIT	'-'
176
/*
 * Six zero bytes, i.e. an all-zero Ethernet-sized address.
 * NOTE(review): presumably used to detect an unset MAC address; its
 * users are outside this section -- confirm.
 */
static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
178
/*
 * Callback flag bits stored in the first slot of aggr_m_callbacks. The
 * bits named here line up with the ioctl, getcapab, setprop and propinfo
 * handlers supplied in that table.
 */
#define	AGGR_M_CALLBACK_FLAGS	\
	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)

/*
 * MAC callback vector for the aggr. The initializer is positional, so
 * the order of the entries, including the NULL placeholders for the
 * callbacks that are not supplied, must be preserved exactly.
 */
static mac_callbacks_t aggr_m_callbacks = {
	AGGR_M_CALLBACK_FLAGS,
	aggr_m_stat,
	aggr_m_start,
	aggr_m_stop,
	aggr_m_promisc,
	aggr_m_multicst,
	NULL,
	NULL,
	NULL,
	aggr_m_ioctl,
	aggr_m_capab_get,
	NULL,
	NULL,
	aggr_m_setprop,
	NULL,
	aggr_m_propinfo
};
200
201 /*ARGSUSED*/
202 static int
aggr_grp_constructor(void * buf,void * arg,int kmflag)203 aggr_grp_constructor(void *buf, void *arg, int kmflag)
204 {
205 aggr_grp_t *grp = buf;
206
207 bzero(grp, sizeof (*grp));
208 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
209 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
210 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
211 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
212 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
213 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
214 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
215 grp->lg_link_state = LINK_STATE_UNKNOWN;
216 return (0);
217 }
218
219 /*ARGSUSED*/
220 static void
aggr_grp_destructor(void * buf,void * arg)221 aggr_grp_destructor(void *buf, void *arg)
222 {
223 aggr_grp_t *grp = buf;
224
225 if (grp->lg_tx_ports != NULL) {
226 kmem_free(grp->lg_tx_ports,
227 grp->lg_tx_ports_size * sizeof (aggr_port_t *));
228 }
229
230 mutex_destroy(&grp->lg_lacp_lock);
231 cv_destroy(&grp->lg_lacp_cv);
232 mutex_destroy(&grp->lg_port_lock);
233 cv_destroy(&grp->lg_port_cv);
234 rw_destroy(&grp->lg_tx_lock);
235 mutex_destroy(&grp->lg_tx_flowctl_lock);
236 cv_destroy(&grp->lg_tx_flowctl_cv);
237 }
238
239 void
aggr_grp_init(void)240 aggr_grp_init(void)
241 {
242 aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
243 sizeof (aggr_grp_t), 0, aggr_grp_constructor,
244 aggr_grp_destructor, NULL, NULL, NULL, 0);
245
246 aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
247 GRP_HASHSZ, mod_hash_null_valdtor);
248 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
249 aggr_grp_cnt = 0;
250
251 /*
252 * Allocate an id space to manage key values (when key is not
253 * specified). The range of the id space will be from
254 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
255 * uses a 16-bit key.
256 */
257 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
258 ASSERT(key_ids != NULL);
259 }
260
261 void
aggr_grp_fini(void)262 aggr_grp_fini(void)
263 {
264 id_space_destroy(key_ids);
265 rw_destroy(&aggr_grp_lock);
266 mod_hash_destroy_idhash(aggr_grp_hash);
267 kmem_cache_destroy(aggr_grp_cache);
268 }
269
270 uint_t
aggr_grp_count(void)271 aggr_grp_count(void)
272 {
273 uint_t count;
274
275 rw_enter(&aggr_grp_lock, RW_READER);
276 count = aggr_grp_cnt;
277 rw_exit(&aggr_grp_lock);
278 return (count);
279 }
280
281 /*
282 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
283 * requires the mac perimeter, this function holds a reference of the aggr
284 * and aggr won't call mac_unregister() until this reference drops to 0.
285 */
286 void
aggr_grp_port_hold(aggr_port_t * port)287 aggr_grp_port_hold(aggr_port_t *port)
288 {
289 aggr_grp_t *grp = port->lp_grp;
290
291 AGGR_PORT_REFHOLD(port);
292 mutex_enter(&grp->lg_port_lock);
293 grp->lg_port_ref++;
294 mutex_exit(&grp->lg_port_lock);
295 }
296
297 /*
298 * Release the reference of the grp and inform aggr_grp_delete() calling
299 * mac_unregister() is now safe.
300 */
301 void
aggr_grp_port_rele(aggr_port_t * port)302 aggr_grp_port_rele(aggr_port_t *port)
303 {
304 aggr_grp_t *grp = port->lp_grp;
305
306 mutex_enter(&grp->lg_port_lock);
307 if (--grp->lg_port_ref == 0)
308 cv_signal(&grp->lg_port_cv);
309 mutex_exit(&grp->lg_port_lock);
310 AGGR_PORT_REFRELE(port);
311 }
312
313 /*
314 * Wait for the port's lacp timer thread and the port's notification callback
315 * to exit.
316 */
317 void
aggr_grp_port_wait(aggr_grp_t * grp)318 aggr_grp_port_wait(aggr_grp_t *grp)
319 {
320 mutex_enter(&grp->lg_port_lock);
321 if (grp->lg_port_ref != 0)
322 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
323 mutex_exit(&grp->lg_port_lock);
324 }
325
326 /*
327 * Attach a port to a link aggregation group.
328 *
329 * A port is attached to a link aggregation group once its speed
330 * and link state have been verified.
331 *
332 * Returns B_TRUE if the group link state or speed has changed. If
333 * it's the case, the caller must notify the MAC layer via a call
334 * to mac_link().
335 */
336 boolean_t
aggr_grp_attach_port(aggr_grp_t * grp,aggr_port_t * port)337 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
338 {
339 boolean_t link_state_changed = B_FALSE;
340
341 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
342 ASSERT(MAC_PERIM_HELD(port->lp_mh));
343
344 if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
345 return (B_FALSE);
346
347 /*
348 * Validate the MAC port link speed and update the group
349 * link speed if needed.
350 */
351 if (port->lp_ifspeed == 0 ||
352 port->lp_link_state != LINK_STATE_UP ||
353 port->lp_link_duplex != LINK_DUPLEX_FULL) {
354 /*
355 * Can't attach a MAC port with unknown link speed,
356 * down link, or not in full duplex mode.
357 */
358 return (B_FALSE);
359 }
360
361 mutex_enter(&grp->lg_stat_lock);
362 if (grp->lg_ifspeed == 0) {
363 /*
364 * The group inherits the speed of the first link being
365 * attached.
366 */
367 grp->lg_ifspeed = port->lp_ifspeed;
368 link_state_changed = B_TRUE;
369 } else if (grp->lg_ifspeed != port->lp_ifspeed) {
370 /*
371 * The link speed of the MAC port must be the same as
372 * the group link speed, as per 802.3ad. Since it is
373 * not, the attach is cancelled.
374 */
375 mutex_exit(&grp->lg_stat_lock);
376 return (B_FALSE);
377 }
378 mutex_exit(&grp->lg_stat_lock);
379
380 grp->lg_nattached_ports++;
381
382 /*
383 * Update the group link state.
384 */
385 if (grp->lg_link_state != LINK_STATE_UP) {
386 grp->lg_link_state = LINK_STATE_UP;
387 mutex_enter(&grp->lg_stat_lock);
388 grp->lg_link_duplex = LINK_DUPLEX_FULL;
389 mutex_exit(&grp->lg_stat_lock);
390 link_state_changed = B_TRUE;
391 }
392
393 /*
394 * Update port's state.
395 */
396 port->lp_state = AGGR_PORT_STATE_ATTACHED;
397
398 aggr_grp_multicst_port(port, B_TRUE);
399
400 /*
401 * The port client doesn't have an Rx SRS; instead of calling
402 * mac_rx_set() we set the client's flow callback directly.
403 * This datapath is used only when the port's driver doesn't
404 * support MAC_CAPAB_RINGS. Drivers with ring support will
405 * deliver traffic to the aggr via ring passthru.
406 */
407 mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
408
409 /*
410 * If LACP is OFF, the port can be used to send data as soon
411 * as its link is up and verified to be compatible with the
412 * aggregation.
413 *
414 * If LACP is active or passive, notify the LACP subsystem, which
415 * will enable sending on the port following the LACP protocol.
416 */
417 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
418 aggr_send_port_enable(port);
419 else
420 aggr_lacp_port_attached(port);
421
422 return (link_state_changed);
423 }
424
425 boolean_t
aggr_grp_detach_port(aggr_grp_t * grp,aggr_port_t * port)426 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
427 {
428 boolean_t link_state_changed = B_FALSE;
429
430 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
431 ASSERT(MAC_PERIM_HELD(port->lp_mh));
432
433 /* update state */
434 if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
435 return (B_FALSE);
436
437 mac_client_clear_flow_cb(port->lp_mch);
438
439 aggr_grp_multicst_port(port, B_FALSE);
440
441 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
442 aggr_send_port_disable(port);
443 else
444 aggr_lacp_port_detached(port);
445
446 port->lp_state = AGGR_PORT_STATE_STANDBY;
447
448 grp->lg_nattached_ports--;
449 if (grp->lg_nattached_ports == 0) {
450 /* the last attached MAC port of the group is being detached */
451 grp->lg_link_state = LINK_STATE_DOWN;
452 mutex_enter(&grp->lg_stat_lock);
453 grp->lg_ifspeed = 0;
454 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
455 mutex_exit(&grp->lg_stat_lock);
456 link_state_changed = B_TRUE;
457 }
458
459 return (link_state_changed);
460 }
461
462 /*
463 * Update the MAC addresses of the constituent ports of the specified
464 * group. This function is invoked:
465 * - after creating a new aggregation group.
466 * - after adding new ports to an aggregation group.
467 * - after removing a port from a group when the MAC address of
468 * that port was used for the MAC address of the group.
469 * - after the MAC address of a port changed when the MAC address
470 * of that port was used for the MAC address of the group.
471 *
472 * Return true if the link state of the aggregation changed, for example
473 * as a result of a failure changing the MAC address of one of the
474 * constituent ports.
475 */
476 boolean_t
aggr_grp_update_ports_mac(aggr_grp_t * grp)477 aggr_grp_update_ports_mac(aggr_grp_t *grp)
478 {
479 aggr_port_t *cport;
480 boolean_t link_state_changed = B_FALSE;
481 mac_perim_handle_t mph;
482
483 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
484
485 for (cport = grp->lg_ports; cport != NULL;
486 cport = cport->lp_next) {
487 mac_perim_enter_by_mh(cport->lp_mh, &mph);
488 if (aggr_port_unicst(cport) != 0) {
489 if (aggr_grp_detach_port(grp, cport))
490 link_state_changed = B_TRUE;
491 } else {
492 /*
493 * If a port was detached because of a previous
494 * failure changing the MAC address, the port is
495 * reattached when it successfully changes the MAC
496 * address now, and this might cause the link state
497 * of the aggregation to change.
498 */
499 if (aggr_grp_attach_port(grp, cport))
500 link_state_changed = B_TRUE;
501 }
502 mac_perim_exit(mph);
503 }
504 return (link_state_changed);
505 }
506
507 /*
508 * Invoked when the MAC address of a port has changed. If the port's
509 * MAC address was used for the group MAC address, set mac_addr_changedp
510 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
511 * notification. If the link state changes due to detach/attach of
512 * the constituent port, set link_state_changedp to B_TRUE to indicate
513 * to the caller that it should send a MAC_NOTE_LINK notification. In both
514 * cases, it is the responsibility of the caller to invoke notification
515 * functions after releasing the the port lock.
516 */
517 void
aggr_grp_port_mac_changed(aggr_grp_t * grp,aggr_port_t * port,boolean_t * mac_addr_changedp,boolean_t * link_state_changedp)518 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
519 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
520 {
521 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
522 ASSERT(MAC_PERIM_HELD(port->lp_mh));
523 ASSERT(mac_addr_changedp != NULL);
524 ASSERT(link_state_changedp != NULL);
525
526 *mac_addr_changedp = B_FALSE;
527 *link_state_changedp = B_FALSE;
528
529 if (grp->lg_addr_fixed) {
530 /*
531 * The group is using a fixed MAC address or an automatic
532 * MAC address has not been set.
533 */
534 return;
535 }
536
537 if (grp->lg_mac_addr_port == port) {
538 /*
539 * The MAC address of the port was assigned to the group
540 * MAC address. Update the group MAC address.
541 */
542 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
543 *mac_addr_changedp = B_TRUE;
544 } else {
545 /*
546 * Update the actual port MAC address to the MAC address
547 * of the group.
548 */
549 if (aggr_port_unicst(port) != 0) {
550 *link_state_changedp = aggr_grp_detach_port(grp, port);
551 } else {
552 /*
553 * If a port was detached because of a previous
554 * failure changing the MAC address, the port is
555 * reattached when it successfully changes the MAC
556 * address now, and this might cause the link state
557 * of the aggregation to change.
558 */
559 *link_state_changedp = aggr_grp_attach_port(grp, port);
560 }
561 }
562 }
563
564 /*
565 * Add a port to a link aggregation group.
566 */
567 static int
aggr_grp_add_port(aggr_grp_t * grp,datalink_id_t port_linkid,boolean_t force,aggr_port_t ** pp)568 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
569 aggr_port_t **pp)
570 {
571 aggr_port_t *port, **cport;
572 mac_perim_handle_t mph;
573 zoneid_t port_zoneid = ALL_ZONES;
574 int err;
575
576 /* The port must be in the same zone as the aggregation. */
577 if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
578 port_zoneid = GLOBAL_ZONEID;
579 if (grp->lg_zoneid != port_zoneid)
580 return (EBUSY);
581
582 /*
583 * If we are creating the aggr, then there is no MAC handle
584 * and thus no perimeter to hold. If we are adding a port to
585 * an existing aggr, then the perimiter of the aggr's MAC must
586 * be held.
587 */
588 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
589
590 err = aggr_port_create(grp, port_linkid, force, &port);
591 if (err != 0)
592 return (err);
593
594 mac_perim_enter_by_mh(port->lp_mh, &mph);
595
596 /* Add the new port to the end of the list. */
597 cport = &grp->lg_ports;
598 while (*cport != NULL)
599 cport = &((*cport)->lp_next);
600 *cport = port;
601
602 /*
603 * Back reference to the group it is member of. A port always
604 * holds a reference to its group to ensure that the back
605 * reference is always valid.
606 */
607 port->lp_grp = grp;
608 AGGR_GRP_REFHOLD(grp);
609 grp->lg_nports++;
610 if (grp->lg_nports > grp->lg_nports_high)
611 grp->lg_nports_high = grp->lg_nports;
612
613 aggr_lacp_init_port(port);
614 mac_perim_exit(mph);
615
616 if (pp != NULL)
617 *pp = port;
618
619 return (0);
620 }
621
622 /*
623 * This is called when the 'lg_tx_ports' arrangement has changed and
624 * we need to update the corresponding 'mi_default_tx_ring'. This
625 * happens for several reasons.
626 *
627 * - A pseudo TX mac group was added or removed.
628 * - An LACP message has changed the port's state.
629 * - A link event has changed the port's state.
630 *
631 * In any case, we see if there is at least one port enabled (see
632 * 'aggr_send_port_enable()'), and if so we use its first ring as the
633 * mac's default TX ring.
634 *
635 * Note, because we only have a single TX group, we don't have to
636 * worry about the rings moving between groups and the chance that mac
637 * will reassign it unless someone removes a port, at which point, we
638 * play it safe and call this again.
639 */
640 void
aggr_grp_update_default(aggr_grp_t * grp)641 aggr_grp_update_default(aggr_grp_t *grp)
642 {
643 aggr_port_t *port;
644 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
645
646 rw_enter(&grp->lg_tx_lock, RW_WRITER);
647
648 if (grp->lg_ntx_ports == 0) {
649 rw_exit(&grp->lg_tx_lock);
650 return;
651 }
652
653 port = grp->lg_tx_ports[0];
654 ASSERT(port->lp_tx_ring_cnt > 0);
655 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
656 rw_exit(&grp->lg_tx_lock);
657 }
658
659 /*
660 * Add a pseudo RX ring for the given HW ring handle.
661 */
662 static int
aggr_add_pseudo_rx_ring(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp,mac_ring_handle_t hw_rh)663 aggr_add_pseudo_rx_ring(aggr_port_t *port,
664 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
665 {
666 aggr_pseudo_rx_ring_t *ring;
667 int err;
668 int j;
669
670 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
671 ring = rx_grp->arg_rings + j;
672 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
673 break;
674 }
675
676 /*
677 * No slot for this new RX ring.
678 */
679 if (j == MAX_RINGS_PER_GROUP)
680 return (ENOSPC);
681
682 ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
683 ring->arr_hw_rh = hw_rh;
684 ring->arr_port = port;
685 ring->arr_grp = rx_grp;
686 rx_grp->arg_ring_cnt++;
687
688 /*
689 * The group is already registered, dynamically add a new ring to the
690 * mac group.
691 */
692 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
693 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
694 ring->arr_hw_rh = NULL;
695 ring->arr_port = NULL;
696 ring->arr_grp = NULL;
697 rx_grp->arg_ring_cnt--;
698 } else {
699 /*
700 * This must run after the MAC is registered.
701 */
702 ASSERT3P(ring->arr_rh, !=, NULL);
703 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
704 (void *)port, (mac_resource_handle_t)ring);
705 }
706 return (err);
707 }
708
709 /*
710 * Remove the pseudo RX ring of the given HW ring handle.
711 */
712 static void
aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t * rx_grp,mac_ring_handle_t hw_rh)713 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
714 {
715 for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) {
716 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j;
717
718 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
719 ring->arr_hw_rh != hw_rh) {
720 continue;
721 }
722
723 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
724
725 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
726 ring->arr_hw_rh = NULL;
727 ring->arr_port = NULL;
728 ring->arr_grp = NULL;
729 rx_grp->arg_ring_cnt--;
730 mac_hwring_clear_passthru(hw_rh);
731 break;
732 }
733 }
734
735 /*
736 * Create pseudo rings over the HW rings of the port.
737 *
738 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
739 *
740 * o Program existing unicast filters on the pseudo group into the HW group.
741 *
742 * o Program existing VLAN filters on the pseudo group into the HW group.
743 */
744 static int
aggr_add_pseudo_rx_group(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp)745 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
746 {
747 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
748 aggr_unicst_addr_t *addr, *a;
749 mac_perim_handle_t pmph;
750 aggr_vlan_t *avp;
751 uint_t hw_rh_cnt, i;
752 int err = 0;
753 uint_t g_idx = rx_grp->arg_index;
754
755 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
756 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
757 mac_perim_enter_by_mh(port->lp_mh, &pmph);
758
759 i = 0;
760 addr = NULL;
761 /*
762 * This function must be called after the aggr registers its
763 * MAC and its Rx groups have been initialized.
764 */
765 ASSERT(rx_grp->arg_gh != NULL);
766
767 /*
768 * Get the list of the underlying HW rings.
769 */
770 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx,
771 &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX);
772
773 /*
774 * Add existing VLAN and unicast address filters to the port.
775 */
776 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
777 avp = list_next(&rx_grp->arg_vlans, avp)) {
778 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0)
779 goto err;
780 }
781
782 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
783 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0)
784 goto err;
785 }
786
787 for (i = 0; i < hw_rh_cnt; i++) {
788 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
789 if (err != 0)
790 goto err;
791 }
792
793 mac_perim_exit(pmph);
794 return (0);
795
796 err:
797 ASSERT(err != 0);
798
799 for (uint_t j = 0; j < i; j++)
800 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
801
802 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
803 aggr_port_remmac(port, g_idx, a->aua_addr);
804
805 if (avp != NULL)
806 avp = list_prev(&rx_grp->arg_vlans, avp);
807
808 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
809 int err2;
810
811 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
812 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
813 ": errno %d.", avp->av_vid,
814 mac_client_name(port->lp_mch), err2);
815 }
816 }
817
818 port->lp_hwghs[g_idx] = NULL;
819 mac_perim_exit(pmph);
820 return (err);
821 }
822
823 /*
824 * Destroy the pseudo rings mapping to this port and remove all VLAN
825 * and unicast filters from this port. Even if there are no underlying
826 * HW rings we must still remove the unicast filters to take the port
827 * out of promisc mode.
828 */
829 static void
aggr_rem_pseudo_rx_group(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp)830 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
831 {
832 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
833 aggr_unicst_addr_t *addr;
834 mac_perim_handle_t pmph;
835 uint_t hw_rh_cnt;
836 uint_t g_idx = rx_grp->arg_index;
837
838 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
839 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
840 ASSERT3P(rx_grp->arg_gh, !=, NULL);
841 mac_perim_enter_by_mh(port->lp_mh, &pmph);
842
843 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh,
844 MAC_RING_TYPE_RX);
845
846 for (uint_t i = 0; i < hw_rh_cnt; i++)
847 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
848
849 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
850 aggr_port_remmac(port, g_idx, addr->aua_addr);
851
852 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
853 avp = list_next(&rx_grp->arg_vlans, avp)) {
854 int err;
855
856 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
857 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
858 ": errno %d.", avp->av_vid,
859 mac_client_name(port->lp_mch), err);
860 }
861 }
862
863 port->lp_hwghs[g_idx] = NULL;
864 mac_perim_exit(pmph);
865 }
866
867 /*
868 * Add a pseudo TX ring for the given HW ring handle.
869 */
870 static int
aggr_add_pseudo_tx_ring(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp,mac_ring_handle_t hw_rh,mac_ring_handle_t * pseudo_rh)871 aggr_add_pseudo_tx_ring(aggr_port_t *port,
872 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
873 mac_ring_handle_t *pseudo_rh)
874 {
875 aggr_pseudo_tx_ring_t *ring;
876 int err;
877 int i;
878
879 ASSERT(MAC_PERIM_HELD(port->lp_mh));
880 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
881 ring = tx_grp->atg_rings + i;
882 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
883 break;
884 }
885 /*
886 * No slot for this new TX ring.
887 */
888 if (i == MAX_RINGS_PER_GROUP)
889 return (ENOSPC);
890 /*
891 * The following 4 statements needs to be done before
892 * calling mac_group_add_ring(). Otherwise it will
893 * result in an assertion failure in mac_init_ring().
894 */
895 ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
896 ring->atr_hw_rh = hw_rh;
897 ring->atr_port = port;
898 tx_grp->atg_ring_cnt++;
899
900 /*
901 * The TX side has no concept of ring groups unlike RX groups.
902 * There is just a single group which stores all the TX rings.
903 * This group will be used to store aggr's pseudo TX rings.
904 */
905 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
906 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
907 ring->atr_hw_rh = NULL;
908 ring->atr_port = NULL;
909 tx_grp->atg_ring_cnt--;
910 } else {
911 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
912 if (hw_rh != NULL) {
913 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
914 mac_find_ring(tx_grp->atg_gh, i));
915 }
916 }
917
918 return (err);
919 }
920
921 /*
922 * Remove the pseudo TX ring of the given HW ring handle.
923 */
924 static void
aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t * tx_grp,mac_ring_handle_t pseudo_hw_rh)925 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
926 mac_ring_handle_t pseudo_hw_rh)
927 {
928 aggr_pseudo_tx_ring_t *ring;
929 int i;
930
931 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
932 ring = tx_grp->atg_rings + i;
933 if (ring->atr_rh != pseudo_hw_rh)
934 continue;
935
936 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
937 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
938 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
939 mac_hwring_teardown(ring->atr_hw_rh);
940 ring->atr_hw_rh = NULL;
941 ring->atr_port = NULL;
942 tx_grp->atg_ring_cnt--;
943 break;
944 }
945 }
946
947 /*
948 * This function is called to create pseudo rings over hardware rings of
949 * the underlying device. There is a 1:1 mapping between the pseudo TX
950 * rings of the aggr and the hardware rings of the underlying port.
951 */
952 static int
aggr_add_pseudo_tx_group(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp,uint_t limit)953 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp,
954 uint_t limit)
955 {
956 aggr_grp_t *grp = port->lp_grp;
957 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
958 mac_perim_handle_t pmph;
959 int hw_rh_cnt, i = 0, j;
960 int err = 0;
961
962 if (limit == 0)
963 return (ENOSPC);
964
965 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
966 mac_perim_enter_by_mh(port->lp_mh, &pmph);
967
968 /*
969 * Get the list the the underlying HW rings.
970 */
971 hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh,
972 MAC_RING_TYPE_TX);
973
974 /*
975 * Even if the underlying NIC does not have TX rings, we
976 * still make a psuedo TX ring for that NIC with NULL as
977 * the ring handle.
978 */
979 if (hw_rh_cnt == 0)
980 port->lp_tx_ring_cnt = 1;
981 else
982 port->lp_tx_ring_cnt = MIN(hw_rh_cnt, limit);
983
984 port->lp_tx_ring_alloc = port->lp_tx_ring_cnt;
985 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
986 port->lp_tx_ring_alloc), KM_SLEEP);
987 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
988 port->lp_tx_ring_alloc), KM_SLEEP);
989
990 if (hw_rh_cnt == 0) {
991 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
992 NULL, &pseudo_rh)) == 0) {
993 port->lp_tx_rings[0] = NULL;
994 port->lp_pseudo_tx_rings[0] = pseudo_rh;
995 }
996 } else {
997 for (i = 0; err == 0 && i < port->lp_tx_ring_cnt; i++) {
998 err = aggr_add_pseudo_tx_ring(port,
999 tx_grp, hw_rh[i], &pseudo_rh);
1000 if (err != 0)
1001 break;
1002 port->lp_tx_rings[i] = hw_rh[i];
1003 port->lp_pseudo_tx_rings[i] = pseudo_rh;
1004 }
1005 }
1006
1007 if (err != 0) {
1008 if (hw_rh_cnt != 0) {
1009 for (j = 0; j < i; j++) {
1010 aggr_rem_pseudo_tx_ring(tx_grp,
1011 port->lp_pseudo_tx_rings[j]);
1012 }
1013 }
1014 kmem_free(port->lp_tx_rings,
1015 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1016 kmem_free(port->lp_pseudo_tx_rings,
1017 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1018 port->lp_tx_ring_cnt = 0;
1019 port->lp_tx_ring_alloc = 0;
1020 } else {
1021 port->lp_tx_grp_added = B_TRUE;
1022 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
1023 aggr_tx_ring_update, port);
1024 }
1025 mac_perim_exit(pmph);
1026 aggr_grp_update_default(grp);
1027 return (err);
1028 }
1029
1030 /*
1031 * This function is called by aggr to remove pseudo TX rings over the
1032 * HW rings of the underlying port.
1033 */
1034 static void
aggr_rem_pseudo_tx_group(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp)1035 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
1036 {
1037 aggr_grp_t *grp = port->lp_grp;
1038 mac_perim_handle_t pmph;
1039 int i;
1040
1041 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1042 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1043
1044 if (!port->lp_tx_grp_added)
1045 goto done;
1046
1047 ASSERT(tx_grp->atg_gh != NULL);
1048
1049 for (i = 0; i < port->lp_tx_ring_cnt; i++)
1050 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
1051
1052 kmem_free(port->lp_tx_rings,
1053 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1054 kmem_free(port->lp_pseudo_tx_rings,
1055 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1056
1057 port->lp_tx_ring_cnt = 0;
1058 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
1059 port->lp_tx_grp_added = B_FALSE;
1060 aggr_grp_update_default(grp);
1061 done:
1062 mac_perim_exit(pmph);
1063 }
1064
1065 static int
aggr_pseudo_disable_intr(mac_intr_handle_t ih)1066 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1067 {
1068 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1069 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1070 }
1071
1072 static int
aggr_pseudo_enable_intr(mac_intr_handle_t ih)1073 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1074 {
1075 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1076 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1077 }
1078
1079 /*
1080 * Start the pseudo ring. Since the pseudo ring is just an abstraction
1081 * over an actual HW ring, the real task is to start the underlying HW
1082 * ring.
1083 */
1084 static int
aggr_pseudo_start_rx_ring(mac_ring_driver_t arg,uint64_t mr_gen)1085 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1086 {
1087 int err;
1088 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1089
1090 err = mac_hwring_start(rr_ring->arr_hw_rh);
1091
1092 if (err != 0)
1093 return (err);
1094
1095 rr_ring->arr_gen = mr_gen;
1096 return (err);
1097 }
1098
1099 /*
1100 * Stop the pseudo ring. Since the pseudo ring is just an abstraction
1101 * over an actual HW ring, the real task is to stop the underlying HW
1102 * ring.
1103 */
1104 static void
aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)1105 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)
1106 {
1107 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1108
1109 /*
1110 * The rings underlying the default group must stay up to
1111 * continue receiving LACP traffic. We would normally never
1112 * stop the default Rx rings because of the primary MAC
1113 * client; but aggr's primary MAC client doesn't call
1114 * mac_unicast_add() and thus mi_active is 0 when the last
1115 * non-primary client is deleted.
1116 */
1117 if (rr_ring->arr_grp->arg_index != 0)
1118 mac_hwring_stop(rr_ring->arr_hw_rh);
1119 }
1120
1121 /*
1122 * Trim each port in a group to ensure it uses no more than tx_ring_limit
1123 * rings.
1124 */
1125 static void
aggr_grp_balance_tx(aggr_grp_t * grp,uint_t tx_ring_limit)1126 aggr_grp_balance_tx(aggr_grp_t *grp, uint_t tx_ring_limit)
1127 {
1128 aggr_port_t *port;
1129 mac_perim_handle_t mph;
1130 uint_t i, tx_ring_cnt;
1131
1132 ASSERT(tx_ring_limit > 0);
1133 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1134
1135 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1136 mac_perim_enter_by_mh(port->lp_mh, &mph);
1137
1138 /*
1139 * Reduce the Tx ring count first to prevent rings being
1140 * used as they are removed.
1141 */
1142 rw_enter(&grp->lg_tx_lock, RW_WRITER);
1143 if (port->lp_tx_ring_cnt <= tx_ring_limit) {
1144 rw_exit(&grp->lg_tx_lock);
1145 mac_perim_exit(mph);
1146 continue;
1147 }
1148
1149 tx_ring_cnt = port->lp_tx_ring_cnt;
1150 port->lp_tx_ring_cnt = tx_ring_limit;
1151 rw_exit(&grp->lg_tx_lock);
1152
1153 for (i = tx_ring_cnt - 1; i >= tx_ring_limit; i--) {
1154 aggr_rem_pseudo_tx_ring(&grp->lg_tx_group,
1155 port->lp_pseudo_tx_rings[i]);
1156
1157 }
1158
1159 mac_perim_exit(mph);
1160 }
1161 }
1162
1163 /*
1164 * Add one or more ports to an existing link aggregation group.
1165 */
1166 int
aggr_grp_add_ports(datalink_id_t linkid,uint_t nports,boolean_t force,laioc_port_t * ports)1167 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1168 laioc_port_t *ports)
1169 {
1170 int rc;
1171 uint_t port_added = 0;
1172 uint_t grp_added;
1173 uint_t nports_high, tx_ring_limit;
1174 aggr_grp_t *grp = NULL;
1175 aggr_port_t *port;
1176 boolean_t link_state_changed = B_FALSE;
1177 mac_perim_handle_t mph, pmph;
1178
1179 /* Get the aggr corresponding to linkid. */
1180 rw_enter(&aggr_grp_lock, RW_READER);
1181 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1182 (mod_hash_val_t *)&grp) != 0) {
1183 rw_exit(&aggr_grp_lock);
1184 return (ENOENT);
1185 }
1186 AGGR_GRP_REFHOLD(grp);
1187
1188 /*
1189 * Hold the perimeter so that the aggregation can't be destroyed.
1190 */
1191 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1192 rw_exit(&aggr_grp_lock);
1193
1194 /*
1195 * Limit the number of Tx rings per port. When determining the
1196 * number of ports take into consideration the existing high
1197 * value, and what the new high value may be after this request.
1198 */
1199 nports_high = MAX(grp->lg_nports_high, grp->lg_nports + nports);
1200 tx_ring_limit = MAX_RINGS_PER_GROUP / nports_high;
1201
1202 if (tx_ring_limit == 0) {
1203 rc = ENOSPC;
1204 goto bail;
1205 }
1206
1207 /*
1208 * Balance the Tx rings so each port has a fair share of rings.
1209 */
1210 aggr_grp_balance_tx(grp, tx_ring_limit);
1211
1212 /* Add the specified ports to the aggr. */
1213 for (uint_t i = 0; i < nports; i++) {
1214 grp_added = 0;
1215
1216 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1217 force, &port)) != 0) {
1218 goto bail;
1219 }
1220
1221 ASSERT(port != NULL);
1222 port_added++;
1223
1224 /* check capabilities */
1225 if (!aggr_grp_capab_check(grp, port) ||
1226 !aggr_grp_sdu_check(grp, port) ||
1227 !aggr_grp_margin_check(grp, port)) {
1228 rc = ENOTSUP;
1229 goto bail;
1230 }
1231
1232 /*
1233 * Create the pseudo ring for each HW ring of the underlying
1234 * port.
1235 */
1236 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group,
1237 tx_ring_limit);
1238 if (rc != 0)
1239 goto bail;
1240
1241 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) {
1242 rc = aggr_add_pseudo_rx_group(port,
1243 &grp->lg_rx_groups[j]);
1244
1245 if (rc != 0)
1246 goto bail;
1247
1248 grp_added++;
1249 }
1250
1251 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1252
1253 /* set LACP mode */
1254 aggr_port_lacp_set_mode(grp, port);
1255
1256 /* start port if group has already been started */
1257 if (grp->lg_started) {
1258 rc = aggr_port_start(port);
1259 if (rc != 0) {
1260 mac_perim_exit(pmph);
1261 goto bail;
1262 }
1263
1264 /*
1265 * Turn on the promiscuous mode over the port when it
1266 * is requested to be turned on to receive the
1267 * non-primary address over a port, or the promiscuous
1268 * mode is enabled over the aggr.
1269 */
1270 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1271 rc = aggr_port_promisc(port, B_TRUE);
1272 if (rc != 0) {
1273 mac_perim_exit(pmph);
1274 goto bail;
1275 }
1276 }
1277 }
1278 mac_perim_exit(pmph);
1279
1280 /*
1281 * Attach each port if necessary.
1282 */
1283 if (aggr_port_notify_link(grp, port))
1284 link_state_changed = B_TRUE;
1285
1286 /*
1287 * Initialize the callback functions for this port.
1288 */
1289 aggr_port_init_callbacks(port);
1290 }
1291
1292 /* update the MAC address of the constituent ports */
1293 if (aggr_grp_update_ports_mac(grp))
1294 link_state_changed = B_TRUE;
1295
1296 if (link_state_changed)
1297 mac_link_update(grp->lg_mh, grp->lg_link_state);
1298
1299 bail:
1300 if (rc != 0) {
1301 /* stop and remove ports that have been added */
1302 for (uint_t i = 0; i < port_added; i++) {
1303 uint_t grp_remove;
1304
1305 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1306 ASSERT(port != NULL);
1307
1308 if (grp->lg_started) {
1309 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1310 (void) aggr_port_promisc(port, B_FALSE);
1311 aggr_port_stop(port);
1312 mac_perim_exit(pmph);
1313 }
1314
1315 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1316
1317 /*
1318 * Only the last port could have a partial set
1319 * of groups added.
1320 */
1321 grp_remove = (i + 1 == port_added) ? grp_added :
1322 grp->lg_rx_group_count;
1323
1324 for (uint_t j = 0; j < grp_remove; j++) {
1325 aggr_rem_pseudo_rx_group(port,
1326 &grp->lg_rx_groups[j]);
1327 }
1328
1329 (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1330 }
1331 }
1332
1333 mac_perim_exit(mph);
1334 AGGR_GRP_REFRELE(grp);
1335 return (rc);
1336 }
1337
1338 static int
aggr_grp_modify_common(aggr_grp_t * grp,uint8_t update_mask,uint32_t policy,boolean_t mac_fixed,const uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer)1339 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1340 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1341 aggr_lacp_timer_t lacp_timer)
1342 {
1343 boolean_t mac_addr_changed = B_FALSE;
1344 boolean_t link_state_changed = B_FALSE;
1345 mac_perim_handle_t pmph;
1346
1347 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1348
1349 /* validate fixed address if specified */
1350 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1351 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1352 (mac_addr[0] & 0x01))) {
1353 return (EINVAL);
1354 }
1355
1356 /* update policy if requested */
1357 if (update_mask & AGGR_MODIFY_POLICY)
1358 aggr_send_update_policy(grp, policy);
1359
1360 /* update unicast MAC address if requested */
1361 if (update_mask & AGGR_MODIFY_MAC) {
1362 if (mac_fixed) {
1363 /* user-supplied MAC address */
1364 grp->lg_mac_addr_port = NULL;
1365 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1366 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1367 mac_addr_changed = B_TRUE;
1368 }
1369 } else if (grp->lg_addr_fixed) {
1370 /* switch from user-supplied to automatic */
1371 aggr_port_t *port = grp->lg_ports;
1372
1373 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1374 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1375 grp->lg_mac_addr_port = port;
1376 mac_addr_changed = B_TRUE;
1377 mac_perim_exit(pmph);
1378 }
1379 grp->lg_addr_fixed = mac_fixed;
1380 }
1381
1382 if (mac_addr_changed)
1383 link_state_changed = aggr_grp_update_ports_mac(grp);
1384
1385 if (update_mask & AGGR_MODIFY_LACP_MODE)
1386 aggr_lacp_update_mode(grp, lacp_mode);
1387
1388 if (update_mask & AGGR_MODIFY_LACP_TIMER)
1389 aggr_lacp_update_timer(grp, lacp_timer);
1390
1391 if (link_state_changed)
1392 mac_link_update(grp->lg_mh, grp->lg_link_state);
1393
1394 if (mac_addr_changed)
1395 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1396
1397 return (0);
1398 }
1399
1400 /*
1401 * Update properties of an existing link aggregation group.
1402 */
1403 int
aggr_grp_modify(datalink_id_t linkid,uint8_t update_mask,uint32_t policy,boolean_t mac_fixed,const uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer)1404 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1405 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1406 aggr_lacp_timer_t lacp_timer)
1407 {
1408 aggr_grp_t *grp = NULL;
1409 mac_perim_handle_t mph;
1410 int err;
1411
1412 /* get group corresponding to linkid */
1413 rw_enter(&aggr_grp_lock, RW_READER);
1414 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1415 (mod_hash_val_t *)&grp) != 0) {
1416 rw_exit(&aggr_grp_lock);
1417 return (ENOENT);
1418 }
1419 AGGR_GRP_REFHOLD(grp);
1420
1421 /*
1422 * Hold the perimeter so that the aggregation won't be destroyed.
1423 */
1424 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1425 rw_exit(&aggr_grp_lock);
1426
1427 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1428 mac_addr, lacp_mode, lacp_timer);
1429
1430 mac_perim_exit(mph);
1431 AGGR_GRP_REFRELE(grp);
1432 return (err);
1433 }
1434
1435 /*
1436 * Create a new link aggregation group upon request from administrator.
1437 * Returns 0 on success, an errno on failure.
1438 */
1439 int
aggr_grp_create(datalink_id_t linkid,uint32_t key,uint_t nports,laioc_port_t * ports,uint32_t policy,boolean_t mac_fixed,boolean_t force,uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer,cred_t * credp)1440 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1441 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1442 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1443 cred_t *credp)
1444 {
1445 aggr_grp_t *grp = NULL;
1446 aggr_port_t *port;
1447 aggr_port_t *last_attached = NULL;
1448 mac_register_t *mac;
1449 boolean_t link_state_changed;
1450 mac_perim_handle_t mph, pmph;
1451 datalink_id_t tempid;
1452 boolean_t mac_registered = B_FALSE;
1453 uint_t tx_ring_limit;
1454 int err;
1455 int i, j;
1456 kt_did_t tid = 0;
1457
1458 /* need at least one port */
1459 if (nports == 0)
1460 return (EINVAL);
1461
1462 rw_enter(&aggr_grp_lock, RW_WRITER);
1463
1464 /* does a group with the same linkid already exist? */
1465 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1466 (mod_hash_val_t *)&grp);
1467 if (err == 0) {
1468 rw_exit(&aggr_grp_lock);
1469 return (EEXIST);
1470 }
1471
1472 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1473
1474 grp->lg_refs = 1;
1475 grp->lg_closing = B_FALSE;
1476 grp->lg_force = force;
1477 grp->lg_linkid = linkid;
1478 grp->lg_zoneid = crgetzoneid(credp);
1479 grp->lg_ifspeed = 0;
1480 grp->lg_link_state = LINK_STATE_UNKNOWN;
1481 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1482 grp->lg_started = B_FALSE;
1483 grp->lg_promisc = B_FALSE;
1484 grp->lg_lacp_done = B_FALSE;
1485 grp->lg_tx_notify_done = B_FALSE;
1486 grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1487 grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1488 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1489 grp->lg_tx_notify_thread = thread_create(NULL, 0,
1490 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1491 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1492 MAX_RINGS_PER_GROUP), KM_SLEEP);
1493 grp->lg_tx_blocked_cnt = 0;
1494 bzero(&grp->lg_rx_groups,
1495 sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
1496 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1497 aggr_lacp_init_grp(grp);
1498
1499 /* add MAC ports to group */
1500 grp->lg_ports = NULL;
1501 grp->lg_nports = 0;
1502 grp->lg_nattached_ports = 0;
1503 grp->lg_ntx_ports = 0;
1504
1505 /*
1506 * If key is not specified by the user, allocate the key.
1507 */
1508 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1509 err = ENOMEM;
1510 goto bail;
1511 }
1512 grp->lg_key = key;
1513
1514 for (i = 0; i < nports; i++) {
1515 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
1516 if (err != 0)
1517 goto bail;
1518 }
1519
1520 grp->lg_rx_group_count = 1;
1521
1522 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1523 uint_t num_rgroups;
1524
1525 mac_perim_enter_by_mh(port->lp_mh, &mph);
1526 num_rgroups = mac_get_num_rx_groups(port->lp_mh);
1527 mac_perim_exit(mph);
1528
1529 /*
1530 * Utilize all the groups in a port. If some ports
1531 * have less groups than others, then traffic destined
1532 * for the same unicast address may be HW classified
1533 * on some ports but SW classified by aggr when
1534 * arriving on other ports.
1535 */
1536 grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
1537 num_rgroups);
1538 }
1539
1540 /*
1541 * There could be cases where the hardware provides more
1542 * groups than aggr can support. Make sure we never go above
1543 * the max aggr can support.
1544 */
1545 grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
1546 MAX_GROUPS_PER_PORT);
1547
1548 ASSERT3U(grp->lg_rx_group_count, >, 0);
1549 for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
1550 grp->lg_rx_groups[i].arg_index = i;
1551 grp->lg_rx_groups[i].arg_untagged = 0;
1552 list_create(&(grp->lg_rx_groups[i].arg_vlans),
1553 sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
1554 }
1555
1556 /*
1557 * If no explicit MAC address was specified by the administrator,
1558 * set it to the MAC address of the first port.
1559 */
1560 grp->lg_addr_fixed = mac_fixed;
1561 if (grp->lg_addr_fixed) {
1562 /* validate specified address */
1563 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1564 err = EINVAL;
1565 goto bail;
1566 }
1567 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1568 } else {
1569 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1570 grp->lg_mac_addr_port = grp->lg_ports;
1571 }
1572
1573 /* Set the initial group capabilities. */
1574 aggr_grp_capab_set(grp);
1575
1576 if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1577 err = ENOMEM;
1578 goto bail;
1579 }
1580 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1581 mac->m_driver = grp;
1582 mac->m_dip = aggr_dip;
1583 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1584 mac->m_src_addr = grp->lg_addr;
1585 mac->m_callbacks = &aggr_m_callbacks;
1586 mac->m_min_sdu = 0;
1587 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1588 mac->m_margin = aggr_grp_max_margin(grp);
1589 mac->m_v12n = MAC_VIRT_LEVEL1;
1590 err = mac_register(mac, &grp->lg_mh);
1591 mac_free(mac);
1592 if (err != 0)
1593 goto bail;
1594
1595 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1596 if (err != 0) {
1597 (void) mac_unregister(grp->lg_mh);
1598 grp->lg_mh = NULL;
1599 goto bail;
1600 }
1601
1602 mac_registered = B_TRUE;
1603
1604 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1605
1606 /*
1607 * Update the MAC address of the constituent ports.
1608 * None of the port is attached at this time, the link state of the
1609 * aggregation will not change.
1610 *
1611 * All ports take on the primary MAC address of the aggr
1612 * (lg_aggr). At this point, none of the ports are attached;
1613 * thus the link state of the aggregation will not change.
1614 */
1615 link_state_changed = aggr_grp_update_ports_mac(grp);
1616 ASSERT(!link_state_changed);
1617
1618 /* Update outbound load balancing policy. */
1619 aggr_send_update_policy(grp, policy);
1620
1621 /* Set LACP mode. */
1622 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1623
1624 /*
1625 * The pseudo Tx group holds a maximum of MAX_RINGS_PER_GROUP
1626 * rings, when all the Tx rings of all the ports are accumulated
1627 * it is conceivable this limit is exceeded. We try and prevent
1628 * this by limiting the number of rings an individual port will use.
1629 *
1630 * - When an aggr is first created, we will not let an
1631 * individual port use more than MAX_RINGS_PER_GROUP/nports
1632 * rings.
1633 * - As ports are added to an existing aggr, each of the
1634 * ports will not use more than MAX_RINGS_PER_GROUP/nports_high.
1635 * Where nports_high is the highest number of ports the aggr has
1636 * held (including any ports being added). This may involve
1637 * trimming rings from existing ports.
1638 */
1639
1640 /* Leave room for 4 ports */
1641 tx_ring_limit = MAX_RINGS_PER_GROUP / MAX(4, nports);
1642
1643 /*
1644 * Attach each port if necessary.
1645 */
1646 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1647 /*
1648 * Create the pseudo ring for each HW ring of the
1649 * underlying port. Note that this is done after the
1650 * aggr registers its MAC.
1651 */
1652 err = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group,
1653 tx_ring_limit);
1654
1655 if (err != 0) {
1656 mac_perim_exit(mph);
1657 goto bail;
1658 }
1659
1660 for (i = 0; i < grp->lg_rx_group_count; i++) {
1661 err = aggr_add_pseudo_rx_group(port,
1662 &grp->lg_rx_groups[i]);
1663
1664 if (err != 0) {
1665 /*
1666 * Undo what we have added for the current
1667 * port.
1668 */
1669 aggr_rem_pseudo_tx_group(port,
1670 &grp->lg_tx_group);
1671
1672 for (j = 0; j < i; j++) {
1673 aggr_rem_pseudo_rx_group(port,
1674 &grp->lg_rx_groups[j]);
1675 }
1676
1677 mac_perim_exit(mph);
1678 goto bail;
1679 }
1680 }
1681
1682 if (aggr_port_notify_link(grp, port))
1683 link_state_changed = B_TRUE;
1684
1685 /*
1686 * Initialize the callback functions for this port.
1687 */
1688 aggr_port_init_callbacks(port);
1689
1690 last_attached = port;
1691 }
1692
1693 if (link_state_changed)
1694 mac_link_update(grp->lg_mh, grp->lg_link_state);
1695
1696 /* add new group to hash table */
1697 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1698 (mod_hash_val_t)grp);
1699 ASSERT(err == 0);
1700 aggr_grp_cnt++;
1701
1702 mac_perim_exit(mph);
1703 rw_exit(&aggr_grp_lock);
1704 return (0);
1705
1706 bail:
1707 grp->lg_closing = B_TRUE;
1708
1709 /*
1710 * Inform the lacp_rx thread to exit.
1711 */
1712 mutex_enter(&grp->lg_lacp_lock);
1713 grp->lg_lacp_done = B_TRUE;
1714 cv_signal(&grp->lg_lacp_cv);
1715 while (grp->lg_lacp_rx_thread != NULL)
1716 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1717 mutex_exit(&grp->lg_lacp_lock);
1718 /*
1719 * Inform the tx_notify thread to exit.
1720 */
1721 mutex_enter(&grp->lg_tx_flowctl_lock);
1722 if (grp->lg_tx_notify_thread != NULL) {
1723 tid = grp->lg_tx_notify_thread->t_did;
1724 grp->lg_tx_notify_done = B_TRUE;
1725 cv_signal(&grp->lg_tx_flowctl_cv);
1726 }
1727 mutex_exit(&grp->lg_tx_flowctl_lock);
1728 if (tid != 0)
1729 thread_join(tid);
1730
1731 if (mac_registered) {
1732 (void) dls_devnet_destroy(grp->lg_mh, &tempid, B_TRUE);
1733 (void) mac_disable(grp->lg_mh);
1734
1735 if (last_attached != NULL) {
1736 /*
1737 * Detach and clean up ports added.
1738 */
1739 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1740
1741 for (port = grp->lg_ports; ; port = port->lp_next) {
1742 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1743 (void) aggr_grp_detach_port(grp, port);
1744 mac_perim_exit(pmph);
1745
1746 aggr_rem_pseudo_tx_group(port,
1747 &grp->lg_tx_group);
1748
1749 for (i = 0; i < grp->lg_rx_group_count; i++) {
1750 aggr_rem_pseudo_rx_group(port,
1751 &grp->lg_rx_groups[i]);
1752 }
1753 if (port == last_attached)
1754 break;
1755 }
1756
1757 mac_perim_exit(mph);
1758 }
1759
1760 (void) mac_unregister(grp->lg_mh);
1761 }
1762
1763 port = grp->lg_ports;
1764 while (port != NULL) {
1765 aggr_port_t *cport;
1766
1767 cport = port->lp_next;
1768 aggr_port_delete(port);
1769 port = cport;
1770 }
1771
1772 kmem_free(grp->lg_tx_blocked_rings,
1773 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1774 rw_exit(&aggr_grp_lock);
1775 AGGR_GRP_REFRELE(grp);
1776 return (err);
1777 }
1778
1779 /*
1780 * Return a pointer to the member of a group with specified linkid.
1781 */
1782 static aggr_port_t *
aggr_grp_port_lookup(aggr_grp_t * grp,datalink_id_t linkid)1783 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1784 {
1785 aggr_port_t *port;
1786
1787 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1788
1789 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1790 if (port->lp_linkid == linkid)
1791 break;
1792 }
1793
1794 return (port);
1795 }
1796
1797 /*
1798 * Stop, detach and remove a port from a link aggregation group.
1799 */
1800 static int
aggr_grp_rem_port(aggr_grp_t * grp,aggr_port_t * port,boolean_t * mac_addr_changedp,boolean_t * link_state_changedp)1801 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1802 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1803 {
1804 int rc = 0;
1805 aggr_port_t **pport;
1806 boolean_t mac_addr_changed = B_FALSE;
1807 boolean_t link_state_changed = B_FALSE;
1808 mac_perim_handle_t mph;
1809 uint64_t val;
1810 uint_t i;
1811 uint_t stat;
1812
1813 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1814 ASSERT(grp->lg_nports > 1);
1815 ASSERT(!grp->lg_closing);
1816
1817 /* unlink port */
1818 for (pport = &grp->lg_ports; *pport != port;
1819 pport = &(*pport)->lp_next) {
1820 if (*pport == NULL) {
1821 rc = ENOENT;
1822 goto done;
1823 }
1824 }
1825 *pport = port->lp_next;
1826
1827 mac_perim_enter_by_mh(port->lp_mh, &mph);
1828
1829 /*
1830 * If the MAC address of the port being removed was assigned
1831 * to the group, update the group MAC address
1832 * using the MAC address of a different port.
1833 */
1834 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1835 /*
1836 * Set the MAC address of the group to the
1837 * MAC address of its first port.
1838 */
1839 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1840 grp->lg_mac_addr_port = grp->lg_ports;
1841 mac_addr_changed = B_TRUE;
1842 }
1843
1844 link_state_changed = aggr_grp_detach_port(grp, port);
1845
1846 /*
1847 * Add the counter statistics of the ports while it was aggregated
1848 * to the group's residual statistics. This is done by obtaining
1849 * the current counter from the underlying MAC then subtracting the
1850 * value of the counter at the moment it was added to the
1851 * aggregation.
1852 */
1853 for (i = 0; i < MAC_NSTAT; i++) {
1854 stat = i + MAC_STAT_MIN;
1855 if (!MAC_STAT_ISACOUNTER(stat))
1856 continue;
1857 val = aggr_port_stat(port, stat);
1858 val -= port->lp_stat[i];
1859 mutex_enter(&grp->lg_stat_lock);
1860 grp->lg_stat[i] += val;
1861 mutex_exit(&grp->lg_stat_lock);
1862 }
1863 for (i = 0; i < ETHER_NSTAT; i++) {
1864 stat = i + MACTYPE_STAT_MIN;
1865 if (!ETHER_STAT_ISACOUNTER(stat))
1866 continue;
1867 val = aggr_port_stat(port, stat);
1868 val -= port->lp_ether_stat[i];
1869 mutex_enter(&grp->lg_stat_lock);
1870 grp->lg_ether_stat[i] += val;
1871 mutex_exit(&grp->lg_stat_lock);
1872 }
1873
1874 grp->lg_nports--;
1875 mac_perim_exit(mph);
1876
1877 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1878 aggr_port_delete(port);
1879
1880 /*
1881 * If the group MAC address has changed, update the MAC address of
1882 * the remaining constituent ports according to the new MAC
1883 * address of the group.
1884 */
1885 if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1886 link_state_changed = B_TRUE;
1887
1888 done:
1889 if (mac_addr_changedp != NULL)
1890 *mac_addr_changedp = mac_addr_changed;
1891 if (link_state_changedp != NULL)
1892 *link_state_changedp = link_state_changed;
1893
1894 return (rc);
1895 }
1896
1897 /*
1898 * Remove one or more ports from an existing link aggregation group.
1899 */
1900 int
aggr_grp_rem_ports(datalink_id_t linkid,uint_t nports,laioc_port_t * ports)1901 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1902 {
1903 int rc = 0;
1904 uint_t i;
1905 aggr_grp_t *grp = NULL;
1906 aggr_port_t *port;
1907 boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1908 boolean_t link_state_update = B_FALSE, link_state_changed;
1909 mac_perim_handle_t mph, pmph;
1910
1911 /* get group corresponding to linkid */
1912 rw_enter(&aggr_grp_lock, RW_READER);
1913 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1914 (mod_hash_val_t *)&grp) != 0) {
1915 rw_exit(&aggr_grp_lock);
1916 return (ENOENT);
1917 }
1918 AGGR_GRP_REFHOLD(grp);
1919
1920 /*
1921 * Hold the perimeter so that the aggregation won't be destroyed.
1922 */
1923 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1924 rw_exit(&aggr_grp_lock);
1925
1926 /* we need to keep at least one port per group */
1927 if (nports >= grp->lg_nports) {
1928 rc = EINVAL;
1929 goto bail;
1930 }
1931
1932 /* first verify that all the groups are valid */
1933 for (i = 0; i < nports; i++) {
1934 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1935 /* port not found */
1936 rc = ENOENT;
1937 goto bail;
1938 }
1939 }
1940
1941 /* clear the promiscous mode for the specified ports */
1942 for (i = 0; i < nports && rc == 0; i++) {
1943 /* lookup port */
1944 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1945 ASSERT(port != NULL);
1946
1947 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1948 rc = aggr_port_promisc(port, B_FALSE);
1949 mac_perim_exit(pmph);
1950 }
1951 if (rc != 0) {
1952 for (i = 0; i < nports; i++) {
1953 port = aggr_grp_port_lookup(grp,
1954 ports[i].lp_linkid);
1955 ASSERT(port != NULL);
1956
1957 /*
1958 * Turn the promiscuous mode back on if it is required
1959 * to receive the non-primary address over a port, or
1960 * the promiscous mode is enabled over the aggr.
1961 */
1962 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1963 if (port->lp_started && (grp->lg_promisc ||
1964 port->lp_prom_addr != NULL)) {
1965 (void) aggr_port_promisc(port, B_TRUE);
1966 }
1967 mac_perim_exit(pmph);
1968 }
1969 goto bail;
1970 }
1971
1972 /* remove the specified ports from group */
1973 for (i = 0; i < nports; i++) {
1974 /* lookup port */
1975 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1976 ASSERT(port != NULL);
1977
1978 /* stop port if group has already been started */
1979 if (grp->lg_started) {
1980 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1981 aggr_port_stop(port);
1982 mac_perim_exit(pmph);
1983 }
1984
1985 /*
1986 * aggr_rem_pseudo_tx_group() is not called here. Instead
1987 * it is called from inside aggr_grp_rem_port() after the
1988 * port has been detached. The reason is that
1989 * aggr_rem_pseudo_tx_group() removes one ring at a time
1990 * and if there is still traffic going on, then there
1991 * is the possibility of aggr_find_tx_ring() returning a
1992 * removed ring for transmission. Once the port has been
1993 * detached, that port will not be used and
1994 * aggr_find_tx_ring() will not return any rings
1995 * belonging to it.
1996 */
1997 for (uint_t j = 0; j < grp->lg_rx_group_count; j++)
1998 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[j]);
1999
2000 /* remove port from group */
2001 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
2002 &link_state_changed);
2003 ASSERT(rc == 0);
2004 mac_addr_update = mac_addr_update || mac_addr_changed;
2005 link_state_update = link_state_update || link_state_changed;
2006 }
2007
2008 bail:
2009 if (mac_addr_update)
2010 mac_unicst_update(grp->lg_mh, grp->lg_addr);
2011 if (link_state_update)
2012 mac_link_update(grp->lg_mh, grp->lg_link_state);
2013
2014 mac_perim_exit(mph);
2015 AGGR_GRP_REFRELE(grp);
2016
2017 return (rc);
2018 }
2019
2020 int
aggr_grp_delete(datalink_id_t linkid,cred_t * cred)2021 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
2022 {
2023 aggr_grp_t *grp = NULL;
2024 aggr_port_t *port, *cport;
2025 datalink_id_t tmpid;
2026 mod_hash_val_t val;
2027 mac_perim_handle_t mph, pmph;
2028 int err;
2029 kt_did_t tid = 0;
2030
2031 rw_enter(&aggr_grp_lock, RW_WRITER);
2032
2033 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
2034 (mod_hash_val_t *)&grp) != 0) {
2035 rw_exit(&aggr_grp_lock);
2036 return (ENOENT);
2037 }
2038
2039 /*
2040 * Note that dls_devnet_destroy() must be called before lg_lock is
2041 * held. Otherwise, it will deadlock if another thread is in
2042 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
2043 * dls_devnet_destroy() needs to delete.
2044 */
2045 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
2046 rw_exit(&aggr_grp_lock);
2047 return (err);
2048 }
2049 ASSERT(linkid == tmpid);
2050
2051 /*
2052 * Unregister from the MAC service module. Since this can
2053 * fail if a client hasn't closed the MAC port, we gracefully
2054 * fail the operation.
2055 */
2056 if ((err = mac_disable(grp->lg_mh)) != 0) {
2057 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
2058 rw_exit(&aggr_grp_lock);
2059 return (err);
2060 }
2061 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
2062 ASSERT(grp == (aggr_grp_t *)val);
2063
2064 ASSERT(aggr_grp_cnt > 0);
2065 aggr_grp_cnt--;
2066 rw_exit(&aggr_grp_lock);
2067
2068 /*
2069 * Inform the lacp_rx thread to exit.
2070 */
2071 mutex_enter(&grp->lg_lacp_lock);
2072 grp->lg_lacp_done = B_TRUE;
2073 cv_signal(&grp->lg_lacp_cv);
2074 while (grp->lg_lacp_rx_thread != NULL)
2075 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
2076 mutex_exit(&grp->lg_lacp_lock);
2077 /*
2078 * Inform the tx_notify_thread to exit.
2079 */
2080 mutex_enter(&grp->lg_tx_flowctl_lock);
2081 if (grp->lg_tx_notify_thread != NULL) {
2082 tid = grp->lg_tx_notify_thread->t_did;
2083 grp->lg_tx_notify_done = B_TRUE;
2084 cv_signal(&grp->lg_tx_flowctl_cv);
2085 }
2086 mutex_exit(&grp->lg_tx_flowctl_lock);
2087 if (tid != 0)
2088 thread_join(tid);
2089
2090 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2091
2092 grp->lg_closing = B_TRUE;
2093 /* detach and free MAC ports associated with group */
2094 port = grp->lg_ports;
2095 while (port != NULL) {
2096 cport = port->lp_next;
2097 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2098 if (grp->lg_started)
2099 aggr_port_stop(port);
2100 (void) aggr_grp_detach_port(grp, port);
2101 mac_perim_exit(pmph);
2102 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
2103 for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
2104 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
2105 aggr_port_delete(port);
2106 port = cport;
2107 }
2108
2109 mac_perim_exit(mph);
2110
2111 kmem_free(grp->lg_tx_blocked_rings,
2112 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
2113 /*
2114 * Wait for the port's lacp timer thread and its notification callback
2115 * to exit before calling mac_unregister() since both needs to access
2116 * the mac perimeter of the grp.
2117 */
2118 aggr_grp_port_wait(grp);
2119
2120 VERIFY(mac_unregister(grp->lg_mh) == 0);
2121 grp->lg_mh = NULL;
2122
2123 for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) {
2124 list_destroy(&(grp->lg_rx_groups[i].arg_vlans));
2125 }
2126
2127 AGGR_GRP_REFRELE(grp);
2128 return (0);
2129 }
2130
2131 void
aggr_grp_free(aggr_grp_t * grp)2132 aggr_grp_free(aggr_grp_t *grp)
2133 {
2134 ASSERT(grp->lg_refs == 0);
2135 ASSERT(grp->lg_port_ref == 0);
2136 if (grp->lg_key > AGGR_MAX_KEY) {
2137 id_free(key_ids, grp->lg_key);
2138 grp->lg_key = 0;
2139 }
2140 kmem_cache_free(aggr_grp_cache, grp);
2141 }
2142
2143 int
aggr_grp_info(datalink_id_t linkid,void * fn_arg,aggr_grp_info_new_grp_fn_t new_grp_fn,aggr_grp_info_new_port_fn_t new_port_fn,cred_t * cred)2144 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
2145 aggr_grp_info_new_grp_fn_t new_grp_fn,
2146 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
2147 {
2148 aggr_grp_t *grp;
2149 aggr_port_t *port;
2150 mac_perim_handle_t mph, pmph;
2151 int rc = 0;
2152
2153 /*
2154 * Make sure that the aggregation link is visible from the caller's
2155 * zone.
2156 */
2157 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
2158 return (ENOENT);
2159
2160 rw_enter(&aggr_grp_lock, RW_READER);
2161
2162 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
2163 (mod_hash_val_t *)&grp) != 0) {
2164 rw_exit(&aggr_grp_lock);
2165 return (ENOENT);
2166 }
2167 AGGR_GRP_REFHOLD(grp);
2168
2169 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2170 rw_exit(&aggr_grp_lock);
2171
2172 rc = new_grp_fn(fn_arg, grp->lg_linkid,
2173 (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
2174 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
2175 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
2176
2177 if (rc != 0)
2178 goto bail;
2179
2180 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2181 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2182 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
2183 port->lp_state, &port->lp_lacp.ActorOperPortState);
2184 mac_perim_exit(pmph);
2185
2186 if (rc != 0)
2187 goto bail;
2188 }
2189
2190 bail:
2191 mac_perim_exit(mph);
2192 AGGR_GRP_REFRELE(grp);
2193 return (rc);
2194 }
2195
2196 /*ARGSUSED*/
2197 static void
aggr_m_ioctl(void * arg,queue_t * q,mblk_t * mp)2198 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
2199 {
2200 miocnak(q, mp, 0, ENOTSUP);
2201 }
2202
2203 static int
aggr_grp_stat(aggr_grp_t * grp,uint_t stat,uint64_t * val)2204 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
2205 {
2206 aggr_port_t *port;
2207 uint_t stat_index;
2208
2209 ASSERT(MUTEX_HELD(&grp->lg_stat_lock));
2210
2211 /* We only aggregate counter statistics. */
2212 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
2213 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
2214 return (ENOTSUP);
2215 }
2216
2217 /*
2218 * Counter statistics for a group are computed by aggregating the
2219 * counters of the members MACs while they were aggregated, plus
2220 * the residual counter of the group itself, which is updated each
2221 * time a MAC is removed from the group.
2222 */
2223 *val = 0;
2224 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2225 /* actual port statistic */
2226 *val += aggr_port_stat(port, stat);
2227 /*
2228 * minus the port stat when it was added, plus any residual
2229 * amount for the group.
2230 */
2231 if (IS_MAC_STAT(stat)) {
2232 stat_index = stat - MAC_STAT_MIN;
2233 *val -= port->lp_stat[stat_index];
2234 *val += grp->lg_stat[stat_index];
2235 } else if (IS_MACTYPE_STAT(stat)) {
2236 stat_index = stat - MACTYPE_STAT_MIN;
2237 *val -= port->lp_ether_stat[stat_index];
2238 *val += grp->lg_ether_stat[stat_index];
2239 }
2240 }
2241 return (0);
2242 }
2243
2244 int
aggr_rx_ring_stat(mac_ring_driver_t rdriver,uint_t stat,uint64_t * val)2245 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2246 {
2247 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
2248
2249 if (rx_ring->arr_hw_rh != NULL) {
2250 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
2251 } else {
2252 aggr_port_t *port = rx_ring->arr_port;
2253
2254 *val = mac_stat_get(port->lp_mh, stat);
2255
2256 }
2257 return (0);
2258 }
2259
2260 int
aggr_tx_ring_stat(mac_ring_driver_t rdriver,uint_t stat,uint64_t * val)2261 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2262 {
2263 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
2264
2265 if (tx_ring->atr_hw_rh != NULL) {
2266 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
2267 } else {
2268 aggr_port_t *port = tx_ring->atr_port;
2269
2270 *val = mac_stat_get(port->lp_mh, stat);
2271 }
2272 return (0);
2273 }
2274
2275 static int
aggr_m_stat(void * arg,uint_t stat,uint64_t * val)2276 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
2277 {
2278 aggr_grp_t *grp = arg;
2279 int rval = 0;
2280
2281 mutex_enter(&grp->lg_stat_lock);
2282
2283 switch (stat) {
2284 case MAC_STAT_IFSPEED:
2285 *val = grp->lg_ifspeed;
2286 break;
2287
2288 case ETHER_STAT_LINK_DUPLEX:
2289 *val = grp->lg_link_duplex;
2290 break;
2291
2292 default:
2293 /*
2294 * For all other statistics, we return the aggregated stat
2295 * from the underlying ports. aggr_grp_stat() will set
2296 * rval appropriately if the statistic isn't a counter.
2297 */
2298 rval = aggr_grp_stat(grp, stat, val);
2299 }
2300
2301 mutex_exit(&grp->lg_stat_lock);
2302 return (rval);
2303 }
2304
2305 static int
aggr_m_start(void * arg)2306 aggr_m_start(void *arg)
2307 {
2308 aggr_grp_t *grp = arg;
2309 aggr_port_t *port;
2310 mac_perim_handle_t mph, pmph;
2311
2312 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2313
2314 /*
2315 * Attempts to start all configured members of the group.
2316 * Group members will be attached when their link-up notification
2317 * is received.
2318 */
2319 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2320 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2321 if (aggr_port_start(port) != 0) {
2322 mac_perim_exit(pmph);
2323 continue;
2324 }
2325
2326 /*
2327 * Turn on the promiscuous mode if it is required to receive
2328 * the non-primary address over a port, or the promiscous
2329 * mode is enabled over the aggr.
2330 */
2331 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2332 if (aggr_port_promisc(port, B_TRUE) != 0)
2333 aggr_port_stop(port);
2334 }
2335 mac_perim_exit(pmph);
2336 }
2337
2338 grp->lg_started = B_TRUE;
2339
2340 mac_perim_exit(mph);
2341 return (0);
2342 }
2343
2344 static void
aggr_m_stop(void * arg)2345 aggr_m_stop(void *arg)
2346 {
2347 aggr_grp_t *grp = arg;
2348 aggr_port_t *port;
2349 mac_perim_handle_t mph, pmph;
2350
2351 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2352
2353 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2354 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2355
2356 /* reset port promiscuous mode */
2357 (void) aggr_port_promisc(port, B_FALSE);
2358
2359 aggr_port_stop(port);
2360 mac_perim_exit(pmph);
2361 }
2362
2363 grp->lg_started = B_FALSE;
2364 mac_perim_exit(mph);
2365 }
2366
2367 static int
aggr_m_promisc(void * arg,boolean_t on)2368 aggr_m_promisc(void *arg, boolean_t on)
2369 {
2370 aggr_grp_t *grp = arg;
2371 aggr_port_t *port;
2372 boolean_t link_state_changed = B_FALSE;
2373 mac_perim_handle_t mph, pmph;
2374
2375 AGGR_GRP_REFHOLD(grp);
2376 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2377
2378 ASSERT(!grp->lg_closing);
2379
2380 if (on == grp->lg_promisc)
2381 goto bail;
2382
2383 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2384 int err = 0;
2385
2386 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2387 AGGR_PORT_REFHOLD(port);
2388 if (!on && (port->lp_prom_addr == NULL))
2389 err = aggr_port_promisc(port, B_FALSE);
2390 else if (on && port->lp_started)
2391 err = aggr_port_promisc(port, B_TRUE);
2392
2393 if (err != 0) {
2394 if (aggr_grp_detach_port(grp, port))
2395 link_state_changed = B_TRUE;
2396 } else {
2397 /*
2398 * If a port was detached because of a previous
2399 * failure changing the promiscuity, the port
2400 * is reattached when it successfully changes
2401 * the promiscuity now, and this might cause
2402 * the link state of the aggregation to change.
2403 */
2404 if (aggr_grp_attach_port(grp, port))
2405 link_state_changed = B_TRUE;
2406 }
2407 mac_perim_exit(pmph);
2408 AGGR_PORT_REFRELE(port);
2409 }
2410
2411 grp->lg_promisc = on;
2412
2413 if (link_state_changed)
2414 mac_link_update(grp->lg_mh, grp->lg_link_state);
2415
2416 bail:
2417 mac_perim_exit(mph);
2418 AGGR_GRP_REFRELE(grp);
2419
2420 return (0);
2421 }
2422
2423 static void
aggr_grp_port_rename(const char * new_name,void * arg)2424 aggr_grp_port_rename(const char *new_name, void *arg)
2425 {
2426 /*
2427 * aggr port's mac client name is the format of "aggr link name" plus
2428 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2429 */
2430 int aggr_len, link_len, clnt_name_len, i;
2431 char *str_end, *str_st, *str_del;
2432 char aggr_name[MAXNAMELEN];
2433 char link_name[MAXNAMELEN];
2434 char *clnt_name;
2435 aggr_grp_t *aggr_grp = arg;
2436 aggr_port_t *aggr_port = aggr_grp->lg_ports;
2437
2438 for (i = 0; i < aggr_grp->lg_nports; i++) {
2439 clnt_name = mac_client_name(aggr_port->lp_mch);
2440 clnt_name_len = strlen(clnt_name);
2441 str_st = clnt_name;
2442 str_end = &(clnt_name[clnt_name_len]);
2443 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2444 ASSERT(str_del != NULL);
2445 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2446 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2447 bzero(aggr_name, MAXNAMELEN);
2448 bzero(link_name, MAXNAMELEN);
2449 bcopy(clnt_name, aggr_name, aggr_len);
2450 bcopy(str_del, link_name, link_len + 1);
2451 bzero(clnt_name, MAXNAMELEN);
2452 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2453 link_name);
2454
2455 (void) mac_rename_primary(aggr_port->lp_mh, NULL);
2456 aggr_port = aggr_port->lp_next;
2457 }
2458 }
2459
2460 /*
2461 * Initialize the capabilities that are advertised for the group
2462 * according to the capabilities of the constituent ports.
2463 */
2464 static boolean_t
aggr_m_capab_get(void * arg,mac_capab_t cap,void * cap_data)2465 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2466 {
2467 aggr_grp_t *grp = arg;
2468
2469 switch (cap) {
2470 case MAC_CAPAB_HCKSUM: {
2471 uint32_t *hcksum_txflags = cap_data;
2472 *hcksum_txflags = grp->lg_hcksum_txflags;
2473 break;
2474 }
2475 case MAC_CAPAB_LSO: {
2476 mac_capab_lso_t *cap_lso = cap_data;
2477
2478 if (grp->lg_lso) {
2479 *cap_lso = grp->lg_cap_lso;
2480 break;
2481 } else {
2482 return (B_FALSE);
2483 }
2484 }
2485 case MAC_CAPAB_NO_NATIVEVLAN:
2486 return (!grp->lg_vlan);
2487 case MAC_CAPAB_NO_ZCOPY:
2488 return (!grp->lg_zcopy);
2489 case MAC_CAPAB_RINGS: {
2490 mac_capab_rings_t *cap_rings = cap_data;
2491 uint_t ring_cnt = 0;
2492
2493 for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
2494 ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt;
2495
2496 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2497 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2498 cap_rings->mr_rnum = ring_cnt;
2499 cap_rings->mr_gnum = grp->lg_rx_group_count;
2500 cap_rings->mr_gaddring = NULL;
2501 cap_rings->mr_gremring = NULL;
2502 } else {
2503 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2504 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2505 cap_rings->mr_gnum = 0;
2506 }
2507 cap_rings->mr_rget = aggr_fill_ring;
2508 cap_rings->mr_gget = aggr_fill_group;
2509 break;
2510 }
2511 case MAC_CAPAB_AGGR:
2512 {
2513 mac_capab_aggr_t *aggr_cap;
2514
2515 if (cap_data != NULL) {
2516 aggr_cap = cap_data;
2517 aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2518 aggr_cap->mca_unicst = aggr_m_unicst;
2519 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2520 aggr_cap->mca_arg = arg;
2521 }
2522 return (B_TRUE);
2523 }
2524 default:
2525 return (B_FALSE);
2526 }
2527 return (B_TRUE);
2528 }
2529
2530 /*
2531 * Callback function for MAC layer to register groups.
2532 */
2533 static void
aggr_fill_group(void * arg,mac_ring_type_t rtype,const int index,mac_group_info_t * infop,mac_group_handle_t gh)2534 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2535 mac_group_info_t *infop, mac_group_handle_t gh)
2536 {
2537 aggr_grp_t *grp = arg;
2538
2539 if (rtype == MAC_RING_TYPE_RX) {
2540 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index];
2541
2542 rx_group->arg_gh = gh;
2543 rx_group->arg_grp = grp;
2544
2545 infop->mgi_driver = (mac_group_driver_t)rx_group;
2546 infop->mgi_start = NULL;
2547 infop->mgi_stop = NULL;
2548 infop->mgi_addmac = aggr_addmac;
2549 infop->mgi_remmac = aggr_remmac;
2550 infop->mgi_count = rx_group->arg_ring_cnt;
2551
2552 /*
2553 * Always set the HW VLAN callbacks. They are smart
2554 * enough to know when a port has HW VLAN filters to
2555 * program and when it doesn't.
2556 */
2557 infop->mgi_addvlan = aggr_addvlan;
2558 infop->mgi_remvlan = aggr_remvlan;
2559 } else {
2560 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2561
2562 ASSERT3S(index, ==, 0);
2563 tx_group->atg_gh = gh;
2564 }
2565 }
2566
2567 /*
2568 * Callback funtion for MAC layer to register all rings.
2569 */
2570 static void
aggr_fill_ring(void * arg,mac_ring_type_t rtype,const int rg_index,const int index,mac_ring_info_t * infop,mac_ring_handle_t rh)2571 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2572 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2573 {
2574 aggr_grp_t *grp = arg;
2575
2576 switch (rtype) {
2577 case MAC_RING_TYPE_RX: {
2578 aggr_pseudo_rx_group_t *rx_group;
2579 aggr_pseudo_rx_ring_t *rx_ring;
2580 mac_intr_t aggr_mac_intr;
2581
2582 rx_group = &grp->lg_rx_groups[rg_index];
2583 ASSERT3S(index, >=, 0);
2584 ASSERT3S(index, <, rx_group->arg_ring_cnt);
2585 rx_ring = rx_group->arg_rings + index;
2586 rx_ring->arr_rh = rh;
2587
2588 /*
2589 * Entrypoint to enable interrupt (disable poll) and
2590 * disable interrupt (enable poll).
2591 */
2592 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2593 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2594 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2595 aggr_mac_intr.mi_ddi_handle = NULL;
2596
2597 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2598 infop->mri_start = aggr_pseudo_start_rx_ring;
2599 infop->mri_stop = aggr_pseudo_stop_rx_ring;
2600
2601 infop->mri_intr = aggr_mac_intr;
2602 infop->mri_poll = aggr_rx_poll;
2603
2604 infop->mri_stat = aggr_rx_ring_stat;
2605 break;
2606 }
2607 case MAC_RING_TYPE_TX: {
2608 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2609 aggr_pseudo_tx_ring_t *tx_ring;
2610
2611 ASSERT(rg_index == -1);
2612 ASSERT(index < tx_group->atg_ring_cnt);
2613
2614 tx_ring = &tx_group->atg_rings[index];
2615 tx_ring->atr_rh = rh;
2616
2617 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2618 infop->mri_start = NULL;
2619 infop->mri_stop = NULL;
2620 infop->mri_tx = aggr_ring_tx;
2621 infop->mri_stat = aggr_tx_ring_stat;
2622 /*
2623 * Use the hw TX ring handle to find if the ring needs
2624 * serialization or not. For NICs that do not expose
2625 * Tx rings, atr_hw_rh will be NULL.
2626 */
2627 if (tx_ring->atr_hw_rh != NULL) {
2628 infop->mri_flags =
2629 mac_hwring_getinfo(tx_ring->atr_hw_rh);
2630 }
2631 break;
2632 }
2633 default:
2634 break;
2635 }
2636 }
2637
2638 static mblk_t *
aggr_rx_poll(void * arg,int bytes_to_pickup)2639 aggr_rx_poll(void *arg, int bytes_to_pickup)
2640 {
2641 aggr_pseudo_rx_ring_t *rr_ring = arg;
2642 aggr_port_t *port = rr_ring->arr_port;
2643 aggr_grp_t *grp = port->lp_grp;
2644 mblk_t *mp_chain, *mp, **mpp;
2645
2646 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2647
2648 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2649 return (mp_chain);
2650
2651 mpp = &mp_chain;
2652 while ((mp = *mpp) != NULL) {
2653 if (MBLKL(mp) >= sizeof (struct ether_header)) {
2654 struct ether_header *ehp;
2655
2656 ehp = (struct ether_header *)mp->b_rptr;
2657 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2658 *mpp = mp->b_next;
2659 mp->b_next = NULL;
2660 aggr_recv_lacp(port,
2661 (mac_resource_handle_t)rr_ring, mp);
2662 continue;
2663 }
2664 }
2665
2666 if (!port->lp_collector_enabled) {
2667 *mpp = mp->b_next;
2668 mp->b_next = NULL;
2669 freemsg(mp);
2670 continue;
2671 }
2672 mpp = &mp->b_next;
2673 }
2674 return (mp_chain);
2675 }
2676
2677 static int
aggr_addmac(void * arg,const uint8_t * mac_addr)2678 aggr_addmac(void *arg, const uint8_t *mac_addr)
2679 {
2680 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2681 aggr_unicst_addr_t *addr, **pprev;
2682 aggr_grp_t *grp = rx_group->arg_grp;
2683 aggr_port_t *port, *p;
2684 mac_perim_handle_t mph;
2685 int err = 0;
2686 uint_t idx = rx_group->arg_index;
2687
2688 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2689
2690 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2691 mac_perim_exit(mph);
2692 return (0);
2693 }
2694
2695 /*
2696 * Insert this mac address into the list of mac addresses owned by
2697 * the aggregation pseudo group.
2698 */
2699 pprev = &rx_group->arg_macaddr;
2700 while ((addr = *pprev) != NULL) {
2701 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2702 mac_perim_exit(mph);
2703 return (EEXIST);
2704 }
2705 pprev = &addr->aua_next;
2706 }
2707 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2708 bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2709 addr->aua_next = NULL;
2710 *pprev = addr;
2711
2712 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2713 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0)
2714 break;
2715
2716 if (err != 0) {
2717 for (p = grp->lg_ports; p != port; p = p->lp_next)
2718 aggr_port_remmac(p, idx, mac_addr);
2719
2720 *pprev = NULL;
2721 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2722 }
2723
2724 mac_perim_exit(mph);
2725 return (err);
2726 }
2727
2728 static int
aggr_remmac(void * arg,const uint8_t * mac_addr)2729 aggr_remmac(void *arg, const uint8_t *mac_addr)
2730 {
2731 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2732 aggr_unicst_addr_t *addr, **pprev;
2733 aggr_grp_t *grp = rx_group->arg_grp;
2734 aggr_port_t *port;
2735 mac_perim_handle_t mph;
2736 int err = 0;
2737
2738 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2739
2740 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2741 mac_perim_exit(mph);
2742 return (0);
2743 }
2744
2745 /*
2746 * Insert this mac address into the list of mac addresses owned by
2747 * the aggregation pseudo group.
2748 */
2749 pprev = &rx_group->arg_macaddr;
2750 while ((addr = *pprev) != NULL) {
2751 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2752 pprev = &addr->aua_next;
2753 continue;
2754 }
2755 break;
2756 }
2757 if (addr == NULL) {
2758 mac_perim_exit(mph);
2759 return (EINVAL);
2760 }
2761
2762 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2763 aggr_port_remmac(port, rx_group->arg_index, mac_addr);
2764
2765 *pprev = addr->aua_next;
2766 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2767
2768 mac_perim_exit(mph);
2769 return (err);
2770 }
2771
2772 /*
2773 * Search for VID in the Rx group's list and return a pointer if
2774 * found. Otherwise return NULL.
2775 */
2776 static aggr_vlan_t *
aggr_find_vlan(aggr_pseudo_rx_group_t * rx_group,uint16_t vid)2777 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2778 {
2779 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2780 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2781 avp = list_next(&rx_group->arg_vlans, avp)) {
2782 if (avp->av_vid == vid)
2783 return (avp);
2784 }
2785
2786 return (NULL);
2787 }
2788
2789 /*
2790 * Accept traffic on the specified VID.
2791 *
2792 * Persist VLAN state in the aggr so that ports added later will
2793 * receive the correct filters. In the future it would be nice to
2794 * allow aggr to iterate its clients instead of duplicating state.
2795 */
2796 static int
aggr_addvlan(mac_group_driver_t gdriver,uint16_t vid)2797 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
2798 {
2799 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2800 aggr_grp_t *aggr = rx_group->arg_grp;
2801 aggr_port_t *port, *p;
2802 mac_perim_handle_t mph;
2803 int err = 0;
2804 aggr_vlan_t *avp = NULL;
2805 uint_t idx = rx_group->arg_index;
2806
2807 mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2808
2809 if (vid == MAC_VLAN_UNTAGGED) {
2810 /*
2811 * Aggr is both a MAC provider and MAC client. As a
2812 * MAC provider it is passed MAC_VLAN_UNTAGGED by its
2813 * client. As a client itself, it should pass
2814 * VLAN_ID_NONE to its ports.
2815 */
2816 vid = VLAN_ID_NONE;
2817 rx_group->arg_untagged++;
2818 goto update_ports;
2819 }
2820
2821 avp = aggr_find_vlan(rx_group, vid);
2822
2823 if (avp != NULL) {
2824 avp->av_refs++;
2825 mac_perim_exit(mph);
2826 return (0);
2827 }
2828
2829 avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
2830 avp->av_vid = vid;
2831 avp->av_refs = 1;
2832
2833 update_ports:
2834 for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2835 if ((err = aggr_port_addvlan(port, idx, vid)) != 0)
2836 break;
2837
2838 if (err != 0) {
2839 /*
2840 * If any of these calls fail then we are in a
2841 * situation where the ports have different HW state.
2842 * There's no reasonable action the MAC client can
2843 * take in this scenario to rectify the situation.
2844 */
2845 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2846 int err2;
2847
2848 if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) {
2849 cmn_err(CE_WARN, "Failed to remove VLAN %u"
2850 " from port %s: errno %d.", vid,
2851 mac_client_name(p->lp_mch), err2);
2852 }
2853
2854 }
2855
2856 if (vid == VLAN_ID_NONE)
2857 rx_group->arg_untagged--;
2858
2859 if (avp != NULL) {
2860 kmem_free(avp, sizeof (aggr_vlan_t));
2861 avp = NULL;
2862 }
2863 }
2864
2865 if (avp != NULL)
2866 list_insert_tail(&rx_group->arg_vlans, avp);
2867
2868 done:
2869 mac_perim_exit(mph);
2870 return (err);
2871 }
2872
2873 /*
2874 * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
2875 */
2876 static int
aggr_remvlan(mac_group_driver_t gdriver,uint16_t vid)2877 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
2878 {
2879 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2880 aggr_grp_t *aggr = rx_group->arg_grp;
2881 aggr_port_t *port, *p;
2882 mac_perim_handle_t mph;
2883 int err = 0;
2884 aggr_vlan_t *avp = NULL;
2885 uint_t idx = rx_group->arg_index;
2886
2887 mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2888
2889 /*
2890 * See the comment in aggr_addvlan().
2891 */
2892 if (vid == MAC_VLAN_UNTAGGED) {
2893 vid = VLAN_ID_NONE;
2894 rx_group->arg_untagged--;
2895
2896 if (rx_group->arg_untagged > 0)
2897 goto done;
2898
2899 goto update_ports;
2900 }
2901
2902 avp = aggr_find_vlan(rx_group, vid);
2903
2904 if (avp == NULL) {
2905 err = ENOENT;
2906 goto done;
2907 }
2908
2909 avp->av_refs--;
2910
2911 if (avp->av_refs > 0)
2912 goto done;
2913
2914 update_ports:
2915 for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2916 if ((err = aggr_port_remvlan(port, idx, vid)) != 0)
2917 break;
2918
2919 /*
2920 * See the comment in aggr_addvlan() for justification of the
2921 * use of VERIFY here.
2922 */
2923 if (err != 0) {
2924 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2925 int err2;
2926
2927 if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) {
2928 cmn_err(CE_WARN, "Failed to add VLAN %u"
2929 " to port %s: errno %d.", vid,
2930 mac_client_name(p->lp_mch), err2);
2931 }
2932 }
2933
2934 if (avp != NULL)
2935 avp->av_refs++;
2936
2937 if (vid == VLAN_ID_NONE)
2938 rx_group->arg_untagged++;
2939
2940 goto done;
2941 }
2942
2943 if (err == 0 && avp != NULL) {
2944 VERIFY3U(avp->av_refs, ==, 0);
2945 list_remove(&rx_group->arg_vlans, avp);
2946 kmem_free(avp, sizeof (aggr_vlan_t));
2947 }
2948
2949 done:
2950 mac_perim_exit(mph);
2951 return (err);
2952 }
2953
2954 /*
2955 * Add or remove the multicast addresses that are defined for the group
2956 * to or from the specified port.
2957 *
2958 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2959 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2960 * called when the port is either stopped or detached.
2961 */
2962 void
aggr_grp_multicst_port(aggr_port_t * port,boolean_t add)2963 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2964 {
2965 aggr_grp_t *grp = port->lp_grp;
2966
2967 ASSERT(MAC_PERIM_HELD(port->lp_mh));
2968 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2969
2970 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2971 return;
2972
2973 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2974 }
2975
2976 static int
aggr_m_multicst(void * arg,boolean_t add,const uint8_t * addrp)2977 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2978 {
2979 aggr_grp_t *grp = arg;
2980 aggr_port_t *port = NULL, *errport = NULL;
2981 mac_perim_handle_t mph;
2982 int err = 0;
2983
2984 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2985 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2986 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2987 !port->lp_started) {
2988 continue;
2989 }
2990 err = aggr_port_multicst(port, add, addrp);
2991 if (err != 0) {
2992 errport = port;
2993 break;
2994 }
2995 }
2996
2997 /*
2998 * At least one port caused error return and this error is returned to
2999 * mac, eventually a NAK would be sent upwards.
3000 * Some ports have this multicast address listed now, and some don't.
3001 * Treat this error as a whole aggr failure not individual port failure.
3002 * Therefore remove this multicast address from other ports.
3003 */
3004 if ((err != 0) && add) {
3005 for (port = grp->lg_ports; port != errport;
3006 port = port->lp_next) {
3007 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
3008 !port->lp_started) {
3009 continue;
3010 }
3011 (void) aggr_port_multicst(port, B_FALSE, addrp);
3012 }
3013 }
3014 mac_perim_exit(mph);
3015 return (err);
3016 }
3017
3018 static int
aggr_m_unicst(void * arg,const uint8_t * macaddr)3019 aggr_m_unicst(void *arg, const uint8_t *macaddr)
3020 {
3021 aggr_grp_t *grp = arg;
3022 mac_perim_handle_t mph;
3023 int err;
3024
3025 mac_perim_enter_by_mh(grp->lg_mh, &mph);
3026 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
3027 0, 0);
3028 mac_perim_exit(mph);
3029 return (err);
3030 }
3031
3032 /*
3033 * Initialize the capabilities that are advertised for the group
3034 * according to the capabilities of the constituent ports.
3035 */
3036 static void
aggr_grp_capab_set(aggr_grp_t * grp)3037 aggr_grp_capab_set(aggr_grp_t *grp)
3038 {
3039 uint32_t cksum;
3040 aggr_port_t *port;
3041 mac_capab_lso_t cap_lso;
3042
3043 ASSERT(grp->lg_mh == NULL);
3044 ASSERT(grp->lg_ports != NULL);
3045
3046 grp->lg_hcksum_txflags = (uint32_t)-1;
3047 grp->lg_zcopy = B_TRUE;
3048 grp->lg_vlan = B_TRUE;
3049
3050 grp->lg_lso = B_TRUE;
3051 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
3052 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
3053
3054 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3055 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
3056 cksum = 0;
3057 grp->lg_hcksum_txflags &= cksum;
3058
3059 grp->lg_vlan &=
3060 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
3061
3062 grp->lg_zcopy &=
3063 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
3064
3065 grp->lg_lso &=
3066 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
3067 if (grp->lg_lso) {
3068 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
3069 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
3070 cap_lso.lso_basic_tcp_ipv4.lso_max)
3071 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
3072 cap_lso.lso_basic_tcp_ipv4.lso_max;
3073 }
3074 }
3075 }
3076
3077 /*
3078 * Checks whether the capabilities of the port being added are compatible
3079 * with the current capabilities of the aggregation.
3080 */
3081 static boolean_t
aggr_grp_capab_check(aggr_grp_t * grp,aggr_port_t * port)3082 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
3083 {
3084 uint32_t hcksum_txflags;
3085
3086 ASSERT(grp->lg_ports != NULL);
3087
3088 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
3089 grp->lg_vlan) != grp->lg_vlan) {
3090 return (B_FALSE);
3091 }
3092
3093 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
3094 grp->lg_zcopy) != grp->lg_zcopy) {
3095 return (B_FALSE);
3096 }
3097
3098 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
3099 if (grp->lg_hcksum_txflags != 0)
3100 return (B_FALSE);
3101 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
3102 grp->lg_hcksum_txflags) {
3103 return (B_FALSE);
3104 }
3105
3106 if (grp->lg_lso) {
3107 mac_capab_lso_t cap_lso;
3108
3109 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
3110 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
3111 grp->lg_cap_lso.lso_flags)
3112 return (B_FALSE);
3113 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
3114 cap_lso.lso_basic_tcp_ipv4.lso_max)
3115 return (B_FALSE);
3116 } else {
3117 return (B_FALSE);
3118 }
3119 }
3120
3121 return (B_TRUE);
3122 }
3123
3124 /*
3125 * Returns the maximum SDU according to the SDU of the constituent ports.
3126 */
3127 static uint_t
aggr_grp_max_sdu(aggr_grp_t * grp)3128 aggr_grp_max_sdu(aggr_grp_t *grp)
3129 {
3130 uint_t max_sdu = (uint_t)-1;
3131 aggr_port_t *port;
3132
3133 ASSERT(grp->lg_ports != NULL);
3134
3135 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3136 uint_t port_sdu_max;
3137
3138 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
3139 if (max_sdu > port_sdu_max)
3140 max_sdu = port_sdu_max;
3141 }
3142
3143 return (max_sdu);
3144 }
3145
3146 /*
3147 * Checks if the maximum SDU of the specified port is compatible
3148 * with the maximum SDU of the specified aggregation group, returns
3149 * B_TRUE if it is, B_FALSE otherwise.
3150 */
3151 static boolean_t
aggr_grp_sdu_check(aggr_grp_t * grp,aggr_port_t * port)3152 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
3153 {
3154 uint_t port_sdu_max;
3155
3156 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
3157 return (port_sdu_max >= grp->lg_max_sdu);
3158 }
3159
3160 /*
3161 * Returns the maximum margin according to the margin of the constituent ports.
3162 */
3163 static uint32_t
aggr_grp_max_margin(aggr_grp_t * grp)3164 aggr_grp_max_margin(aggr_grp_t *grp)
3165 {
3166 uint32_t margin = UINT32_MAX;
3167 aggr_port_t *port;
3168
3169 ASSERT(grp->lg_mh == NULL);
3170 ASSERT(grp->lg_ports != NULL);
3171
3172 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3173 if (margin > port->lp_margin)
3174 margin = port->lp_margin;
3175 }
3176
3177 grp->lg_margin = margin;
3178 return (margin);
3179 }
3180
3181 /*
3182 * Checks if the maximum margin of the specified port is compatible
3183 * with the maximum margin of the specified aggregation group, returns
3184 * B_TRUE if it is, B_FALSE otherwise.
3185 */
3186 static boolean_t
aggr_grp_margin_check(aggr_grp_t * grp,aggr_port_t * port)3187 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
3188 {
3189 if (port->lp_margin >= grp->lg_margin)
3190 return (B_TRUE);
3191
3192 /*
3193 * See whether the current margin value is allowed to be changed to
3194 * the new value.
3195 */
3196 if (!mac_margin_update(grp->lg_mh, port->lp_margin))
3197 return (B_FALSE);
3198
3199 grp->lg_margin = port->lp_margin;
3200 return (B_TRUE);
3201 }
3202
3203 /*
3204 * Set MTU on individual ports of an aggregation group
3205 */
3206 static int
aggr_set_port_sdu(aggr_grp_t * grp,aggr_port_t * port,uint32_t sdu,uint32_t * old_mtu)3207 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
3208 uint32_t *old_mtu)
3209 {
3210 boolean_t removed = B_FALSE;
3211 mac_perim_handle_t mph;
3212 mac_diag_t diag;
3213 int err, rv, retry = 0;
3214
3215 if (port->lp_mah != NULL) {
3216 (void) mac_unicast_remove(port->lp_mch, port->lp_mah);
3217 port->lp_mah = NULL;
3218 removed = B_TRUE;
3219 }
3220 err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
3221 try_again:
3222 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
3223 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
3224 &port->lp_mah, 0, &diag)) != 0) {
3225 /*
3226 * following is a workaround for a bug in 'bge' driver.
3227 * See CR 6794654 for more information and this work around
3228 * will be removed once the CR is fixed.
3229 */
3230 if (rv == EIO && retry++ < 3) {
3231 delay(2 * hz);
3232 goto try_again;
3233 }
3234 /*
3235 * if mac_unicast_add() failed while setting the MTU,
3236 * detach the port from the group.
3237 */
3238 mac_perim_enter_by_mh(port->lp_mh, &mph);
3239 (void) aggr_grp_detach_port(grp, port);
3240 mac_perim_exit(mph);
3241 cmn_err(CE_WARN, "Unable to restart the port %s while "
3242 "setting MTU. Detaching the port from the aggregation.",
3243 mac_client_name(port->lp_mch));
3244 }
3245 return (err);
3246 }
3247
3248 static int
aggr_sdu_update(aggr_grp_t * grp,uint32_t sdu)3249 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
3250 {
3251 int err = 0, i, rv;
3252 aggr_port_t *port;
3253 uint32_t *mtu;
3254
3255 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3256
3257 /*
3258 * If the MTU being set is equal to aggr group's maximum
3259 * allowable value, then there is nothing to change
3260 */
3261 if (sdu == grp->lg_max_sdu)
3262 return (0);
3263
3264 /* 0 is aggr group's min sdu */
3265 if (sdu == 0)
3266 return (EINVAL);
3267
3268 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
3269 for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
3270 port = port->lp_next, i++) {
3271 err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
3272 }
3273 if (err != 0) {
3274 /* recover from error: reset the mtus of the ports */
3275 aggr_port_t *tmp;
3276
3277 for (tmp = grp->lg_ports, i = 0; tmp != port;
3278 tmp = tmp->lp_next, i++) {
3279 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
3280 }
3281 goto bail;
3282 }
3283 grp->lg_max_sdu = aggr_grp_max_sdu(grp);
3284 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
3285 ASSERT(rv == 0);
3286 bail:
3287 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
3288 return (err);
3289 }
3290
3291 /*
3292 * Callback functions for set/get of properties
3293 */
3294 /*ARGSUSED*/
3295 static int
aggr_m_setprop(void * m_driver,const char * pr_name,mac_prop_id_t pr_num,uint_t pr_valsize,const void * pr_val)3296 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3297 uint_t pr_valsize, const void *pr_val)
3298 {
3299 int err = ENOTSUP;
3300 aggr_grp_t *grp = m_driver;
3301
3302 switch (pr_num) {
3303 case MAC_PROP_MTU: {
3304 uint32_t mtu;
3305
3306 if (pr_valsize < sizeof (mtu)) {
3307 err = EINVAL;
3308 break;
3309 }
3310 bcopy(pr_val, &mtu, sizeof (mtu));
3311 err = aggr_sdu_update(grp, mtu);
3312 break;
3313 }
3314 default:
3315 break;
3316 }
3317 return (err);
3318 }
3319
/*
 * One endpoint of an MTU range, as used by aggr_mtu_range_intersection()
 * when it flattens all ranges into a single sortable array.
 */
struct rboundary {
	uint32_t	bval;	/* the boundary value itself */
	int		btype;	/* +1 for a range minimum, -1 for a maximum */
};
typedef struct rboundary rboundary_t;
3324
3325 /*
3326 * This function finds the intersection of mtu ranges stored in arrays -
3327 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
3328 * Individual arrays are assumed to contain non-overlapping ranges.
3329 * Algorithm:
3330 * A range has two boundaries - min and max. We scan all arrays and store
3331 * each boundary as a separate element in a temporary array. We also store
3332 * the boundary types, min or max, as +1 or -1 respectively in the temporary
3333 * array. Then we sort the temporary array in ascending order. We scan the
3334 * sorted array from lower to higher values and keep a cumulative sum of
3335 * boundary types. Element in the temporary array for which the sum reaches
3336 * mcount is a min boundary of a range in the result and next element will be
3337 * max boundary.
3338 *
3339 * Example for mcount = 3,
3340 *
3341 * ----|_________|-------|_______|----|__|------ mrange[0]
3342 *
3343 * -------|________|--|____________|-----|___|-- mrange[1]
3344 *
3345 * --------|________________|-------|____|------ mrange[2]
3346 *
3347 * 3 2 1
3348 * \|/
3349 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum
3350 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
3351 *
3352 * same min and max
3353 * V
3354 * --------|_____|-------|__|------------|------ intersecting ranges
3355 */
3356 void
aggr_mtu_range_intersection(mac_propval_range_t ** mrange,int mcount,mac_propval_uint32_range_t ** prval,int * prmaxcnt,int * prcount)3357 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
3358 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
3359 {
3360 mac_propval_uint32_range_t *rval, *ur;
3361 int rmaxcnt, rcount;
3362 size_t sz_range32;
3363 rboundary_t *ta; /* temporary array */
3364 rboundary_t temp;
3365 boolean_t range_started = B_FALSE;
3366 int i, j, m, sum;
3367
3368 sz_range32 = sizeof (mac_propval_uint32_range_t);
3369
3370 for (i = 0, rmaxcnt = 0; i < mcount; i++)
3371 rmaxcnt += mrange[i]->mpr_count;
3372
3373 /* Allocate enough space to store the results */
3374 rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
3375
3376 /* Number of boundaries are twice as many as ranges */
3377 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
3378
3379 for (i = 0, m = 0; i < mcount; i++) {
3380 ur = &(mrange[i]->mpr_range_uint32[0]);
3381 for (j = 0; j < mrange[i]->mpr_count; j++) {
3382 ta[m].bval = ur[j].mpur_min;
3383 ta[m++].btype = 1;
3384 ta[m].bval = ur[j].mpur_max;
3385 ta[m++].btype = -1;
3386 }
3387 }
3388
3389 /*
3390 * Sort the temporary array in ascending order of bval;
3391 * if boundary values are same then sort on btype.
3392 */
3393 for (i = 0; i < m-1; i++) {
3394 for (j = i+1; j < m; j++) {
3395 if ((ta[i].bval > ta[j].bval) ||
3396 ((ta[i].bval == ta[j].bval) &&
3397 (ta[i].btype < ta[j].btype))) {
3398 temp = ta[i];
3399 ta[i] = ta[j];
3400 ta[j] = temp;
3401 }
3402 }
3403 }
3404
3405 /* Walk through temporary array to find all ranges in the results */
3406 for (i = 0, sum = 0, rcount = 0; i < m; i++) {
3407 sum += ta[i].btype;
3408 if (sum == mcount) {
3409 rval[rcount].mpur_min = ta[i].bval;
3410 range_started = B_TRUE;
3411 } else if (sum < mcount && range_started) {
3412 rval[rcount++].mpur_max = ta[i].bval;
3413 range_started = B_FALSE;
3414 }
3415 }
3416
3417 *prval = rval;
3418 *prmaxcnt = rmaxcnt;
3419 *prcount = rcount;
3420
3421 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
3422 }
3423
3424 /*
3425 * Returns the mtu ranges which could be supported by aggr group.
3426 * prmaxcnt returns the size of the buffer prval, prcount returns
3427 * the number of valid entries in prval. Caller is responsible
3428 * for freeing up prval.
3429 */
3430 int
aggr_grp_possible_mtu_range(aggr_grp_t * grp,mac_propval_uint32_range_t ** prval,int * prmaxcnt,int * prcount)3431 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
3432 int *prmaxcnt, int *prcount)
3433 {
3434 mac_propval_range_t **vals;
3435 aggr_port_t *port;
3436 mac_perim_handle_t mph;
3437 uint_t i, numr;
3438 int err = 0;
3439 size_t sz_propval, sz_range32;
3440 size_t size;
3441
3442 sz_propval = sizeof (mac_propval_range_t);
3443 sz_range32 = sizeof (mac_propval_uint32_range_t);
3444
3445 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3446
3447 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
3448 KM_SLEEP);
3449
3450 for (port = grp->lg_ports, i = 0; port != NULL;
3451 port = port->lp_next, i++) {
3452
3453 size = sz_propval;
3454 vals[i] = kmem_alloc(size, KM_SLEEP);
3455 vals[i]->mpr_count = 1;
3456
3457 mac_perim_enter_by_mh(port->lp_mh, &mph);
3458
3459 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3460 NULL, 0, vals[i], NULL);
3461 if (err == ENOSPC) {
3462 /*
3463 * Not enough space to hold all ranges.
3464 * Allocate extra space as indicated and retry.
3465 */
3466 numr = vals[i]->mpr_count;
3467 kmem_free(vals[i], sz_propval);
3468 size = sz_propval + (numr - 1) * sz_range32;
3469 vals[i] = kmem_alloc(size, KM_SLEEP);
3470 vals[i]->mpr_count = numr;
3471 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3472 NULL, 0, vals[i], NULL);
3473 ASSERT(err != ENOSPC);
3474 }
3475 mac_perim_exit(mph);
3476 if (err != 0) {
3477 kmem_free(vals[i], size);
3478 vals[i] = NULL;
3479 break;
3480 }
3481 }
3482
3483 /*
3484 * if any of the underlying ports does not support changing MTU then
3485 * just return ENOTSUP
3486 */
3487 if (port != NULL) {
3488 ASSERT(err != 0);
3489 goto done;
3490 }
3491
3492 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
3493 prcount);
3494
3495 done:
3496 for (i = 0; i < grp->lg_nports; i++) {
3497 if (vals[i] != NULL) {
3498 numr = vals[i]->mpr_count;
3499 size = sz_propval + (numr - 1) * sz_range32;
3500 kmem_free(vals[i], size);
3501 }
3502 }
3503
3504 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
3505 return (err);
3506 }
3507
3508 static void
aggr_m_propinfo(void * m_driver,const char * pr_name,mac_prop_id_t pr_num,mac_prop_info_handle_t prh)3509 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3510 mac_prop_info_handle_t prh)
3511 {
3512 aggr_grp_t *grp = m_driver;
3513 mac_propval_uint32_range_t *rval = NULL;
3514 int i, rcount, rmaxcnt;
3515 int err = 0;
3516
3517 _NOTE(ARGUNUSED(pr_name));
3518
3519 switch (pr_num) {
3520 case MAC_PROP_MTU:
3521
3522 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
3523 &rcount);
3524 if (err != 0) {
3525 ASSERT(rval == NULL);
3526 return;
3527 }
3528 for (i = 0; i < rcount; i++) {
3529 mac_prop_info_set_range_uint32(prh,
3530 rval[i].mpur_min, rval[i].mpur_max);
3531 }
3532 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3533 break;
3534 }
3535 }
3536