1/*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/
2
3/*
4 * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
5 * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org>
6 * Copyright (c) 2014, 2016 Marcelo Araujo <araujo@FreeBSD.org>
7 *
8 * Permission to use, copy, modify, and distribute this software for any
9 * purpose with or without fee is hereby granted, provided that the above
10 * copyright notice and this permission notice appear in all copies.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
13 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
14 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
15 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
16 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
17 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
18 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 */
20
21#include <sys/cdefs.h>
22__FBSDID("$FreeBSD$");
23
24#include "opt_inet.h"
25#include "opt_inet6.h"
26#include "opt_kern_tls.h"
27#include "opt_ratelimit.h"
28
29#include <sys/param.h>
30#include <sys/kernel.h>
31#include <sys/malloc.h>
32#include <sys/mbuf.h>
33#include <sys/queue.h>
34#include <sys/socket.h>
35#include <sys/sockio.h>
36#include <sys/sysctl.h>
37#include <sys/module.h>
38#include <sys/priv.h>
39#include <sys/systm.h>
40#include <sys/proc.h>
41#include <sys/lock.h>
42#include <sys/rmlock.h>
43#include <sys/sx.h>
44#include <sys/taskqueue.h>
45#include <sys/eventhandler.h>
46
47#include <net/ethernet.h>
48#include <net/if.h>
49#include <net/if_clone.h>
50#include <net/if_arp.h>
51#include <net/if_dl.h>
52#include <net/if_media.h>
53#include <net/if_types.h>
54#include <net/if_var.h>
55#include <net/bpf.h>
56#include <net/route.h>
57#include <net/vnet.h>
58
59#if defined(INET) || defined(INET6)
60#include <netinet/in.h>
61#include <netinet/ip.h>
62#endif
63#ifdef INET
64#include <netinet/in_systm.h>
65#include <netinet/if_ether.h>
66#endif
67
68#ifdef INET6
69#include <netinet/ip6.h>
70#include <netinet6/in6_var.h>
71#include <netinet6/in6_ifattach.h>
72#endif
73
74#include <net/if_vlan_var.h>
75#include <net/if_lagg.h>
76#include <net/ieee8023ad_lacp.h>
77
78#ifdef INET6
/*
 * XXX: declared here to avoid including many inet6-related files;
 * this should probably be generalized.
 */
extern void	nd6_setmtu(struct ifnet *);
#endif
85
86#define	LAGG_RLOCK()	struct epoch_tracker lagg_et; epoch_enter_preempt(net_epoch_preempt, &lagg_et)
87#define	LAGG_RUNLOCK()	epoch_exit_preempt(net_epoch_preempt, &lagg_et)
88#define	LAGG_RLOCK_ASSERT()	NET_EPOCH_ASSERT()
89#define	LAGG_UNLOCK_ASSERT()	MPASS(!in_epoch(net_epoch_preempt))
90
91#define	LAGG_SX_INIT(_sc)	sx_init(&(_sc)->sc_sx, "if_lagg sx")
92#define	LAGG_SX_DESTROY(_sc)	sx_destroy(&(_sc)->sc_sx)
93#define	LAGG_XLOCK(_sc)		sx_xlock(&(_sc)->sc_sx)
94#define	LAGG_XUNLOCK(_sc)	sx_xunlock(&(_sc)->sc_sx)
95#define	LAGG_SXLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_LOCKED)
96#define	LAGG_XLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_XLOCKED)
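/*
 * Note on the locking model (an illustrative sketch, not a normative API
 * description): LAGG_RLOCK()/LAGG_RUNLOCK() bracket a read-side network
 * epoch section and must be paired within a single lexical scope, because
 * LAGG_RLOCK() declares the epoch tracker as a local variable.  The sx
 * lock serializes configuration changes.  A typical read-side user looks
 * roughly like:
 *
 *	LAGG_RLOCK();
 *	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
 *		use_port(lp);		/- hypothetical per-port work -/
 *	LAGG_RUNLOCK();
 *
 * while configuration paths take LAGG_XLOCK(sc) ... LAGG_XUNLOCK(sc).
 * "use_port" is a placeholder name used only in this sketch.
 */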
97
98/* Special flags we should propagate to the lagg ports. */
99static struct {
100	int flag;
101	int (*func)(struct ifnet *, int);
102} lagg_pflags[] = {
103	{IFF_PROMISC, ifpromisc},
104	{IFF_ALLMULTI, if_allmulti},
105	{0, NULL}
106};
107
108struct lagg_snd_tag {
109	struct m_snd_tag com;
110	struct m_snd_tag *tag;
111};
112
113VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */
114#define	V_lagg_list	VNET(lagg_list)
115VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx);
116#define	V_lagg_list_mtx	VNET(lagg_list_mtx)
117#define	LAGG_LIST_LOCK_INIT(x)		mtx_init(&V_lagg_list_mtx, \
118					"if_lagg list", NULL, MTX_DEF)
119#define	LAGG_LIST_LOCK_DESTROY(x)	mtx_destroy(&V_lagg_list_mtx)
120#define	LAGG_LIST_LOCK(x)		mtx_lock(&V_lagg_list_mtx)
121#define	LAGG_LIST_UNLOCK(x)		mtx_unlock(&V_lagg_list_mtx)
122eventhandler_tag	lagg_detach_cookie = NULL;
123
124static int	lagg_clone_create(struct if_clone *, int, caddr_t);
125static void	lagg_clone_destroy(struct ifnet *);
126VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner);
127#define	V_lagg_cloner	VNET(lagg_cloner)
128static const char laggname[] = "lagg";
129static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface");
130
131static void	lagg_capabilities(struct lagg_softc *);
132static int	lagg_port_create(struct lagg_softc *, struct ifnet *);
133static int	lagg_port_destroy(struct lagg_port *, int);
134static struct mbuf *lagg_input(struct ifnet *, struct mbuf *);
135static void	lagg_linkstate(struct lagg_softc *);
136static void	lagg_port_state(struct ifnet *, int);
137static int	lagg_port_ioctl(struct ifnet *, u_long, caddr_t);
138static int	lagg_port_output(struct ifnet *, struct mbuf *,
139		    const struct sockaddr *, struct route *);
140static void	lagg_port_ifdetach(void *arg __unused, struct ifnet *);
141#ifdef LAGG_PORT_STACKING
142static int	lagg_port_checkstacking(struct lagg_softc *);
143#endif
144static void	lagg_port2req(struct lagg_port *, struct lagg_reqport *);
145static void	lagg_init(void *);
146static void	lagg_stop(struct lagg_softc *);
147static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
148#if defined(KERN_TLS) || defined(RATELIMIT)
149static int	lagg_snd_tag_alloc(struct ifnet *,
150		    union if_snd_tag_alloc_params *,
151		    struct m_snd_tag **);
152static int	lagg_snd_tag_modify(struct m_snd_tag *,
153		    union if_snd_tag_modify_params *);
154static int	lagg_snd_tag_query(struct m_snd_tag *,
155		    union if_snd_tag_query_params *);
156static void	lagg_snd_tag_free(struct m_snd_tag *);
157static void     lagg_ratelimit_query(struct ifnet *,
158		    struct if_ratelimit_query_results *);
159#endif
160static int	lagg_setmulti(struct lagg_port *);
161static int	lagg_clrmulti(struct lagg_port *);
162static	int	lagg_setcaps(struct lagg_port *, int cap);
163static	int	lagg_setflag(struct lagg_port *, int, int,
164		    int (*func)(struct ifnet *, int));
165static	int	lagg_setflags(struct lagg_port *, int status);
166static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt);
167static int	lagg_transmit(struct ifnet *, struct mbuf *);
168static void	lagg_qflush(struct ifnet *);
169static int	lagg_media_change(struct ifnet *);
170static void	lagg_media_status(struct ifnet *, struct ifmediareq *);
171static struct lagg_port *lagg_link_active(struct lagg_softc *,
172	    struct lagg_port *);
173
174/* Simple round robin */
175static void	lagg_rr_attach(struct lagg_softc *);
176static int	lagg_rr_start(struct lagg_softc *, struct mbuf *);
177static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *,
178		    struct mbuf *);
179
180/* Active failover */
181static int	lagg_fail_start(struct lagg_softc *, struct mbuf *);
182static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
183		    struct mbuf *);
184
185/* Loadbalancing */
186static void	lagg_lb_attach(struct lagg_softc *);
187static void	lagg_lb_detach(struct lagg_softc *);
188static int	lagg_lb_port_create(struct lagg_port *);
189static void	lagg_lb_port_destroy(struct lagg_port *);
190static int	lagg_lb_start(struct lagg_softc *, struct mbuf *);
191static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *,
192		    struct mbuf *);
193static int	lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);
194
195/* Broadcast */
196static int    lagg_bcast_start(struct lagg_softc *, struct mbuf *);
197static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *,
198		    struct mbuf *);
199
200/* 802.3ad LACP */
201static void	lagg_lacp_attach(struct lagg_softc *);
202static void	lagg_lacp_detach(struct lagg_softc *);
203static int	lagg_lacp_start(struct lagg_softc *, struct mbuf *);
204static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
205		    struct mbuf *);
206static void	lagg_lacp_lladdr(struct lagg_softc *);
207
208/* lagg protocol table */
209static const struct lagg_proto {
210	lagg_proto	pr_num;
211	void		(*pr_attach)(struct lagg_softc *);
212	void		(*pr_detach)(struct lagg_softc *);
213	int		(*pr_start)(struct lagg_softc *, struct mbuf *);
214	struct mbuf *	(*pr_input)(struct lagg_softc *, struct lagg_port *,
215			    struct mbuf *);
216	int		(*pr_addport)(struct lagg_port *);
217	void		(*pr_delport)(struct lagg_port *);
218	void		(*pr_linkstate)(struct lagg_port *);
219	void 		(*pr_init)(struct lagg_softc *);
220	void 		(*pr_stop)(struct lagg_softc *);
221	void 		(*pr_lladdr)(struct lagg_softc *);
222	void		(*pr_request)(struct lagg_softc *, void *);
223	void		(*pr_portreq)(struct lagg_port *, void *);
224} lagg_protos[] = {
225    {
226	.pr_num = LAGG_PROTO_NONE
227    },
228    {
229	.pr_num = LAGG_PROTO_ROUNDROBIN,
230	.pr_attach = lagg_rr_attach,
231	.pr_start = lagg_rr_start,
232	.pr_input = lagg_rr_input,
233    },
234    {
235	.pr_num = LAGG_PROTO_FAILOVER,
236	.pr_start = lagg_fail_start,
237	.pr_input = lagg_fail_input,
238    },
239    {
240	.pr_num = LAGG_PROTO_LOADBALANCE,
241	.pr_attach = lagg_lb_attach,
242	.pr_detach = lagg_lb_detach,
243	.pr_start = lagg_lb_start,
244	.pr_input = lagg_lb_input,
245	.pr_addport = lagg_lb_port_create,
246	.pr_delport = lagg_lb_port_destroy,
247    },
248    {
249	.pr_num = LAGG_PROTO_LACP,
250	.pr_attach = lagg_lacp_attach,
251	.pr_detach = lagg_lacp_detach,
252	.pr_start = lagg_lacp_start,
253	.pr_input = lagg_lacp_input,
254	.pr_addport = lacp_port_create,
255	.pr_delport = lacp_port_destroy,
256	.pr_linkstate = lacp_linkstate,
257	.pr_init = lacp_init,
258	.pr_stop = lacp_stop,
259	.pr_lladdr = lagg_lacp_lladdr,
260	.pr_request = lacp_req,
261	.pr_portreq = lacp_portreq,
262    },
263    {
264	.pr_num = LAGG_PROTO_BROADCAST,
265	.pr_start = lagg_bcast_start,
266	.pr_input = lagg_bcast_input,
267    },
268};
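/*
 * The lagg_proto_*() wrappers below dispatch through this table by
 * indexing it with sc->sc_proto, which assumes each entry sits at the
 * array position matching its pr_num value.  An optional handler is
 * dispatched roughly like this (mirroring lagg_proto_linkstate() below):
 *
 *	if (lagg_protos[sc->sc_proto].pr_linkstate != NULL)
 *		lagg_protos[sc->sc_proto].pr_linkstate(lp);
 *
 * pr_start and pr_input are called unconditionally once a protocol other
 * than LAGG_PROTO_NONE is selected; the NONE case is filtered out in
 * lagg_transmit() and lagg_input().
 */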
269
270SYSCTL_DECL(_net_link);
271SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
272    "Link Aggregation");
273
274/* Allow input on any failover links */
275VNET_DEFINE_STATIC(int, lagg_failover_rx_all);
276#define	V_lagg_failover_rx_all	VNET(lagg_failover_rx_all)
277SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET,
278    &VNET_NAME(lagg_failover_rx_all), 0,
279    "Accept input from any interface in a failover lagg");
280
281/* Default value for using flowid */
282VNET_DEFINE_STATIC(int, def_use_flowid) = 0;
283#define	V_def_use_flowid	VNET(def_use_flowid)
284SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN,
285    &VNET_NAME(def_use_flowid), 0,
286    "Default setting for using flow id for load sharing");
287
288/* Default value for using numa */
289VNET_DEFINE_STATIC(int, def_use_numa) = 1;
290#define	V_def_use_numa	VNET(def_use_numa)
291SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa, CTLFLAG_RWTUN,
292    &VNET_NAME(def_use_numa), 0,
293    "Use numa to steer flows");
294
295/* Default value for flowid shift */
296VNET_DEFINE_STATIC(int, def_flowid_shift) = 16;
297#define	V_def_flowid_shift	VNET(def_flowid_shift)
298SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN,
299    &VNET_NAME(def_flowid_shift), 0,
300    "Default setting for flowid shift for load sharing");
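/*
 * Sketch of how the flowid shift is applied when flowid-based load
 * sharing is enabled (see lookup_snd_tag_port() and the protocol start
 * routines for the authoritative versions): the shifted flowid is
 * reduced modulo the port count, so with the default shift of 16 only
 * the upper bits of the RSS hash select the egress port.  Roughly:
 *
 *	uint32_t p;
 *
 *	p = (m->m_pkthdr.flowid >> sc->flowid_shift) % sc->sc_count;
 *
 * This is only an illustration of the arithmetic, not additional driver
 * logic.
 */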
301
302static void
303vnet_lagg_init(const void *unused __unused)
304{
305
306	LAGG_LIST_LOCK_INIT();
307	SLIST_INIT(&V_lagg_list);
308	V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create,
309	    lagg_clone_destroy, 0);
310}
311VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
312    vnet_lagg_init, NULL);
313
314static void
315vnet_lagg_uninit(const void *unused __unused)
316{
317
318	if_clone_detach(V_lagg_cloner);
319	LAGG_LIST_LOCK_DESTROY();
320}
321VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
322    vnet_lagg_uninit, NULL);
323
324static int
325lagg_modevent(module_t mod, int type, void *data)
326{
327
328	switch (type) {
329	case MOD_LOAD:
330		lagg_input_p = lagg_input;
331		lagg_linkstate_p = lagg_port_state;
332		lagg_detach_cookie = EVENTHANDLER_REGISTER(
333		    ifnet_departure_event, lagg_port_ifdetach, NULL,
334		    EVENTHANDLER_PRI_ANY);
335		break;
336	case MOD_UNLOAD:
337		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
338		    lagg_detach_cookie);
339		lagg_input_p = NULL;
340		lagg_linkstate_p = NULL;
341		break;
342	default:
343		return (EOPNOTSUPP);
344	}
345	return (0);
346}
347
348static moduledata_t lagg_mod = {
349	"if_lagg",
350	lagg_modevent,
351	0
352};
353
354DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
355MODULE_VERSION(if_lagg, 1);
356
357static void
358lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr)
359{
360
361	LAGG_XLOCK_ASSERT(sc);
362	KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto",
363	    __func__, sc));
364
365	if (sc->sc_ifflags & IFF_DEBUG)
366		if_printf(sc->sc_ifp, "using proto %u\n", pr);
367
368	if (lagg_protos[pr].pr_attach != NULL)
369		lagg_protos[pr].pr_attach(sc);
370	sc->sc_proto = pr;
371}
372
373static void
374lagg_proto_detach(struct lagg_softc *sc)
375{
376	lagg_proto pr;
377
378	LAGG_XLOCK_ASSERT(sc);
379	pr = sc->sc_proto;
380	sc->sc_proto = LAGG_PROTO_NONE;
381
382	if (lagg_protos[pr].pr_detach != NULL)
383		lagg_protos[pr].pr_detach(sc);
384}
385
386static int
387lagg_proto_start(struct lagg_softc *sc, struct mbuf *m)
388{
389
390	return (lagg_protos[sc->sc_proto].pr_start(sc, m));
391}
392
393static struct mbuf *
394lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
395{
396
397	return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m));
398}
399
400static int
401lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp)
402{
403
404	if (lagg_protos[sc->sc_proto].pr_addport == NULL)
405		return (0);
406	else
407		return (lagg_protos[sc->sc_proto].pr_addport(lp));
408}
409
410static void
411lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp)
412{
413
414	if (lagg_protos[sc->sc_proto].pr_delport != NULL)
415		lagg_protos[sc->sc_proto].pr_delport(lp);
416}
417
418static void
419lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp)
420{
421
422	if (lagg_protos[sc->sc_proto].pr_linkstate != NULL)
423		lagg_protos[sc->sc_proto].pr_linkstate(lp);
424}
425
426static void
427lagg_proto_init(struct lagg_softc *sc)
428{
429
430	if (lagg_protos[sc->sc_proto].pr_init != NULL)
431		lagg_protos[sc->sc_proto].pr_init(sc);
432}
433
434static void
435lagg_proto_stop(struct lagg_softc *sc)
436{
437
438	if (lagg_protos[sc->sc_proto].pr_stop != NULL)
439		lagg_protos[sc->sc_proto].pr_stop(sc);
440}
441
442static void
443lagg_proto_lladdr(struct lagg_softc *sc)
444{
445
446	if (lagg_protos[sc->sc_proto].pr_lladdr != NULL)
447		lagg_protos[sc->sc_proto].pr_lladdr(sc);
448}
449
450static void
451lagg_proto_request(struct lagg_softc *sc, void *v)
452{
453
454	if (lagg_protos[sc->sc_proto].pr_request != NULL)
455		lagg_protos[sc->sc_proto].pr_request(sc, v);
456}
457
458static void
459lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v)
460{
461
462	if (lagg_protos[sc->sc_proto].pr_portreq != NULL)
463		lagg_protos[sc->sc_proto].pr_portreq(lp, v);
464}
465
/*
 * This routine is run via a vlan
 * config EVENT.
 */
470static void
471lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
472{
473	struct lagg_softc *sc = ifp->if_softc;
474	struct lagg_port *lp;
475
476	if (ifp->if_softc !=  arg)   /* Not our event */
477		return;
478
479	LAGG_RLOCK();
480	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
481		EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag);
482	LAGG_RUNLOCK();
483}
484
/*
 * This routine is run via a vlan
 * unconfig EVENT.
 */
489static void
490lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
491{
492	struct lagg_softc *sc = ifp->if_softc;
493	struct lagg_port *lp;
494
495	if (ifp->if_softc !=  arg)   /* Not our event */
496		return;
497
498	LAGG_RLOCK();
499	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
500		EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag);
501	LAGG_RUNLOCK();
502}
503
504static int
505lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
506{
507	struct lagg_softc *sc;
508	struct ifnet *ifp;
509	static const u_char eaddr[6];	/* 00:00:00:00:00:00 */
510
511	sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO);
512	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
513	if (ifp == NULL) {
514		free(sc, M_LAGG);
515		return (ENOSPC);
516	}
517	LAGG_SX_INIT(sc);
518
519	LAGG_XLOCK(sc);
520	if (V_def_use_flowid)
521		sc->sc_opts |= LAGG_OPT_USE_FLOWID;
522	if (V_def_use_numa)
523		sc->sc_opts |= LAGG_OPT_USE_NUMA;
524	sc->flowid_shift = V_def_flowid_shift;
525
526	/* Hash all layers by default */
527	sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4;
528
529	lagg_proto_attach(sc, LAGG_PROTO_DEFAULT);
530
531	CK_SLIST_INIT(&sc->sc_ports);
532
533	/* Initialise pseudo media types */
534	ifmedia_init(&sc->sc_media, 0, lagg_media_change,
535	    lagg_media_status);
536	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
537	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
538
539	if_initname(ifp, laggname, unit);
540	ifp->if_softc = sc;
541	ifp->if_transmit = lagg_transmit;
542	ifp->if_qflush = lagg_qflush;
543	ifp->if_init = lagg_init;
544	ifp->if_ioctl = lagg_ioctl;
545	ifp->if_get_counter = lagg_get_counter;
546	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
547#if defined(KERN_TLS) || defined(RATELIMIT)
548	ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
549	ifp->if_snd_tag_modify = lagg_snd_tag_modify;
550	ifp->if_snd_tag_query = lagg_snd_tag_query;
551	ifp->if_snd_tag_free = lagg_snd_tag_free;
552	ifp->if_ratelimit_query = lagg_ratelimit_query;
553#endif
554	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
555
	/*
	 * Attach as an ordinary ethernet device; children will be attached
	 * as the special device type IFT_IEEE8023ADLAG.
	 */
560	ether_ifattach(ifp, eaddr);
561
562	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
563		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
564	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
565		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
566
567	/* Insert into the global list of laggs */
568	LAGG_LIST_LOCK();
569	SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries);
570	LAGG_LIST_UNLOCK();
571	LAGG_XUNLOCK(sc);
572
573	return (0);
574}
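/*
 * Usage note (not part of the driver logic): once the cloner registered
 * above is in place, a lagg interface is typically created and populated
 * from userland roughly as described in lagg(4), e.g.
 *
 *	ifconfig lagg0 create
 *	ifconfig lagg0 laggproto lacp laggport em0 laggport em1 up
 *
 * The member names and the chosen protocol here are only an example.
 */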
575
576static void
577lagg_clone_destroy(struct ifnet *ifp)
578{
579	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
580	struct lagg_port *lp;
581
582	LAGG_XLOCK(sc);
583	sc->sc_destroying = 1;
584	lagg_stop(sc);
585	ifp->if_flags &= ~IFF_UP;
586
587	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
588	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);
589
590	/* Shutdown and remove lagg ports */
591	while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL)
592		lagg_port_destroy(lp, 1);
593
594	/* Unhook the aggregation protocol */
595	lagg_proto_detach(sc);
596	LAGG_XUNLOCK(sc);
597
598	ifmedia_removeall(&sc->sc_media);
599	ether_ifdetach(ifp);
600	if_free(ifp);
601
602	LAGG_LIST_LOCK();
603	SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries);
604	LAGG_LIST_UNLOCK();
605
606	LAGG_SX_DESTROY(sc);
607	free(sc, M_LAGG);
608}
609
610static void
611lagg_capabilities(struct lagg_softc *sc)
612{
613	struct lagg_port *lp;
614	int cap, ena, pena;
615	uint64_t hwa;
616	struct ifnet_hw_tsomax hw_tsomax;
617
618	LAGG_XLOCK_ASSERT(sc);
619
620	/* Get common enabled capabilities for the lagg ports */
621	ena = ~0;
622	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
623		ena &= lp->lp_ifp->if_capenable;
624	ena = (ena == ~0 ? 0 : ena);
625
626	/*
627	 * Apply common enabled capabilities back to the lagg ports.
628	 * May require several iterations if they are dependent.
629	 */
630	do {
631		pena = ena;
632		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
633			lagg_setcaps(lp, ena);
634			ena &= lp->lp_ifp->if_capenable;
635		}
636	} while (pena != ena);
637
638	/* Get other capabilities from the lagg ports */
639	cap = ~0;
640	hwa = ~(uint64_t)0;
641	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
642	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
643		cap &= lp->lp_ifp->if_capabilities;
644		hwa &= lp->lp_ifp->if_hwassist;
645		if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax);
646	}
647	cap = (cap == ~0 ? 0 : cap);
648	hwa = (hwa == ~(uint64_t)0 ? 0 : hwa);
649
650	if (sc->sc_ifp->if_capabilities != cap ||
651	    sc->sc_ifp->if_capenable != ena ||
652	    sc->sc_ifp->if_hwassist != hwa ||
653	    if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) {
654		sc->sc_ifp->if_capabilities = cap;
655		sc->sc_ifp->if_capenable = ena;
656		sc->sc_ifp->if_hwassist = hwa;
657		getmicrotime(&sc->sc_ifp->if_lastchange);
658
659		if (sc->sc_ifflags & IFF_DEBUG)
660			if_printf(sc->sc_ifp,
661			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
662	}
663}
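/*
 * Sketch of the convergence argument for the loop in lagg_capabilities()
 * above (illustrative only): the intersection of the ports' if_capenable
 * masks can only lose bits on each pass, so the do/while loop terminates
 * after at most as many iterations as there are capability bits.
 *
 *	ena = ~0;
 *	do {
 *		pena = ena;
 *		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
 *			lagg_setcaps(lp, ena);
 *			ena &= lp->lp_ifp->if_capenable;
 *		}
 *	} while (pena != ena);
 *
 * Each pass requests "ena" on every port and then intersects with what
 * the ports actually accepted; since the result is ANDed, it is
 * monotonically non-increasing.
 */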
664
665static int
666lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
667{
668	struct lagg_softc *sc_ptr;
669	struct lagg_port *lp, *tlp;
670	struct ifreq ifr;
671	int error, i, oldmtu;
672	uint64_t *pval;
673
674	LAGG_XLOCK_ASSERT(sc);
675
676	if (sc->sc_ifp == ifp) {
677		if_printf(sc->sc_ifp,
678		    "cannot add a lagg to itself as a port\n");
679		return (EINVAL);
680	}
681
682	if (sc->sc_destroying == 1)
683		return (ENXIO);
684
685	/* Limit the maximal number of lagg ports */
686	if (sc->sc_count >= LAGG_MAX_PORTS)
687		return (ENOSPC);
688
689	/* Check if port has already been associated to a lagg */
690	if (ifp->if_lagg != NULL) {
691		/* Port is already in the current lagg? */
692		lp = (struct lagg_port *)ifp->if_lagg;
693		if (lp->lp_softc == sc)
694			return (EEXIST);
695		return (EBUSY);
696	}
697
	/* XXX Disallow non-ethernet interfaces (this should cover any 802 media) */
699	if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN)
700		return (EPROTONOSUPPORT);
701
702	/* Allow the first Ethernet member to define the MTU */
703	oldmtu = -1;
704	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
705		sc->sc_ifp->if_mtu = ifp->if_mtu;
706	} else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
707		if (ifp->if_ioctl == NULL) {
708			if_printf(sc->sc_ifp, "cannot change MTU for %s\n",
709			    ifp->if_xname);
710			return (EINVAL);
711		}
712		oldmtu = ifp->if_mtu;
713		strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name));
714		ifr.ifr_mtu = sc->sc_ifp->if_mtu;
715		error = (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
716		if (error != 0) {
717			if_printf(sc->sc_ifp, "invalid MTU for %s\n",
718			    ifp->if_xname);
719			return (error);
720		}
721		ifr.ifr_mtu = oldmtu;
722	}
723
724	lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO);
725	lp->lp_softc = sc;
726
727	/* Check if port is a stacked lagg */
728	LAGG_LIST_LOCK();
729	SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) {
730		if (ifp == sc_ptr->sc_ifp) {
731			LAGG_LIST_UNLOCK();
732			free(lp, M_LAGG);
733			if (oldmtu != -1)
734				(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
735				    (caddr_t)&ifr);
736			return (EINVAL);
737			/* XXX disable stacking for the moment, its untested */
738#ifdef LAGG_PORT_STACKING
739			lp->lp_flags |= LAGG_PORT_STACK;
740			if (lagg_port_checkstacking(sc_ptr) >=
741			    LAGG_MAX_STACKING) {
742				LAGG_LIST_UNLOCK();
743				free(lp, M_LAGG);
744				if (oldmtu != -1)
745					(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
746					    (caddr_t)&ifr);
747				return (E2BIG);
748			}
749#endif
750		}
751	}
752	LAGG_LIST_UNLOCK();
753
754	if_ref(ifp);
755	lp->lp_ifp = ifp;
756
757	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN);
758	lp->lp_ifcapenable = ifp->if_capenable;
759	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
760		bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
761		lagg_proto_lladdr(sc);
762		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
763	} else {
764		if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
765	}
766	lagg_setflags(lp, 1);
767
768	if (CK_SLIST_EMPTY(&sc->sc_ports))
769		sc->sc_primary = lp;
770
771	/* Change the interface type */
772	lp->lp_iftype = ifp->if_type;
773	ifp->if_type = IFT_IEEE8023ADLAG;
774	ifp->if_lagg = lp;
775	lp->lp_ioctl = ifp->if_ioctl;
776	ifp->if_ioctl = lagg_port_ioctl;
777	lp->lp_output = ifp->if_output;
778	ifp->if_output = lagg_port_output;
779
780	/* Read port counters */
781	pval = lp->port_counters.val;
782	for (i = 0; i < IFCOUNTERS; i++, pval++)
783		*pval = ifp->if_get_counter(ifp, i);
784
	/*
	 * Insert into the list of ports.
	 * Keep the ports sorted by if_index so that configuration is
	 * predictable: the same sequence of `ifconfig laggN create ...`
	 * commands leads to the same result each time.
	 */
791	CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) {
792		if (tlp->lp_ifp->if_index < ifp->if_index && (
793		    CK_SLIST_NEXT(tlp, lp_entries) == NULL ||
794		    ((struct  lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index >
795		    ifp->if_index))
796			break;
797	}
798	if (tlp != NULL)
799		CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries);
800	else
801		CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
802	sc->sc_count++;
803
804	lagg_setmulti(lp);
805
806	if ((error = lagg_proto_addport(sc, lp)) != 0) {
807		/* Remove the port, without calling pr_delport. */
808		lagg_port_destroy(lp, 0);
809		if (oldmtu != -1)
810			(*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
811		return (error);
812	}
813
814	/* Update lagg capabilities */
815	lagg_capabilities(sc);
816	lagg_linkstate(sc);
817
818	return (0);
819}
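/*
 * Illustrative sketch of the ordered insert performed at the end of
 * lagg_port_create() above: the new port is placed after the last
 * existing port whose if_index is smaller, which keeps the CK_SLIST
 * sorted by if_index regardless of the order in which members are added.
 * Assuming hypothetical names "pos" and "new_index", the same idea reads:
 *
 *	pos = NULL;
 *	CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries)
 *		if (tlp->lp_ifp->if_index < new_index)
 *			pos = tlp;
 *	if (pos != NULL)
 *		CK_SLIST_INSERT_AFTER(pos, lp, lp_entries);
 *	else
 *		CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
 *
 * The real loop above expresses this with an early break over the
 * already-sorted list.
 */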
820
821#ifdef LAGG_PORT_STACKING
822static int
823lagg_port_checkstacking(struct lagg_softc *sc)
824{
825	struct lagg_softc *sc_ptr;
826	struct lagg_port *lp;
827	int m = 0;
828
829	LAGG_SXLOCK_ASSERT(sc);
830	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
831		if (lp->lp_flags & LAGG_PORT_STACK) {
832			sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc;
833			m = MAX(m, lagg_port_checkstacking(sc_ptr));
834		}
835	}
836
837	return (m + 1);
838}
839#endif
840
841static void
842lagg_port_destroy_cb(epoch_context_t ec)
843{
844	struct lagg_port *lp;
845	struct ifnet *ifp;
846
847	lp = __containerof(ec, struct lagg_port, lp_epoch_ctx);
848	ifp = lp->lp_ifp;
849
850	if_rele(ifp);
851	free(lp, M_LAGG);
852}
853
854static int
855lagg_port_destroy(struct lagg_port *lp, int rundelport)
856{
857	struct lagg_softc *sc = lp->lp_softc;
858	struct lagg_port *lp_ptr, *lp0;
859	struct ifnet *ifp = lp->lp_ifp;
860	uint64_t *pval, vdiff;
861	int i;
862
863	LAGG_XLOCK_ASSERT(sc);
864
865	if (rundelport)
866		lagg_proto_delport(sc, lp);
867
868	if (lp->lp_detaching == 0)
869		lagg_clrmulti(lp);
870
871	/* Restore interface */
872	ifp->if_type = lp->lp_iftype;
873	ifp->if_ioctl = lp->lp_ioctl;
874	ifp->if_output = lp->lp_output;
875	ifp->if_lagg = NULL;
876
877	/* Update detached port counters */
878	pval = lp->port_counters.val;
879	for (i = 0; i < IFCOUNTERS; i++, pval++) {
880		vdiff = ifp->if_get_counter(ifp, i) - *pval;
881		sc->detached_counters.val[i] += vdiff;
882	}
883
884	/* Finally, remove the port from the lagg */
885	CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
886	sc->sc_count--;
887
888	/* Update the primary interface */
889	if (lp == sc->sc_primary) {
890		uint8_t lladdr[ETHER_ADDR_LEN];
891
892		if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL)
893			bzero(&lladdr, ETHER_ADDR_LEN);
894		else
895			bcopy(lp0->lp_lladdr, lladdr, ETHER_ADDR_LEN);
896		sc->sc_primary = lp0;
897		if (sc->sc_destroying == 0) {
898			bcopy(lladdr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
899			lagg_proto_lladdr(sc);
900			EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
901		}
902
903		/*
904		 * Update lladdr for each port (new primary needs update
905		 * as well, to switch from old lladdr to its 'real' one)
906		 */
907		CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
908			if_setlladdr(lp_ptr->lp_ifp, lladdr, ETHER_ADDR_LEN);
909	}
910
911	if (lp->lp_ifflags)
912		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);
913
914	if (lp->lp_detaching == 0) {
915		lagg_setflags(lp, 0);
916		lagg_setcaps(lp, lp->lp_ifcapenable);
917		if_setlladdr(ifp, lp->lp_lladdr, ETHER_ADDR_LEN);
918	}
919
	/*
	 * Free the port and release its ifnet reference after a grace
	 * period has elapsed.
	 */
924	NET_EPOCH_CALL(lagg_port_destroy_cb, &lp->lp_epoch_ctx);
925	/* Update lagg capabilities */
926	lagg_capabilities(sc);
927	lagg_linkstate(sc);
928
929	return (0);
930}
931
932static int
933lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
934{
935	struct lagg_reqport *rp = (struct lagg_reqport *)data;
936	struct lagg_softc *sc;
937	struct lagg_port *lp = NULL;
938	int error = 0;
939
940	/* Should be checked by the caller */
941	if (ifp->if_type != IFT_IEEE8023ADLAG ||
942	    (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
943		goto fallback;
944
945	switch (cmd) {
946	case SIOCGLAGGPORT:
947		if (rp->rp_portname[0] == '\0' ||
948		    ifunit(rp->rp_portname) != ifp) {
949			error = EINVAL;
950			break;
951		}
952
953		LAGG_RLOCK();
954		if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) {
955			error = ENOENT;
956			LAGG_RUNLOCK();
957			break;
958		}
959
960		lagg_port2req(lp, rp);
961		LAGG_RUNLOCK();
962		break;
963
964	case SIOCSIFCAP:
965		if (lp->lp_ioctl == NULL) {
966			error = EINVAL;
967			break;
968		}
969		error = (*lp->lp_ioctl)(ifp, cmd, data);
970		if (error)
971			break;
972
973		/* Update lagg interface capabilities */
974		LAGG_XLOCK(sc);
975		lagg_capabilities(sc);
976		LAGG_XUNLOCK(sc);
977		VLAN_CAPABILITIES(sc->sc_ifp);
978		break;
979
980	case SIOCSIFMTU:
981		/* Do not allow the MTU to be changed once joined */
982		error = EINVAL;
983		break;
984
985	default:
986		goto fallback;
987	}
988
989	return (error);
990
991fallback:
992	if (lp != NULL && lp->lp_ioctl != NULL)
993		return ((*lp->lp_ioctl)(ifp, cmd, data));
994
995	return (EINVAL);
996}
997
/*
 * Requests counter @cnt data.
 *
 * The counter value is calculated the following way:
 * 1) For each port, sum the difference between the current and the
 *    "initial" measurements.
 * 2) Add the lagg logical interface counters.
 * 3) Add the data from the detached_counters array.
 *
 * We also do the following on port attach/detach:
 * 1) On port attach, we store all of the port's counters in the
 *    port_counters array.
 * 2) On port detach, we add the difference between the "initial" and the
 *    current counter data to the detached_counters array.
 */
1011static uint64_t
1012lagg_get_counter(struct ifnet *ifp, ift_counter cnt)
1013{
1014	struct lagg_softc *sc;
1015	struct lagg_port *lp;
1016	struct ifnet *lpifp;
1017	uint64_t newval, oldval, vsum;
1018
1019	/* Revise this when we've got non-generic counters. */
1020	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
1021
1022	sc = (struct lagg_softc *)ifp->if_softc;
1023
1024	vsum = 0;
1025	LAGG_RLOCK();
1026	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		/* Value saved when the port was attached */
		oldval = lp->port_counters.val[cnt];
		/* Current value */
		lpifp = lp->lp_ifp;
		newval = lpifp->if_get_counter(lpifp, cnt);
		/* Accumulate the difference */
		vsum += newval - oldval;
1034	}
1035	LAGG_RUNLOCK();
1036
	/*
	 * Add counter data which might be added by upper layer
	 * protocols operating on the logical interface.
	 */
1041	vsum += if_get_counter_default(ifp, cnt);
1042
	/*
	 * Add counter data from the detached ports' counters.
	 */
1046	vsum += sc->detached_counters.val[cnt];
1047
1048	return (vsum);
1049}
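/*
 * Worked example of the aggregation above (illustrative numbers only):
 * suppose a port reported 1000 output packets when it was attached and
 * reports 1500 now, the logical interface itself accounts for 20, and a
 * previously detached port contributed 300 before it left.  Then:
 *
 *	vsum  = (1500 - 1000)		(active port delta)
 *	      + 20			(if_get_counter_default())
 *	      + 300;			(detached_counters)
 *	so vsum == 820.
 *
 * Because only deltas are summed, traffic a port carried before it
 * joined the lagg is never counted.
 */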
1050
1051/*
1052 * For direct output to child ports.
1053 */
1054static int
1055lagg_port_output(struct ifnet *ifp, struct mbuf *m,
1056	const struct sockaddr *dst, struct route *ro)
1057{
1058	struct lagg_port *lp = ifp->if_lagg;
1059
1060	switch (dst->sa_family) {
1061		case pseudo_AF_HDRCMPLT:
1062		case AF_UNSPEC:
1063			return ((*lp->lp_output)(ifp, m, dst, ro));
1064	}
1065
1066	/* drop any other frames */
1067	m_freem(m);
1068	return (ENETDOWN);
1069}
1070
1071static void
1072lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
1073{
1074	struct lagg_port *lp;
1075	struct lagg_softc *sc;
1076
1077	if ((lp = ifp->if_lagg) == NULL)
1078		return;
1079	/* If the ifnet is just being renamed, don't do anything. */
1080	if (ifp->if_flags & IFF_RENAMING)
1081		return;
1082
1083	sc = lp->lp_softc;
1084
1085	LAGG_XLOCK(sc);
1086	lp->lp_detaching = 1;
1087	lagg_port_destroy(lp, 1);
1088	LAGG_XUNLOCK(sc);
1089	VLAN_CAPABILITIES(sc->sc_ifp);
1090}
1091
1092static void
1093lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
1094{
1095	struct lagg_softc *sc = lp->lp_softc;
1096
1097	strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname));
1098	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname));
1099	rp->rp_prio = lp->lp_prio;
1100	rp->rp_flags = lp->lp_flags;
1101	lagg_proto_portreq(sc, lp, &rp->rp_psc);
1102
1103	/* Add protocol specific flags */
1104	switch (sc->sc_proto) {
1105		case LAGG_PROTO_FAILOVER:
1106			if (lp == sc->sc_primary)
1107				rp->rp_flags |= LAGG_PORT_MASTER;
1108			if (lp == lagg_link_active(sc, sc->sc_primary))
1109				rp->rp_flags |= LAGG_PORT_ACTIVE;
1110			break;
1111
1112		case LAGG_PROTO_ROUNDROBIN:
1113		case LAGG_PROTO_LOADBALANCE:
1114		case LAGG_PROTO_BROADCAST:
1115			if (LAGG_PORTACTIVE(lp))
1116				rp->rp_flags |= LAGG_PORT_ACTIVE;
1117			break;
1118
1119		case LAGG_PROTO_LACP:
1120			/* LACP has a different definition of active */
1121			if (lacp_isactive(lp))
1122				rp->rp_flags |= LAGG_PORT_ACTIVE;
1123			if (lacp_iscollecting(lp))
1124				rp->rp_flags |= LAGG_PORT_COLLECTING;
1125			if (lacp_isdistributing(lp))
1126				rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
1127			break;
1128	}
1129
1130}
1131
1132static void
1133lagg_init(void *xsc)
1134{
1135	struct lagg_softc *sc = (struct lagg_softc *)xsc;
1136	struct ifnet *ifp = sc->sc_ifp;
1137	struct lagg_port *lp;
1138
1139	LAGG_XLOCK(sc);
1140	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1141		LAGG_XUNLOCK(sc);
1142		return;
1143	}
1144
1145	ifp->if_drv_flags |= IFF_DRV_RUNNING;
1146
	/*
	 * Update the port lladdrs if needed.
	 * We may be here because of an if_setlladdr() notification
	 * that the lladdr has changed.
	 */
1152	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1153		if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp),
1154		    ETHER_ADDR_LEN) != 0)
1155			if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ETHER_ADDR_LEN);
1156	}
1157
1158	lagg_proto_init(sc);
1159
1160	LAGG_XUNLOCK(sc);
1161}
1162
1163static void
1164lagg_stop(struct lagg_softc *sc)
1165{
1166	struct ifnet *ifp = sc->sc_ifp;
1167
1168	LAGG_XLOCK_ASSERT(sc);
1169
1170	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1171		return;
1172
1173	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1174
1175	lagg_proto_stop(sc);
1176}
1177
1178static int
1179lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1180{
1181	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
1182	struct lagg_reqall *ra = (struct lagg_reqall *)data;
1183	struct lagg_reqopts *ro = (struct lagg_reqopts *)data;
1184	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
1185	struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
1186	struct ifreq *ifr = (struct ifreq *)data;
1187	struct lagg_port *lp;
1188	struct ifnet *tpif;
1189	struct thread *td = curthread;
1190	char *buf, *outbuf;
1191	int count, buflen, len, error = 0, oldmtu;
1192
1193	bzero(&rpbuf, sizeof(rpbuf));
1194
1195	/* XXX: This can race with lagg_clone_destroy. */
1196
1197	switch (cmd) {
1198	case SIOCGLAGG:
1199		LAGG_XLOCK(sc);
1200		buflen = sc->sc_count * sizeof(struct lagg_reqport);
1201		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
1202		ra->ra_proto = sc->sc_proto;
1203		lagg_proto_request(sc, &ra->ra_psc);
1204		count = 0;
1205		buf = outbuf;
1206		len = min(ra->ra_size, buflen);
1207		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1208			if (len < sizeof(rpbuf))
1209				break;
1210
1211			lagg_port2req(lp, &rpbuf);
1212			memcpy(buf, &rpbuf, sizeof(rpbuf));
1213			count++;
1214			buf += sizeof(rpbuf);
1215			len -= sizeof(rpbuf);
1216		}
1217		LAGG_XUNLOCK(sc);
1218		ra->ra_ports = count;
1219		ra->ra_size = count * sizeof(rpbuf);
1220		error = copyout(outbuf, ra->ra_port, ra->ra_size);
1221		free(outbuf, M_TEMP);
1222		break;
1223	case SIOCSLAGG:
1224		error = priv_check(td, PRIV_NET_LAGG);
1225		if (error)
1226			break;
1227		if (ra->ra_proto >= LAGG_PROTO_MAX) {
1228			error = EPROTONOSUPPORT;
1229			break;
1230		}
1231
1232		LAGG_XLOCK(sc);
1233		lagg_proto_detach(sc);
1234		LAGG_UNLOCK_ASSERT();
1235		lagg_proto_attach(sc, ra->ra_proto);
1236		LAGG_XUNLOCK(sc);
1237		break;
1238	case SIOCGLAGGOPTS:
1239		LAGG_XLOCK(sc);
1240		ro->ro_opts = sc->sc_opts;
1241		if (sc->sc_proto == LAGG_PROTO_LACP) {
1242			struct lacp_softc *lsc;
1243
1244			lsc = (struct lacp_softc *)sc->sc_psc;
1245			if (lsc->lsc_debug.lsc_tx_test != 0)
1246				ro->ro_opts |= LAGG_OPT_LACP_TXTEST;
1247			if (lsc->lsc_debug.lsc_rx_test != 0)
1248				ro->ro_opts |= LAGG_OPT_LACP_RXTEST;
1249			if (lsc->lsc_strict_mode != 0)
1250				ro->ro_opts |= LAGG_OPT_LACP_STRICT;
1251			if (lsc->lsc_fast_timeout != 0)
1252				ro->ro_opts |= LAGG_OPT_LACP_FAST_TIMO;
1253
1254			ro->ro_active = sc->sc_active;
1255		} else {
1256			ro->ro_active = 0;
1257			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
1258				ro->ro_active += LAGG_PORTACTIVE(lp);
1259		}
1260		ro->ro_bkt = sc->sc_stride;
1261		ro->ro_flapping = sc->sc_flapping;
1262		ro->ro_flowid_shift = sc->flowid_shift;
1263		LAGG_XUNLOCK(sc);
1264		break;
1265	case SIOCSLAGGOPTS:
1266		error = priv_check(td, PRIV_NET_LAGG);
1267		if (error)
1268			break;
1269
1270		/*
1271		 * The stride option was added without defining a corresponding
1272		 * LAGG_OPT flag, so handle a non-zero value before checking
1273		 * anything else to preserve compatibility.
1274		 */
1275		LAGG_XLOCK(sc);
1276		if (ro->ro_opts == 0 && ro->ro_bkt != 0) {
1277			if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN) {
1278				LAGG_XUNLOCK(sc);
1279				error = EINVAL;
1280				break;
1281			}
1282			sc->sc_stride = ro->ro_bkt;
1283		}
1284		if (ro->ro_opts == 0) {
1285			LAGG_XUNLOCK(sc);
1286			break;
1287		}
1288
1289		/*
1290		 * Set options.  LACP options are stored in sc->sc_psc,
1291		 * not in sc_opts.
1292		 */
1293		int valid, lacp;
1294
1295		switch (ro->ro_opts) {
1296		case LAGG_OPT_USE_FLOWID:
1297		case -LAGG_OPT_USE_FLOWID:
1298		case LAGG_OPT_USE_NUMA:
1299		case -LAGG_OPT_USE_NUMA:
1300		case LAGG_OPT_FLOWIDSHIFT:
1301		case LAGG_OPT_RR_LIMIT:
1302			valid = 1;
1303			lacp = 0;
1304			break;
1305		case LAGG_OPT_LACP_TXTEST:
1306		case -LAGG_OPT_LACP_TXTEST:
1307		case LAGG_OPT_LACP_RXTEST:
1308		case -LAGG_OPT_LACP_RXTEST:
1309		case LAGG_OPT_LACP_STRICT:
1310		case -LAGG_OPT_LACP_STRICT:
1311		case LAGG_OPT_LACP_FAST_TIMO:
1312		case -LAGG_OPT_LACP_FAST_TIMO:
1313			valid = lacp = 1;
1314			break;
1315		default:
1316			valid = lacp = 0;
1317			break;
1318		}
1319
1320		if (valid == 0 ||
1321		    (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) {
1322			/* Invalid combination of options specified. */
1323			error = EINVAL;
1324			LAGG_XUNLOCK(sc);
1325			break;	/* Return from SIOCSLAGGOPTS. */
1326		}
1327
1328		/*
1329		 * Store new options into sc->sc_opts except for
1330		 * FLOWIDSHIFT, RR and LACP options.
1331		 */
1332		if (lacp == 0) {
1333			if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT)
1334				sc->flowid_shift = ro->ro_flowid_shift;
1335			else if (ro->ro_opts == LAGG_OPT_RR_LIMIT) {
1336				if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN ||
1337				    ro->ro_bkt == 0) {
1338					error = EINVAL;
1339					LAGG_XUNLOCK(sc);
1340					break;
1341				}
1342				sc->sc_stride = ro->ro_bkt;
1343			} else if (ro->ro_opts > 0)
1344				sc->sc_opts |= ro->ro_opts;
1345			else
1346				sc->sc_opts &= ~ro->ro_opts;
1347		} else {
1348			struct lacp_softc *lsc;
1349			struct lacp_port *lp;
1350
1351			lsc = (struct lacp_softc *)sc->sc_psc;
1352
1353			switch (ro->ro_opts) {
1354			case LAGG_OPT_LACP_TXTEST:
1355				lsc->lsc_debug.lsc_tx_test = 1;
1356				break;
1357			case -LAGG_OPT_LACP_TXTEST:
1358				lsc->lsc_debug.lsc_tx_test = 0;
1359				break;
1360			case LAGG_OPT_LACP_RXTEST:
1361				lsc->lsc_debug.lsc_rx_test = 1;
1362				break;
1363			case -LAGG_OPT_LACP_RXTEST:
1364				lsc->lsc_debug.lsc_rx_test = 0;
1365				break;
1366			case LAGG_OPT_LACP_STRICT:
1367				lsc->lsc_strict_mode = 1;
1368				break;
1369			case -LAGG_OPT_LACP_STRICT:
1370				lsc->lsc_strict_mode = 0;
1371				break;
1372			case LAGG_OPT_LACP_FAST_TIMO:
				LACP_LOCK(lsc);
				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
					lp->lp_state |= LACP_STATE_TIMEOUT;
				LACP_UNLOCK(lsc);
1377				lsc->lsc_fast_timeout = 1;
1378				break;
1379			case -LAGG_OPT_LACP_FAST_TIMO:
				LACP_LOCK(lsc);
				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
					lp->lp_state &= ~LACP_STATE_TIMEOUT;
				LACP_UNLOCK(lsc);
1384				lsc->lsc_fast_timeout = 0;
1385				break;
1386			}
1387		}
1388		LAGG_XUNLOCK(sc);
1389		break;
1390	case SIOCGLAGGFLAGS:
1391		rf->rf_flags = 0;
1392		LAGG_XLOCK(sc);
1393		if (sc->sc_flags & MBUF_HASHFLAG_L2)
1394			rf->rf_flags |= LAGG_F_HASHL2;
1395		if (sc->sc_flags & MBUF_HASHFLAG_L3)
1396			rf->rf_flags |= LAGG_F_HASHL3;
1397		if (sc->sc_flags & MBUF_HASHFLAG_L4)
1398			rf->rf_flags |= LAGG_F_HASHL4;
1399		LAGG_XUNLOCK(sc);
1400		break;
1401	case SIOCSLAGGHASH:
1402		error = priv_check(td, PRIV_NET_LAGG);
1403		if (error)
1404			break;
1405		if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) {
1406			error = EINVAL;
1407			break;
1408		}
1409		LAGG_XLOCK(sc);
1410		sc->sc_flags = 0;
1411		if (rf->rf_flags & LAGG_F_HASHL2)
1412			sc->sc_flags |= MBUF_HASHFLAG_L2;
1413		if (rf->rf_flags & LAGG_F_HASHL3)
1414			sc->sc_flags |= MBUF_HASHFLAG_L3;
1415		if (rf->rf_flags & LAGG_F_HASHL4)
1416			sc->sc_flags |= MBUF_HASHFLAG_L4;
1417		LAGG_XUNLOCK(sc);
1418		break;
1419	case SIOCGLAGGPORT:
1420		if (rp->rp_portname[0] == '\0' ||
1421		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
1422			error = EINVAL;
1423			break;
1424		}
1425
1426		LAGG_RLOCK();
1427		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
1428		    lp->lp_softc != sc) {
1429			error = ENOENT;
1430			LAGG_RUNLOCK();
1431			if_rele(tpif);
1432			break;
1433		}
1434
1435		lagg_port2req(lp, rp);
1436		LAGG_RUNLOCK();
1437		if_rele(tpif);
1438		break;
1439	case SIOCSLAGGPORT:
1440		error = priv_check(td, PRIV_NET_LAGG);
1441		if (error)
1442			break;
1443		if (rp->rp_portname[0] == '\0' ||
1444		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
1445			error = EINVAL;
1446			break;
1447		}
1448#ifdef INET6
		/*
		 * A laggport interface should not have inet6 addresses
		 * because two interfaces with a valid link-local
		 * scope zone must not be merged in any form.  This
		 * restriction is needed to prevent a violation of the
		 * link-local scope zone.  An attempt to add a laggport
		 * interface which has inet6 addresses triggers the
		 * removal of all inet6 addresses on the member
		 * interface.
		 */
		if (in6ifa_llaonifp(tpif)) {
			in6_ifdetach(tpif);
			if_printf(sc->sc_ifp,
			    "IPv6 addresses on %s have been removed "
			    "before adding it as a member to prevent "
			    "IPv6 address scope violation.\n",
			    tpif->if_xname);
		}
1467#endif
1468		oldmtu = ifp->if_mtu;
1469		LAGG_XLOCK(sc);
1470		error = lagg_port_create(sc, tpif);
1471		LAGG_XUNLOCK(sc);
1472		if_rele(tpif);
1473
		/*
		 * The lagg MTU may change during the addition of the first
		 * port.  If it did, run the network layer specific procedures.
		 */
1478		if (ifp->if_mtu != oldmtu) {
1479#ifdef INET6
1480			nd6_setmtu(ifp);
1481#endif
1482			rt_updatemtu(ifp);
1483		}
1484
1485		VLAN_CAPABILITIES(ifp);
1486		break;
1487	case SIOCSLAGGDELPORT:
1488		error = priv_check(td, PRIV_NET_LAGG);
1489		if (error)
1490			break;
1491		if (rp->rp_portname[0] == '\0' ||
1492		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
1493			error = EINVAL;
1494			break;
1495		}
1496
1497		LAGG_XLOCK(sc);
1498		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
1499		    lp->lp_softc != sc) {
1500			error = ENOENT;
1501			LAGG_XUNLOCK(sc);
1502			if_rele(tpif);
1503			break;
1504		}
1505
1506		error = lagg_port_destroy(lp, 1);
1507		LAGG_XUNLOCK(sc);
1508		if_rele(tpif);
1509		VLAN_CAPABILITIES(ifp);
1510		break;
1511	case SIOCSIFFLAGS:
1512		/* Set flags on ports too */
1513		LAGG_XLOCK(sc);
1514		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1515			lagg_setflags(lp, 1);
1516		}
1517
1518		if (!(ifp->if_flags & IFF_UP) &&
1519		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
1520			/*
1521			 * If interface is marked down and it is running,
1522			 * then stop and disable it.
1523			 */
1524			lagg_stop(sc);
1525			LAGG_XUNLOCK(sc);
1526		} else if ((ifp->if_flags & IFF_UP) &&
1527		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
1528			/*
1529			 * If interface is marked up and it is stopped, then
1530			 * start it.
1531			 */
1532			LAGG_XUNLOCK(sc);
1533			(*ifp->if_init)(sc);
1534		} else
1535			LAGG_XUNLOCK(sc);
1536		break;
1537	case SIOCADDMULTI:
1538	case SIOCDELMULTI:
1539		LAGG_XLOCK(sc);
1540		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1541			lagg_clrmulti(lp);
1542			lagg_setmulti(lp);
1543		}
1544		LAGG_XUNLOCK(sc);
1545		error = 0;
1546		break;
1547	case SIOCSIFMEDIA:
1548	case SIOCGIFMEDIA:
1549		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
1550		break;
1551
1552	case SIOCSIFCAP:
1553		LAGG_XLOCK(sc);
1554		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1555			if (lp->lp_ioctl != NULL)
1556				(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
1557		}
1558		lagg_capabilities(sc);
1559		LAGG_XUNLOCK(sc);
1560		VLAN_CAPABILITIES(ifp);
1561		error = 0;
1562		break;
1563
1564	case SIOCSIFMTU:
1565		LAGG_XLOCK(sc);
1566		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1567			if (lp->lp_ioctl != NULL)
1568				error = (*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
1569			else
1570				error = EINVAL;
1571			if (error != 0) {
1572				if_printf(ifp,
1573				    "failed to change MTU to %d on port %s, "
1574				    "reverting all ports to original MTU (%d)\n",
1575				    ifr->ifr_mtu, lp->lp_ifp->if_xname, ifp->if_mtu);
1576				break;
1577			}
1578		}
1579		if (error == 0) {
1580			ifp->if_mtu = ifr->ifr_mtu;
1581		} else {
1582			/* set every port back to the original MTU */
1583			ifr->ifr_mtu = ifp->if_mtu;
1584			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1585				if (lp->lp_ioctl != NULL)
1586					(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
1587			}
1588		}
1589		LAGG_XUNLOCK(sc);
1590		break;
1591
1592	default:
1593		error = ether_ioctl(ifp, cmd, data);
1594		break;
1595	}
1596	return (error);
1597}
1598
1599#if defined(KERN_TLS) || defined(RATELIMIT)
1600static inline struct lagg_snd_tag *
1601mst_to_lst(struct m_snd_tag *mst)
1602{
1603
1604	return (__containerof(mst, struct lagg_snd_tag, com));
1605}
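/*
 * mst_to_lst() relies on the generic container-of idiom: given a pointer
 * to the embedded "com" member, it recovers the enclosing lagg_snd_tag.
 * A minimal sketch of the same idiom, with hypothetical types shown only
 * to illustrate the pointer arithmetic (offsetof() is from <stddef.h>):
 *
 *	struct outer { int x; struct inner in; };
 *	struct inner *ip = &some_outer.in;
 *	struct outer *op = (struct outer *)
 *	    ((char *)ip - offsetof(struct outer, in));
 *
 * __containerof() packages essentially this computation with additional
 * type checking.
 */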
1606
/*
 * Look up the port used by a specific flow.  This only works for lagg
 * protocols with deterministic port mappings (e.g. not roundrobin).
 * In addition, protocols which use a hash to map flows to ports must
 * be configured to use the mbuf flowid rather than hashing packet
 * contents.
 */
1614static struct lagg_port *
1615lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype,
1616    uint8_t numa_domain)
1617{
1618	struct lagg_softc *sc;
1619	struct lagg_port *lp;
1620	struct lagg_lb *lb;
1621	uint32_t hash, p;
1622
1623	sc = ifp->if_softc;
1624
1625	switch (sc->sc_proto) {
1626	case LAGG_PROTO_FAILOVER:
1627		return (lagg_link_active(sc, sc->sc_primary));
1628	case LAGG_PROTO_LOADBALANCE:
1629		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
1630		    flowtype == M_HASHTYPE_NONE)
1631			return (NULL);
1632		p = flowid >> sc->flowid_shift;
1633		p %= sc->sc_count;
1634		lb = (struct lagg_lb *)sc->sc_psc;
1635		lp = lb->lb_ports[p];
1636		return (lagg_link_active(sc, lp));
1637	case LAGG_PROTO_LACP:
1638		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
1639		    flowtype == M_HASHTYPE_NONE)
1640			return (NULL);
1641		hash = flowid >> sc->flowid_shift;
1642		return (lacp_select_tx_port_by_hash(sc, hash, numa_domain));
1643	default:
1644		return (NULL);
1645	}
1646}
1647
1648static int
1649lagg_snd_tag_alloc(struct ifnet *ifp,
1650    union if_snd_tag_alloc_params *params,
1651    struct m_snd_tag **ppmt)
1652{
1653	struct lagg_snd_tag *lst;
1654	struct lagg_softc *sc;
1655	struct lagg_port *lp;
1656	struct ifnet *lp_ifp;
1657	int error;
1658
1659	sc = ifp->if_softc;
1660
1661	LAGG_RLOCK();
1662	lp = lookup_snd_tag_port(ifp, params->hdr.flowid,
1663	    params->hdr.flowtype, params->hdr.numa_domain);
1664	if (lp == NULL) {
1665		LAGG_RUNLOCK();
1666		return (EOPNOTSUPP);
1667	}
1668	if (lp->lp_ifp == NULL || lp->lp_ifp->if_snd_tag_alloc == NULL) {
1669		LAGG_RUNLOCK();
1670		return (EOPNOTSUPP);
1671	}
1672	lp_ifp = lp->lp_ifp;
1673	if_ref(lp_ifp);
1674	LAGG_RUNLOCK();
1675
1676	lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT);
1677	if (lst == NULL) {
1678		if_rele(lp_ifp);
1679		return (ENOMEM);
1680	}
1681
1682	error = lp_ifp->if_snd_tag_alloc(lp_ifp, params, &lst->tag);
1683	if_rele(lp_ifp);
1684	if (error) {
1685		free(lst, M_LAGG);
1686		return (error);
1687	}
1688
1689	m_snd_tag_init(&lst->com, ifp);
1690
1691	*ppmt = &lst->com;
1692	return (0);
1693}
1694
1695static int
1696lagg_snd_tag_modify(struct m_snd_tag *mst,
1697    union if_snd_tag_modify_params *params)
1698{
1699	struct lagg_snd_tag *lst;
1700
1701	lst = mst_to_lst(mst);
1702	return (lst->tag->ifp->if_snd_tag_modify(lst->tag, params));
1703}
1704
1705static int
1706lagg_snd_tag_query(struct m_snd_tag *mst,
1707    union if_snd_tag_query_params *params)
1708{
1709	struct lagg_snd_tag *lst;
1710
1711	lst = mst_to_lst(mst);
1712	return (lst->tag->ifp->if_snd_tag_query(lst->tag, params));
1713}
1714
1715static void
1716lagg_snd_tag_free(struct m_snd_tag *mst)
1717{
1718	struct lagg_snd_tag *lst;
1719
1720	lst = mst_to_lst(mst);
1721	m_snd_tag_rele(lst->tag);
1722	free(lst, M_LAGG);
1723}
1724
1725static void
1726lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
1727{
	/*
	 * For lagg, we have an indirect interface.  The caller needs to
	 * get a ratelimit tag on the actual interface the flow will go on.
	 */
1734	q->rate_table = NULL;
1735	q->flags = RT_IS_INDIRECT;
1736	q->max_flows = 0;
1737	q->number_of_rates = 0;
1738}
1739#endif
1740
1741static int
1742lagg_setmulti(struct lagg_port *lp)
1743{
1744	struct lagg_softc *sc = lp->lp_softc;
1745	struct ifnet *ifp = lp->lp_ifp;
1746	struct ifnet *scifp = sc->sc_ifp;
1747	struct lagg_mc *mc;
1748	struct ifmultiaddr *ifma;
1749	int error;
1750
1751	IF_ADDR_WLOCK(scifp);
1752	CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
1753		if (ifma->ifma_addr->sa_family != AF_LINK)
1754			continue;
1755		mc = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT);
1756		if (mc == NULL) {
1757			IF_ADDR_WUNLOCK(scifp);
1758			return (ENOMEM);
1759		}
1760		bcopy(ifma->ifma_addr, &mc->mc_addr,
1761		    ifma->ifma_addr->sa_len);
1762		mc->mc_addr.sdl_index = ifp->if_index;
1763		mc->mc_ifma = NULL;
1764		SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
1765	}
1766	IF_ADDR_WUNLOCK(scifp);
1767	SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) {
1768		error = if_addmulti(ifp,
1769		    (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma);
1770		if (error)
1771			return (error);
1772	}
1773	return (0);
1774}
1775
1776static int
1777lagg_clrmulti(struct lagg_port *lp)
1778{
1779	struct lagg_mc *mc;
1780
1781	LAGG_XLOCK_ASSERT(lp->lp_softc);
1782	while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
1783		SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
1784		if (mc->mc_ifma && lp->lp_detaching == 0)
1785			if_delmulti_ifma(mc->mc_ifma);
1786		free(mc, M_LAGG);
1787	}
1788	return (0);
1789}
1790
1791static int
1792lagg_setcaps(struct lagg_port *lp, int cap)
1793{
1794	struct ifreq ifr;
1795
1796	if (lp->lp_ifp->if_capenable == cap)
1797		return (0);
1798	if (lp->lp_ioctl == NULL)
1799		return (ENXIO);
1800	ifr.ifr_reqcap = cap;
1801	return ((*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr));
1802}
1803
1804/* Handle a ref counted flag that should be set on the lagg port as well */
1805static int
1806lagg_setflag(struct lagg_port *lp, int flag, int status,
1807    int (*func)(struct ifnet *, int))
1808{
1809	struct lagg_softc *sc = lp->lp_softc;
1810	struct ifnet *scifp = sc->sc_ifp;
1811	struct ifnet *ifp = lp->lp_ifp;
1812	int error;
1813
1814	LAGG_XLOCK_ASSERT(sc);
1815
1816	status = status ? (scifp->if_flags & flag) : 0;
1817	/* Now "status" contains the flag value or 0 */
1818
	/*
	 * See if the port's recorded status differs from what we want
	 * it to be.  If it does, flip it.  We record the port's status
	 * in lp_ifflags so that we never clear a port flag we did not
	 * set.  In fact, we do not clear or set port flags directly,
	 * but acquire or release references to them; that is why we can
	 * be sure the recorded flags stay in accord with the actual
	 * port flags.
	 */
1828	if (status != (lp->lp_ifflags & flag)) {
1829		error = (*func)(ifp, status);
1830		if (error)
1831			return (error);
1832		lp->lp_ifflags &= ~flag;
1833		lp->lp_ifflags |= status;
1834	}
1835	return (0);
1836}
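/*
 * Illustrative trace of lagg_setflag() specialized by hand for
 * IFF_PROMISC (the generic version above works for any entry in
 * lagg_pflags[]):
 *
 *	status = scifp->if_flags & IFF_PROMISC;		(desired state)
 *	if (status != (lp->lp_ifflags & IFF_PROMISC))
 *		(void)ifpromisc(ifp, status);		(take/drop a reference)
 *
 * The recorded copy in lp_ifflags is what prevents dropping a promiscuous
 * reference the lagg never took.
 */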
1837
/*
 * Handle IFF_* flags that require certain changes on the lagg port.
 * If "status" is true, update the port's flags to match the lagg;
 * if "status" is false, forcibly clear the flags we set on the port.
 */
1843static int
1844lagg_setflags(struct lagg_port *lp, int status)
1845{
1846	int error, i;
1847
1848	for (i = 0; lagg_pflags[i].flag; i++) {
1849		error = lagg_setflag(lp, lagg_pflags[i].flag,
1850		    status, lagg_pflags[i].func);
1851		if (error)
1852			return (error);
1853	}
1854	return (0);
1855}
1856
1857static int
1858lagg_transmit(struct ifnet *ifp, struct mbuf *m)
1859{
1860	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
1861	int error;
1862
1863#if defined(KERN_TLS) || defined(RATELIMIT)
1864	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
1865		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
1866#endif
1867	LAGG_RLOCK();
1868	/* We need a Tx algorithm and at least one port */
1869	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
1870		LAGG_RUNLOCK();
1871		m_freem(m);
1872		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
1873		return (ENXIO);
1874	}
1875
1876	ETHER_BPF_MTAP(ifp, m);
1877
1878	error = lagg_proto_start(sc, m);
1879	LAGG_RUNLOCK();
1880	return (error);
1881}
1882
/*
 * The ifp->if_qflush entry point for lagg(4) is a no-op.
 */
1886static void
1887lagg_qflush(struct ifnet *ifp __unused)
1888{
1889}
1890
1891static struct mbuf *
1892lagg_input(struct ifnet *ifp, struct mbuf *m)
1893{
1894	struct lagg_port *lp = ifp->if_lagg;
1895	struct lagg_softc *sc = lp->lp_softc;
1896	struct ifnet *scifp = sc->sc_ifp;
1897
1898	LAGG_RLOCK();
1899	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
1900	    lp->lp_detaching != 0 ||
1901	    sc->sc_proto == LAGG_PROTO_NONE) {
1902		LAGG_RUNLOCK();
1903		m_freem(m);
1904		return (NULL);
1905	}
1906
1907	ETHER_BPF_MTAP(scifp, m);
1908
1909	m = lagg_proto_input(sc, lp, m);
1910	if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) {
1911		m_freem(m);
1912		m = NULL;
1913	}
1914
1915	LAGG_RUNLOCK();
1916	return (m);
1917}
1918
1919static int
1920lagg_media_change(struct ifnet *ifp)
1921{
1922	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
1923
1924	if (sc->sc_ifflags & IFF_DEBUG)
1925		printf("%s\n", __func__);
1926
1927	/* Ignore */
1928	return (0);
1929}
1930
1931static void
1932lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
1933{
1934	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
1935	struct lagg_port *lp;
1936
1937	imr->ifm_status = IFM_AVALID;
1938	imr->ifm_active = IFM_ETHER | IFM_AUTO;
1939
1940	LAGG_RLOCK();
1941	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1942		if (LAGG_PORTACTIVE(lp))
1943			imr->ifm_status |= IFM_ACTIVE;
1944	}
1945	LAGG_RUNLOCK();
1946}
1947
1948static void
1949lagg_linkstate(struct lagg_softc *sc)
1950{
1951	struct lagg_port *lp;
1952	int new_link = LINK_STATE_DOWN;
1953	uint64_t speed;
1954
1955	LAGG_XLOCK_ASSERT(sc);
1956
1957	/* LACP handles link state itself */
1958	if (sc->sc_proto == LAGG_PROTO_LACP)
1959		return;
1960
1961	/* Our link is considered up if at least one of our ports is active */
1962	LAGG_RLOCK();
1963	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1964		if (lp->lp_ifp->if_link_state == LINK_STATE_UP) {
1965			new_link = LINK_STATE_UP;
1966			break;
1967		}
1968	}
1969	LAGG_RUNLOCK();
1970	if_link_state_change(sc->sc_ifp, new_link);
1971
1972	/* Update if_baudrate to reflect the max possible speed */
1973	switch (sc->sc_proto) {
1974		case LAGG_PROTO_FAILOVER:
1975			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
1976			    sc->sc_primary->lp_ifp->if_baudrate : 0;
1977			break;
1978		case LAGG_PROTO_ROUNDROBIN:
1979		case LAGG_PROTO_LOADBALANCE:
1980		case LAGG_PROTO_BROADCAST:
1981			speed = 0;
1982			LAGG_RLOCK();
1983			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
1984				speed += lp->lp_ifp->if_baudrate;
1985			LAGG_RUNLOCK();
1986			sc->sc_ifp->if_baudrate = speed;
1987			break;
1988		case LAGG_PROTO_LACP:
1989			/* LACP updates if_baudrate itself */
1990			break;
1991	}
1992}
1993
1994static void
1995lagg_port_state(struct ifnet *ifp, int state)
1996{
1997	struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg;
1998	struct lagg_softc *sc = NULL;
1999
2000	if (lp != NULL)
2001		sc = lp->lp_softc;
2002	if (sc == NULL)
2003		return;
2004
2005	LAGG_XLOCK(sc);
2006	lagg_linkstate(sc);
2007	lagg_proto_linkstate(sc, lp);
2008	LAGG_XUNLOCK(sc);
2009}
2010
2011struct lagg_port *
2012lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
2013{
2014	struct lagg_port *lp_next, *rval = NULL;
2015
2016	/*
2017	 * Search for a port that reports an active link state.
2018	 */
2019
2020#ifdef INVARIANTS
2021	/*
2022	 * This is called with either LAGG_RLOCK() held or
2023	 * LAGG_XLOCK(sc) held.
2024	 */
2025	if (!in_epoch(net_epoch_preempt))
2026		LAGG_XLOCK_ASSERT(sc);
2027#endif
2028
2029	if (lp == NULL)
2030		goto search;
2031	if (LAGG_PORTACTIVE(lp)) {
2032		rval = lp;
2033		goto found;
2034	}
2035	if ((lp_next = CK_SLIST_NEXT(lp, lp_entries)) != NULL &&
2036	    LAGG_PORTACTIVE(lp_next)) {
2037		rval = lp_next;
2038		goto found;
2039	}
2040
2041search:
2042	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
2043		if (LAGG_PORTACTIVE(lp_next)) {
2044			return (lp_next);
2045		}
2046	}
2047found:
2048	return (rval);
2049}
2050
2051int
2052lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
2053{
2054
2055#if defined(KERN_TLS) || defined(RATELIMIT)
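	/*
	 * A tagged mbuf carries a lagg-level send tag wrapping a tag on
	 * one of the physical ports.  Hand the port its own tag: take a
	 * reference on the wrapped tag and release the lagg-level one.
	 * If the wrapped tag does not belong to the selected port (e.g.
	 * the active port changed after the tag was allocated), drop the
	 * packet and return EAGAIN rather than sending it out the wrong
	 * port.
	 */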
2056	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
2057		struct lagg_snd_tag *lst;
2058		struct m_snd_tag *mst;
2059
2060		mst = m->m_pkthdr.snd_tag;
2061		lst = mst_to_lst(mst);
2062		if (lst->tag->ifp != ifp) {
2063			m_freem(m);
2064			return (EAGAIN);
2065		}
2066		m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag);
2067		m_snd_tag_rele(mst);
2068	}
2069#endif
2070	return (ifp->if_transmit)(ifp, m);
2071}
2072
2073/*
2074 * Simple round robin aggregation
2075 */
2076static void
2077lagg_rr_attach(struct lagg_softc *sc)
2078{
2079	sc->sc_seq = 0;
2080	sc->sc_stride = 1;
2081}
2082
2083static int
2084lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
2085{
2086	struct lagg_port *lp;
2087	uint32_t p;
2088
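	/*
	 * Distribute packets over the ports round-robin.  sc_seq is a
	 * free-running transmit counter; dividing by sc_stride keeps the
	 * same port selected for sc_stride consecutive packets.  For
	 * example, with sc_stride = 2 and three ports the chosen indices
	 * are 0, 0, 1, 1, 2, 2, 0, ...
	 */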
2089	p = atomic_fetchadd_32(&sc->sc_seq, 1);
2090	p /= sc->sc_stride;
2091	p %= sc->sc_count;
2092	lp = CK_SLIST_FIRST(&sc->sc_ports);
2093
2094	while (p--)
2095		lp = CK_SLIST_NEXT(lp, lp_entries);
2096
2097	/*
2098	 * Check the port's link state; lagg_link_active() returns the next
2099	 * active port if this port's link is down or lp is NULL.
2100	 */
2101	if ((lp = lagg_link_active(sc, lp)) == NULL) {
2102		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2103		m_freem(m);
2104		return (ENETDOWN);
2105	}
2106
2107	/* Send mbuf */
2108	return (lagg_enqueue(lp->lp_ifp, m));
2109}
2110
2111static struct mbuf *
2112lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2113{
2114	struct ifnet *ifp = sc->sc_ifp;
2115
2116	/* Just pass in the packet to our lagg device */
2117	m->m_pkthdr.rcvif = ifp;
2118
2119	return (m);
2120}
2121
2122/*
2123 * Broadcast mode
2124 */
2125static int
2126lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m)
2127{
2128	int active_ports = 0;
2129	int errors = 0;
2130	int ret;
2131	struct lagg_port *lp, *last = NULL;
2132	struct mbuf *m0;
2133
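	/*
	 * Send the packet out every active port.  Each active port except
	 * the last gets an m_copym() duplicate; the final active port is
	 * handed the original mbuf so no extra copy is made for it.
	 */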
2134	LAGG_RLOCK_ASSERT();
2135	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
2136		if (!LAGG_PORTACTIVE(lp))
2137			continue;
2138
2139		active_ports++;
2140
2141		if (last != NULL) {
2142			m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT);
2143			if (m0 == NULL) {
2144				ret = ENOBUFS;
2145				errors++;
2146				break;
2147			}
2148			lagg_enqueue(last->lp_ifp, m0);
2149		}
2150		last = lp;
2151	}
2152
2153	if (last == NULL) {
2154		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2155		m_freem(m);
2156		return (ENOENT);
2157	}
2158	if ((last = lagg_link_active(sc, last)) == NULL) {
2159		errors++;
2160		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
2161		m_freem(m);
2162		return (ENETDOWN);
2163	}
2164
2165	ret = lagg_enqueue(last->lp_ifp, m);
2166	if (errors != 0)
2167		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
2168
2169	return (ret);
2170}
2171
2172static struct mbuf *
2173lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2174{
2175	struct ifnet *ifp = sc->sc_ifp;
2176
2177	/* Just pass in the packet to our lagg device */
2178	m->m_pkthdr.rcvif = ifp;
2179	return (m);
2180}
2181
2182/*
2183 * Active failover
2184 */
2185static int
2186lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
2187{
2188	struct lagg_port *lp;
2189
2190	/* Use the master port if active or the next available port */
2191	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
2192		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2193		m_freem(m);
2194		return (ENETDOWN);
2195	}
2196
2197	/* Send mbuf */
2198	return (lagg_enqueue(lp->lp_ifp, m));
2199}
2200
2201static struct mbuf *
2202lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2203{
2204	struct ifnet *ifp = sc->sc_ifp;
2205	struct lagg_port *tmp_tp;
2206
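	/*
	 * In failover mode only traffic received on the primary port (or,
	 * while the primary link is down, on the active backup port) is
	 * accepted; frames arriving on other ports are dropped.  The
	 * net.link.lagg.failover_rx_all sysctl (V_lagg_failover_rx_all)
	 * accepts input on any port instead.
	 */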
2207	if (lp == sc->sc_primary || V_lagg_failover_rx_all) {
2208		m->m_pkthdr.rcvif = ifp;
2209		return (m);
2210	}
2211
2212	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
2213		tmp_tp = lagg_link_active(sc, sc->sc_primary);
2214		/*
2215		 * If tmp_tp is null, we've received a packet when all
2216		 * our links are down. Weird, but process it anyway.
2217		 */
2218		if (tmp_tp == NULL || tmp_tp == lp) {
2219			m->m_pkthdr.rcvif = ifp;
2220			return (m);
2221		}
2222	}
2223
2224	m_freem(m);
2225	return (NULL);
2226}
2227
2228/*
2229 * Loadbalancing
2230 */
2231static void
2232lagg_lb_attach(struct lagg_softc *sc)
2233{
2234	struct lagg_port *lp;
2235	struct lagg_lb *lb;
2236
2237	LAGG_XLOCK_ASSERT(sc);
2238	lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO);
2239	lb->lb_key = m_ether_tcpip_hash_init();
2240	sc->sc_psc = lb;
2241
2242	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2243		lagg_lb_port_create(lp);
2244}
2245
2246static void
2247lagg_lb_detach(struct lagg_softc *sc)
2248{
2249	struct lagg_lb *lb;
2250
2251	lb = (struct lagg_lb *)sc->sc_psc;
2252	if (lb != NULL)
2253		free(lb, M_LAGG);
2254}
2255
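/*
 * Rebuild the load-balance port table from the current port list.  The
 * port "lp" is skipped so that a port being removed can be excluded
 * from the table; pass lp == NULL to include every port.
 */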
2256static int
2257lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
2258{
2259	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
2260	struct lagg_port *lp_next;
2261	int i = 0, rv;
2262
2263	rv = 0;
2264	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
2265	LAGG_XLOCK_ASSERT(sc);
2266	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
2267		if (lp_next == lp)
2268			continue;
2269		if (i >= LAGG_MAX_PORTS) {
2270			rv = EINVAL;
2271			break;
2272		}
2273		if (sc->sc_ifflags & IFF_DEBUG)
2274			printf("%s: port %s at index %d\n",
2275			    sc->sc_ifname, lp_next->lp_ifp->if_xname, i);
2276		lb->lb_ports[i++] = lp_next;
2277	}
2278
2279	return (rv);
2280}
2281
2282static int
2283lagg_lb_port_create(struct lagg_port *lp)
2284{
2285	struct lagg_softc *sc = lp->lp_softc;
2286	return (lagg_lb_porttable(sc, NULL));
2287}
2288
2289static void
2290lagg_lb_port_destroy(struct lagg_port *lp)
2291{
2292	struct lagg_softc *sc = lp->lp_softc;
2293	lagg_lb_porttable(sc, lp);
2294}
2295
2296static int
2297lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
2298{
2299	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
2300	struct lagg_port *lp = NULL;
2301	uint32_t p = 0;
2302
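	/*
	 * Map the packet to a port index.  If the NIC supplied an RSS
	 * flowid (and LAGG_OPT_USE_FLOWID is enabled), use it directly
	 * after shifting by flowid_shift; otherwise compute a software
	 * hash over the layer 2/3/4 headers selected by sc_flags.
	 */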
2303	if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
2304	    M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
2305		p = m->m_pkthdr.flowid >> sc->flowid_shift;
2306	else
2307		p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key);
2308	p %= sc->sc_count;
2309	lp = lb->lb_ports[p];
2310
2311	/*
2312	 * Check the port's link state; lagg_link_active() returns the next
2313	 * active port if this port's link is down or lp is NULL.
2314	 */
2315	if ((lp = lagg_link_active(sc, lp)) == NULL) {
2316		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2317		m_freem(m);
2318		return (ENETDOWN);
2319	}
2320
2321	/* Send mbuf */
2322	return (lagg_enqueue(lp->lp_ifp, m));
2323}
2324
2325static struct mbuf *
2326lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2327{
2328	struct ifnet *ifp = sc->sc_ifp;
2329
2330	/* Just pass in the packet to our lagg device */
2331	m->m_pkthdr.rcvif = ifp;
2332
2333	return (m);
2334}
2335
2336/*
2337 * 802.3ad LACP
2338 */
2339static void
2340lagg_lacp_attach(struct lagg_softc *sc)
2341{
2342	struct lagg_port *lp;
2343
2344	lacp_attach(sc);
2345	LAGG_XLOCK_ASSERT(sc);
2346	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2347		lacp_port_create(lp);
2348}
2349
2350static void
2351lagg_lacp_detach(struct lagg_softc *sc)
2352{
2353	struct lagg_port *lp;
2354	void *psc;
2355
2356	LAGG_XLOCK_ASSERT(sc);
2357	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2358		lacp_port_destroy(lp);
2359
2360	psc = sc->sc_psc;
2361	sc->sc_psc = NULL;
2362	lacp_detach(psc);
2363}
2364
2365static void
2366lagg_lacp_lladdr(struct lagg_softc *sc)
2367{
2368	struct lagg_port *lp;
2369
2370	LAGG_SXLOCK_ASSERT(sc);
2371
2372	/* purge all the lacp ports */
2373	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2374		lacp_port_destroy(lp);
2375
2376	/* add them back in */
2377	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2378		lacp_port_create(lp);
2379}
2380
2381static int
2382lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
2383{
2384	struct lagg_port *lp;
2385
2386	lp = lacp_select_tx_port(sc, m);
2387	if (lp == NULL) {
2388		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2389		m_freem(m);
2390		return (ENETDOWN);
2391	}
2392
2393	/* Send mbuf */
2394	return (lagg_enqueue(lp->lp_ifp, m));
2395}
2396
2397static struct mbuf *
2398lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2399{
2400	struct ifnet *ifp = sc->sc_ifp;
2401	struct ether_header *eh;
2402	u_short etype;
2403
2404	eh = mtod(m, struct ether_header *);
2405	etype = ntohs(eh->ether_type);
2406
2407	/* Tap off LACP control messages (Slow Protocols ethertype 0x8809) */
2408	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
2409		m = lacp_input(lp, m);
2410		if (m == NULL)
2411			return (NULL);
2412	}
2413
2414	/*
2415	 * If the port is not collecting or not in the active aggregator then
2416	 * free and return.
2417	 */
2418	if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) {
2419		m_freem(m);
2420		return (NULL);
2421	}
2422
2423	m->m_pkthdr.rcvif = ifp;
2424	return (m);
2425}
2426