xref: /illumos-gate/usr/src/uts/common/inet/cc.h (revision 8f97fda4)
145a4b79dSSebastien Roy /*
245a4b79dSSebastien Roy  * Copyright (c) 2007-2008
3*8f97fda4SAndy Fiddaman  *	Swinburne University of Technology, Melbourne, Australia.
445a4b79dSSebastien Roy  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
545a4b79dSSebastien Roy  * Copyright (c) 2010 The FreeBSD Foundation
645a4b79dSSebastien Roy  * All rights reserved.
745a4b79dSSebastien Roy  * Copyright (c) 2017 by Delphix. All rights reserved.
845a4b79dSSebastien Roy  *
945a4b79dSSebastien Roy  * This software was developed at the Centre for Advanced Internet
1045a4b79dSSebastien Roy  * Architectures, Swinburne University of Technology, by Lawrence Stewart and
1145a4b79dSSebastien Roy  * James Healy, made possible in part by a grant from the Cisco University
1245a4b79dSSebastien Roy  * Research Program Fund at Community Foundation Silicon Valley.
1345a4b79dSSebastien Roy  *
1445a4b79dSSebastien Roy  * Portions of this software were developed at the Centre for Advanced
1545a4b79dSSebastien Roy  * Internet Architectures, Swinburne University of Technology, Melbourne,
1645a4b79dSSebastien Roy  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
1745a4b79dSSebastien Roy  *
1845a4b79dSSebastien Roy  * Redistribution and use in source and binary forms, with or without
1945a4b79dSSebastien Roy  * modification, are permitted provided that the following conditions
2045a4b79dSSebastien Roy  * are met:
2145a4b79dSSebastien Roy  * 1. Redistributions of source code must retain the above copyright
2245a4b79dSSebastien Roy  *    notice, this list of conditions and the following disclaimer.
2345a4b79dSSebastien Roy  * 2. Redistributions in binary form must reproduce the above copyright
2445a4b79dSSebastien Roy  *    notice, this list of conditions and the following disclaimer in the
2545a4b79dSSebastien Roy  *    documentation and/or other materials provided with the distribution.
2645a4b79dSSebastien Roy  *
2745a4b79dSSebastien Roy  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
2845a4b79dSSebastien Roy  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2945a4b79dSSebastien Roy  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
3045a4b79dSSebastien Roy  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
3145a4b79dSSebastien Roy  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
3245a4b79dSSebastien Roy  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
3345a4b79dSSebastien Roy  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
3445a4b79dSSebastien Roy  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
3545a4b79dSSebastien Roy  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
3645a4b79dSSebastien Roy  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3745a4b79dSSebastien Roy  * SUCH DAMAGE.
3845a4b79dSSebastien Roy  *
3945a4b79dSSebastien Roy  * $FreeBSD$
4045a4b79dSSebastien Roy  */
4145a4b79dSSebastien Roy 
4245a4b79dSSebastien Roy /*
4345a4b79dSSebastien Roy  * This software was first released in 2007 by James Healy and Lawrence Stewart
4445a4b79dSSebastien Roy  * whilst working on the NewTCP research project at Swinburne University of
4545a4b79dSSebastien Roy  * Technology's Centre for Advanced Internet Architectures, Melbourne,
4645a4b79dSSebastien Roy  * Australia, which was made possible in part by a grant from the Cisco
4745a4b79dSSebastien Roy  * University Research Program Fund at Community Foundation Silicon Valley.
4845a4b79dSSebastien Roy  * More details are available at:
4945a4b79dSSebastien Roy  *   http://caia.swin.edu.au/urp/newtcp/
5045a4b79dSSebastien Roy  */
5145a4b79dSSebastien Roy 
5245a4b79dSSebastien Roy #ifndef _NETINET_CC_H_
5345a4b79dSSebastien Roy #define	_NETINET_CC_H_
5445a4b79dSSebastien Roy 
55*8f97fda4SAndy Fiddaman #if (defined(_KERNEL) || defined(_KMEMUSER))
56*8f97fda4SAndy Fiddaman 
5745a4b79dSSebastien Roy #ifdef	__cplusplus
5845a4b79dSSebastien Roy extern "C" {
5945a4b79dSSebastien Roy #endif
6045a4b79dSSebastien Roy 
6145a4b79dSSebastien Roy #include <netinet/tcp.h>
6245a4b79dSSebastien Roy #include <sys/queue.h>
6345a4b79dSSebastien Roy #include <sys/rwlock.h>
6445a4b79dSSebastien Roy 
/*
 * Max congestion control name length.  This is also the size of the name[]
 * buffer embedded in struct cc_algo.
 */
#define	CC_ALGO_NAME_MAX	16

/* Name of the default congestion control algorithm. */
#define	CC_DEFAULT_ALGO_NAME	"sunreno"
6845a4b79dSSebastien Roy 
/*
 * Transport control block types.  They are only referenced through pointers
 * in this header, so incomplete declarations are sufficient.
 */
struct tcp_s;
struct sctp_s;

/*
 * CC housekeeping functions.
 *
 * The implementations are not part of this header.  Judging by the names,
 * cc_load_algo() obtains the algorithm called "name", and
 * cc_register_algo()/cc_deregister_algo() add and remove an algorithm and
 * return an int status -- NOTE(review): confirm the exact return conventions
 * against the implementation.
 */
extern struct cc_algo	*cc_load_algo(const char *name);
extern int	cc_register_algo(struct cc_algo *add_cc);
extern int	cc_deregister_algo(struct cc_algo *remove_cc);
7645a4b79dSSebastien Roy 
7745a4b79dSSebastien Roy /*
7845a4b79dSSebastien Roy  * Wrapper around transport structs that contain same-named congestion
7945a4b79dSSebastien Roy  * control variables. Allows algos to be shared amongst multiple CC aware
8045a4b79dSSebastien Roy  * transports.
8145a4b79dSSebastien Roy  *
8245a4b79dSSebastien Roy  * In theory, this code (from FreeBSD) can be used to support pluggable
8345a4b79dSSebastien Roy  * congestion control for sctp as well as tcp.  However, the support for sctp
8445a4b79dSSebastien Roy  * in FreeBSD is incomplete, and in practice "type" is ignored.  cc_module.h
8545a4b79dSSebastien Roy  * provides a CCV macro which implementations can use to get a variable out of
8645a4b79dSSebastien Roy  * the protocol-appropriate structure.
8745a4b79dSSebastien Roy  *
8845a4b79dSSebastien Roy  * If FreeBSD eventually does extend support for pluggable congestion control
8945a4b79dSSebastien Roy  * to sctp, we'll need to make sure we're setting "type" appropriately or use
9045a4b79dSSebastien Roy  * a definition of CCV that ignores it.
9145a4b79dSSebastien Roy  */
9245a4b79dSSebastien Roy struct cc_var {
9345a4b79dSSebastien Roy 	void		*cc_data; /* Per-connection private algorithm data. */
9445a4b79dSSebastien Roy 	int		bytes_this_ack; /* # bytes acked by the current ACK. */
9545a4b79dSSebastien Roy 	int		t_bytes_acked; /* # bytes acked during current RTT */
9645a4b79dSSebastien Roy 	tcp_seq		curack; /* Most recent ACK. */
9745a4b79dSSebastien Roy 	uint32_t	flags; /* Flags for cc_var (see below) */
9845a4b79dSSebastien Roy 	int		type; /* Indicates which ptr is valid in ccvc. */
9945a4b79dSSebastien Roy 	union ccv_container {
10045a4b79dSSebastien Roy 		struct tcp_s	*tcp;
10145a4b79dSSebastien Roy 		struct sctp_s	*sctp;
10245a4b79dSSebastien Roy 	} ccvc;
10345a4b79dSSebastien Roy 	uint16_t	nsegs; /* # segments coalesced into current chain. */
10445a4b79dSSebastien Roy };
10545a4b79dSSebastien Roy 
/*
 * cc_var flags.
 *
 * CCF_ABC_SENTAWND is set when a full congestion window of data has been ACKed
 *   according to the Appropriate Byte Counting spec, defined in RFC 3465.
 */
#define	CCF_ABC_SENTAWND	0x0001	/* ABC counted cwnd worth of bytes? */
#define	CCF_CWND_LIMITED	0x0002	/* Are we currently cwnd limited? */
#define	CCF_FASTRECOVERY	0x0004	/* in NewReno Fast Recovery */
#define	CCF_WASFRECOVERY	0x0008	/* was in NewReno Fast Recovery */
#define	CCF_CONGRECOVERY	0x0010	/* congestion recovery mode */
#define	CCF_WASCRECOVERY	0x0020	/* was in congestion recovery */
/*
 * In slow-start due to a retransmission timeout. This flag is enabled for the
 * duration of the slow-start phase.
 */
#define	CCF_RTO			0x0040	/* in slow-start due to timeout */

/*
 * Helpers to test, set and clear the recovery bits of a cc_var flags word.
 *
 * IN_*() evaluate to the masked bits, i.e. non-zero when any of the tested
 * bits is set.  ENTER_*() and EXIT_*() modify "flags" in place, so the
 * argument must be a modifiable lvalue; they evaluate to the updated value.
 *
 * Both the argument and the whole expansion are parenthesized so that each
 * macro acts as one expression regardless of the operators used in the
 * argument or surrounding the macro invocation.
 */
#define	IN_FASTRECOVERY(flags)		((flags) & CCF_FASTRECOVERY)
#define	ENTER_FASTRECOVERY(flags)	((flags) |= CCF_FASTRECOVERY)
#define	EXIT_FASTRECOVERY(flags)	((flags) &= ~CCF_FASTRECOVERY)

#define	IN_CONGRECOVERY(flags)		((flags) & CCF_CONGRECOVERY)
#define	ENTER_CONGRECOVERY(flags)	((flags) |= CCF_CONGRECOVERY)
#define	EXIT_CONGRECOVERY(flags)	((flags) &= ~CCF_CONGRECOVERY)

/* Either kind of recovery (fast recovery or congestion recovery). */
#define	IN_RECOVERY(flags)	\
	((flags) & (CCF_CONGRECOVERY | CCF_FASTRECOVERY))
#define	ENTER_RECOVERY(flags)	\
	((flags) |= (CCF_CONGRECOVERY | CCF_FASTRECOVERY))
#define	EXIT_RECOVERY(flags)	\
	((flags) &= ~(CCF_CONGRECOVERY | CCF_FASTRECOVERY))
13545a4b79dSSebastien Roy 
/*
 * ACK types passed to the ack_received() hook.
 *
 * CC_ACK is passed when an ACK acknowledges previously unACKed data.
 * CC_DUPACK is passed when a duplicate ACK is received.  The conditions under
 *   which an ACK is considered a duplicate ACK are defined in RFC 5681.
 * CC_PARTIALACK and CC_SACK are defined but marked "not yet" in use.
 */
#define	CC_ACK		0x0001	/* Regular in sequence ACK. */
#define	CC_DUPACK	0x0002	/* Duplicate ACK. */
#define	CC_PARTIALACK	0x0004	/* Not yet. */
#define	CC_SACK		0x0008	/* Not yet. */
14745a4b79dSSebastien Roy 
/*
 * Congestion signal types passed to the cong_signal() hook. The highest order 8
 * bits (0x01000000 - 0x80000000) are reserved for CC algos to declare their own
 * congestion signal types.
 *
 * The congestion signals defined here cover the following situations:
 * CC_ECN: A packet with an Explicit Congestion Notification was received.
 *   See RFC 3168.
 * CC_RTO: A round-trip timeout occurred.
 * CC_RTO_ERR: An ACK was received for a sequence number after we fired an RTO
 *   for that sequence number.
 * CC_NDUPACK: Trigger fast retransmit based on the assumption that receiving
 *   N duplicate ACKs indicates packet loss rather than reordering.  Fast
 *   retransmit is followed by fast recovery.  Fast retransmit and recovery
 *   were originally described in RFC 2581 and were updated by RFC 3782
 *   (NewReno).  In both RFC 2581 and RFC 3782, N is 3.
 */
#define	CC_ECN		0x00000001	/* ECN marked packet received. */
#define	CC_RTO		0x00000002	/* RTO fired. */
#define	CC_RTO_ERR	0x00000004	/* RTO fired in error. */
#define	CC_NDUPACK	0x00000008	/* Threshold of dupack's reached. */

#define	CC_SIGPRIVMASK	0xFF000000	/* Mask to check if sig is private. */
17145a4b79dSSebastien Roy 
17245a4b79dSSebastien Roy /*
17345a4b79dSSebastien Roy  * Structure to hold data and function pointers that together represent a
17445a4b79dSSebastien Roy  * congestion control algorithm.
17545a4b79dSSebastien Roy  */
17645a4b79dSSebastien Roy struct cc_algo {
17745a4b79dSSebastien Roy 	char	name[CC_ALGO_NAME_MAX];
17845a4b79dSSebastien Roy 
17945a4b79dSSebastien Roy 	/* Init CC state for a new control block. */
18045a4b79dSSebastien Roy 	int	(*cb_init)(struct cc_var *ccv);
18145a4b79dSSebastien Roy 
18245a4b79dSSebastien Roy 	/* Cleanup CC state for a terminating control block. */
18345a4b79dSSebastien Roy 	void	(*cb_destroy)(struct cc_var *ccv);
18445a4b79dSSebastien Roy 
18545a4b79dSSebastien Roy 	/* Init variables for a newly established connection. */
18645a4b79dSSebastien Roy 	void	(*conn_init)(struct cc_var *ccv);
18745a4b79dSSebastien Roy 
18845a4b79dSSebastien Roy 	/* Called on receipt of an ack. */
18945a4b79dSSebastien Roy 	void	(*ack_received)(struct cc_var *ccv, uint16_t type);
19045a4b79dSSebastien Roy 
19145a4b79dSSebastien Roy 	/* Called on detection of a congestion signal. */
19245a4b79dSSebastien Roy 	void	(*cong_signal)(struct cc_var *ccv, uint32_t type);
19345a4b79dSSebastien Roy 
19445a4b79dSSebastien Roy 	/* Called after exiting congestion recovery. */
19545a4b79dSSebastien Roy 	void	(*post_recovery)(struct cc_var *ccv);
19645a4b79dSSebastien Roy 
19745a4b79dSSebastien Roy 	/* Called when data transfer resumes after an idle period. */
19845a4b79dSSebastien Roy 	void	(*after_idle)(struct cc_var *ccv);
19945a4b79dSSebastien Roy 
20045a4b79dSSebastien Roy 	STAILQ_ENTRY(cc_algo) entries;
20145a4b79dSSebastien Roy };
20245a4b79dSSebastien Roy 
/*
 * Callback type accepted by cc_walk_algos().  It receives an untyped pointer
 * and a congestion control algorithm and returns an int.
 *
 * NOTE(review): the void * handed to cc_walk_algos() is presumably forwarded
 * to the callback as its first argument, and the callback's return value
 * presumably controls or becomes the result of the walk -- confirm against
 * the implementation, which is not part of this header.
 */
typedef int cc_walk_func_t(void *, struct cc_algo *);
extern int	cc_walk_algos(cc_walk_func_t *, void *);
20545a4b79dSSebastien Roy 
/*
 * Accessors for the congestion control state kept in a transport control
 * block.  "tp" must be a pointer to a structure that has a tcp_cc_algo member
 * and a tcp_ccv member (the latter containing cc_data).
 */

/* Macro to obtain the CC algo's struct ptr. */
#define	CC_ALGO(tp)	((tp)->tcp_cc_algo)

/* Macro to obtain the CC algo's data ptr. */
#define	CC_DATA(tp)	((tp)->tcp_ccv.cc_data)
21145a4b79dSSebastien Roy 
21245a4b79dSSebastien Roy #ifdef	__cplusplus
21345a4b79dSSebastien Roy }
21445a4b79dSSebastien Roy #endif
21545a4b79dSSebastien Roy 
216*8f97fda4SAndy Fiddaman #endif	/* (defined(_KERNEL) || defined(_KMEMUSER)) */
217*8f97fda4SAndy Fiddaman 
21845a4b79dSSebastien Roy #endif /* _NETINET_CC_H_ */
219