/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.
 * Copyright (c) 2007-2008,2010
 *	Swinburne University of Technology, Melbourne, Australia.
 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 * Copyright 2020 RackTop Systems, Inc.
 *
 * This software was developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart, James
 * Healy and David Hayes, made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This software was first released in 2007 by James Healy and Lawrence Stewart
 * whilst working on the NewTCP research project at Swinburne University of
 * Technology's Centre for Advanced Internet Architectures, Melbourne,
 * Australia, which was made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 * More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

#include <sys/errno.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/cc.h>
#include <inet/cc/cc_module.h>

static void	newreno_ack_received(struct cc_var *ccv, uint16_t type);
static void	newreno_after_idle(struct cc_var *ccv);
static void	newreno_cong_signal(struct cc_var *ccv, uint32_t type);
static void	newreno_post_recovery(struct cc_var *ccv);

static struct modlmisc cc_newreno_modlmisc = {
	&mod_miscops,
	"New Reno Congestion Control"
};

static struct modlinkage cc_newreno_modlinkage = {
	MODREV_1,
	&cc_newreno_modlmisc,
	NULL
};

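/*
 * Ops vector describing the NewReno algorithm to the congestion control
 * framework; the framework invokes these hooks as ACKs arrive, connections
 * restart after idle, and congestion is signalled or recovery completes.
 */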
struct cc_algo newreno_cc_algo = {
	.name = "newreno",
	.ack_received = newreno_ack_received,
	.after_idle = newreno_after_idle,
	.cong_signal = newreno_cong_signal,
	.post_recovery = newreno_post_recovery,
};

int
_init(void)
{
	int err;

	if ((err = cc_register_algo(&newreno_cc_algo)) == 0) {
		if ((err = mod_install(&cc_newreno_modlinkage)) != 0)
			(void) cc_deregister_algo(&newreno_cc_algo);
	}
	return (err);
}

int
_fini(void)
{
	/* XXX Not unloadable for now */
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&cc_newreno_modlinkage, modinfop));
}

static void
newreno_ack_received(struct cc_var *ccv, uint16_t type)
{
	if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
	    (ccv->flags & CCF_CWND_LIMITED)) {
		uint_t cw = CCV(ccv, tcp_cwnd);
		uint_t incr = CCV(ccv, tcp_mss);

		/*
		 * Regular in-order ACK, open the congestion window.
		 * Method depends on which congestion control state we're
		 * in (slow start or cong avoid) and if ABC (RFC 3465) is
		 * enabled.
		 *
		 * slow start: cwnd <= ssthresh
		 * cong avoid: cwnd > ssthresh
		 *
		 * slow start and ABC (RFC 3465):
		 *   Grow cwnd exponentially by the amount of data
		 *   ACKed, capping the max increment per ACK to
		 *   (abc_l_var * maxseg) bytes.
		 *
		 * slow start without ABC (RFC 5681):
		 *   Grow cwnd exponentially by maxseg per ACK.
		 *
		 * cong avoid and ABC (RFC 3465):
		 *   Grow cwnd linearly by maxseg per RTT for each
		 *   cwnd worth of ACKed data.
		 *
		 * cong avoid without ABC (RFC 5681):
		 *   Grow cwnd linearly by approximately maxseg per RTT using
		 *   maxseg^2 / cwnd per ACK as the increment.
		 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
		 *   avoid capping cwnd.
		 */
		if (cw > CCV(ccv, tcp_cwnd_ssthresh)) {
			if (CC_ABC(ccv)) {
				if (ccv->flags & CCF_ABC_SENTAWND)
					ccv->flags &= ~CCF_ABC_SENTAWND;
				else
					incr = 0;
			} else
				incr = max((incr * incr / cw), 1);
		} else if (CC_ABC(ccv)) {
			/*
			 * Slow start with ABC enabled. We must not grow cwnd
			 * by more than one MSS per ACK if we are slow
			 * starting after an RTO, hence the CCF_RTO check;
			 * otherwise cap the increment at abc_l_var * MSS.
			 */
			if (ccv->flags & CCF_RTO) {
				incr = min(ccv->bytes_this_ack,
				    CCV(ccv, tcp_mss));
			} else {
				incr = min(ccv->bytes_this_ack,
				    CC_ABC_L_VAR(ccv) * CCV(ccv, tcp_mss));
			}
		}
		/* ABC is on by default, so incr equals 0 frequently. */
		if (incr > 0)
			CCV(ccv, tcp_cwnd) = min(cw + incr,
			    TCP_MAXWIN << CCV(ccv, tcp_snd_ws));
	}
}

static void
newreno_after_idle(struct cc_var *ccv)
{
	int rw;

	/*
	 * If we've been idle for more than one retransmit timeout, the old
	 * congestion window is no longer current and we have to reduce it to
	 * the restart window before we can transmit again.
	 *
	 * The restart window is the initial window or the last CWND, whichever
	 * is smaller.
	 *
	 * This is done to prevent us from flooding the path with a full CWND
	 * at wirespeed, overloading router and switch buffers along the way.
	 *
	 * See RFC 5681 Section 4.1. "Restarting Idle Connections".
	 */
	if (CCV(ccv, tcp_init_cwnd) != 0) {
		/*
		 * The TCP_INIT_CWND socket option was used to override the
		 * default.
		 */
		rw = CCV(ccv, tcp_init_cwnd) * CCV(ccv, tcp_mss);
	} else if (CCSV(ccv, tcps_slow_start_initial) != 0) {
		/* The _slow_start_initial tunable was explicitly set. */
		rw = min(TCP_MAX_INIT_CWND, CCSV(ccv, tcps_slow_start_initial))
		    * CCV(ccv, tcp_mss);
	} else {
		/* Do RFC 3390 */
		rw = min(4 * CCV(ccv, tcp_mss),
		    max(2 * CCV(ccv, tcp_mss), 4380));
	}

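	/* Only ever shrink cwnd here; never grow it past its current value. */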
	CCV(ccv, tcp_cwnd) = min(rw, CCV(ccv, tcp_cwnd));
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
newreno_cong_signal(struct cc_var *ccv, uint32_t type)
{
	uint32_t cwin, ssthresh_on_loss;
	uint32_t mss;

	cwin = CCV(ccv, tcp_cwnd);
	mss = CCV(ccv, tcp_mss);
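	/*
	 * Per RFC 5681, on loss the slow start threshold becomes
	 * max(FlightSize / 2, 2 * MSS), where snxt - suna is the amount of
	 * outstanding (in-flight) data, rounded down to a multiple of MSS.
	 */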
	ssthresh_on_loss =
	    max((CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna)) / 2 / mss, 2)
	    * mss;

	/* Catch algos which mistakenly leak private signal types. */
	ASSERT((type & CC_SIGPRIVMASK) == 0);

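	/* Halve cwnd, but never below two segments, aligned to an MSS. */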
	cwin = max(cwin / 2 / mss, 2) * mss;

	switch (type) {
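	/*
	 * Duplicate-ACK threshold reached: cut ssthresh and cwnd and enter
	 * fast recovery, unless we are already recovering.
	 */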
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(ccv->flags)) {
			if (!IN_CONGRECOVERY(ccv->flags)) {
				CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
				CCV(ccv, tcp_cwnd) = cwin;
			}
			ENTER_RECOVERY(ccv->flags);
		}
		break;
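	/*
	 * ECN congestion signal: reduce ssthresh and cwnd as for loss and
	 * enter congestion recovery (RFC 3168).
	 */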
	case CC_ECN:
		if (!IN_CONGRECOVERY(ccv->flags)) {
			CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
			CCV(ccv, tcp_cwnd) = cwin;
			ENTER_CONGRECOVERY(ccv->flags);
		}
		break;
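	/*
	 * Retransmission timeout: collapse cwnd to one segment so we restart
	 * in slow start (RFC 5681 Section 3.1).
	 */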
	case CC_RTO:
		CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
		CCV(ccv, tcp_cwnd) = mss;
		break;
	}
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
newreno_post_recovery(struct cc_var *ccv)
{
	uint32_t pipe;

	if (IN_FASTRECOVERY(ccv->flags)) {
		/*
		 * Fast recovery will conclude after returning from this
		 * function. Window inflation should have left us with
		 * approximately cwnd_ssthresh outstanding data. But in case we
		 * would be inclined to send a burst, better to do it via the
		 * slow start mechanism.
		 */
		pipe = CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna);
		if (pipe < CCV(ccv, tcp_cwnd_ssthresh)) {
			/*
			 * Ensure that cwnd does not collapse to 1 MSS under
			 * adverse conditions. Implements RFC 6582 (NewReno).
			 */
			CCV(ccv, tcp_cwnd) = MAX(pipe, CCV(ccv, tcp_mss)) +
			    CCV(ccv, tcp_mss);
		} else if (CCV(ccv, tcp_cwnd) > CCV(ccv, tcp_cwnd_ssthresh)) {
			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
		}
	}
}