/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.
 * Copyright (c) 2007-2008,2010
 *	Swinburne University of Technology, Melbourne, Australia.
 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 * Copyright 2020 RackTop Systems, Inc.
 *
 * This software was developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart, James
 * Healy and David Hayes, made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This software was first released in 2007 by James Healy and Lawrence Stewart
 * whilst working on the NewTCP research project at Swinburne University of
 * Technology's Centre for Advanced Internet Architectures, Melbourne,
 * Australia, which was made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 * More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

#include <sys/errno.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/cc.h>
#include <inet/cc/cc_module.h>

static void	newreno_ack_received(struct cc_var *ccv, uint16_t type);
static void	newreno_after_idle(struct cc_var *ccv);
static void	newreno_cong_signal(struct cc_var *ccv, uint32_t type);
static void	newreno_post_recovery(struct cc_var *ccv);

static struct modlmisc cc_newreno_modlmisc = {
	&mod_miscops,
	"New Reno Congestion Control"
};

static struct modlinkage cc_newreno_modlinkage = {
	MODREV_1,
	&cc_newreno_modlmisc,
	NULL
};

struct cc_algo newreno_cc_algo = {
	.name = "newreno",
	.ack_received = newreno_ack_received,
	.after_idle = newreno_after_idle,
	.cong_signal = newreno_cong_signal,
	.post_recovery = newreno_post_recovery,
};

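/*
 * Loadable module entry points: register the algorithm with the cc
 * framework first, then hook the module into the module system, and
 * unwind the cc registration if mod_install() fails.
 */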
int
_init(void)
{
	int err;

	if ((err = cc_register_algo(&newreno_cc_algo)) == 0) {
		if ((err = mod_install(&cc_newreno_modlinkage)) != 0)
			(void) cc_deregister_algo(&newreno_cc_algo);
	}
	return (err);
}

int
_fini(void)
{
	/* XXX Not unloadable for now */
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&cc_newreno_modlinkage, modinfop));
}

static void
newreno_ack_received(struct cc_var *ccv, uint16_t type)
{
	if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
	    (ccv->flags & CCF_CWND_LIMITED)) {
		uint_t cw = CCV(ccv, tcp_cwnd);
		uint_t incr = CCV(ccv, tcp_mss);

		/*
		 * Regular in-order ACK, open the congestion window.
		 * The method depends on which congestion control state we're
		 * in (slow start or cong avoid) and on whether ABC (RFC 3465)
		 * is enabled.
		 *
		 * slow start: cwnd <= ssthresh
		 * cong avoid: cwnd > ssthresh
		 *
		 * slow start and ABC (RFC 3465):
		 *   Grow cwnd exponentially by the amount of data
		 *   ACKed, capping the max increment per ACK to
		 *   (abc_l_var * maxseg) bytes.
		 *
		 * slow start without ABC (RFC 5681):
		 *   Grow cwnd exponentially by maxseg per ACK.
		 *
		 * cong avoid and ABC (RFC 3465):
		 *   Grow cwnd linearly by maxseg per RTT for each
		 *   cwnd worth of ACKed data.
		 *
		 * cong avoid without ABC (RFC 5681):
		 *   Grow cwnd linearly by approximately maxseg per RTT using
		 *   maxseg^2 / cwnd per ACK as the increment.
		 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
		 *   avoid capping cwnd.
		 */
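		/*
		 * Worked example (hypothetical numbers, not from a trace):
		 * with maxseg = 1460 and cwnd = 146000 (100 segments), the
		 * non-ABC cong avoid branch below adds
		 * 1460 * 1460 / 146000 = 14 bytes per ACK; over the ~100
		 * ACKs of one RTT that totals ~1400 bytes, i.e. roughly
		 * one maxseg per RTT, as intended.
		 */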
		if (cw > CCV(ccv, tcp_cwnd_ssthresh)) {
			if (CC_ABC(ccv)) {
				if (ccv->flags & CCF_ABC_SENTAWND)
					ccv->flags &= ~CCF_ABC_SENTAWND;
				else
					incr = 0;
			} else
				incr = max((incr * incr / cw), 1);
		} else if (CC_ABC(ccv)) {
			/*
			 * We're in slow start with ABC enabled. Must not use
			 * abc_l_var > 1 if we're slow starting after an RTO,
			 * so cap the increment at one maxseg in that case.
			 */
			if (ccv->flags & CCF_RTO) {
				incr = min(ccv->bytes_this_ack,
				    CCV(ccv, tcp_mss));
			} else {
				incr = min(ccv->bytes_this_ack,
				    CC_ABC_L_VAR(ccv) * CCV(ccv, tcp_mss));
			}
		}
		/* ABC is on by default, so incr equals 0 frequently. */
		if (incr > 0)
			CCV(ccv, tcp_cwnd) = min(cw + incr,
			    TCP_MAXWIN << CCV(ccv, tcp_snd_ws));
	}
}

static void
newreno_after_idle(struct cc_var *ccv)
{
	int rw;

	/*
	 * If we've been idle for more than one retransmit timeout, the old
	 * congestion window is no longer current and we have to reduce it to
	 * the restart window before we can transmit again.
	 *
	 * The restart window is the initial window or the last CWND, whichever
	 * is smaller.
	 *
	 * This is done to prevent us from flooding the path with a full CWND
	 * at wirespeed, overloading router and switch buffers along the way.
	 *
	 * See RFC 5681 Section 4.1. "Restarting Idle Connections".
	 */
	if (CCV(ccv, tcp_init_cwnd) != 0) {
		/*
		 * The TCP_INIT_CWND socket option was used to override the
		 * default.
		 */
		rw = CCV(ccv, tcp_init_cwnd) * CCV(ccv, tcp_mss);
	} else if (CCSV(ccv, tcps_slow_start_initial) != 0) {
		/* The _slow_start_initial tunable was explicitly set. */
		rw = min(TCP_MAX_INIT_CWND, CCSV(ccv, tcps_slow_start_initial))
		    * CCV(ccv, tcp_mss);
	} else {
		/* Do RFC 3390 */
		rw = min(4 * CCV(ccv, tcp_mss),
		    max(2 * CCV(ccv, tcp_mss), 4380));
	}
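
	/*
	 * For illustration (hypothetical MSS values): the RFC 3390 branch
	 * above yields min(4 * 1460, max(2 * 1460, 4380)) = 4380 bytes
	 * (three segments) for mss = 1460, and min(4 * 536,
	 * max(2 * 536, 4380)) = 2144 bytes (four segments) for mss = 536.
	 */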

	CCV(ccv, tcp_cwnd) = min(rw, CCV(ccv, tcp_cwnd));
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
newreno_cong_signal(struct cc_var *ccv, uint32_t type)
{
	uint32_t cwin, ssthresh_on_loss;
	uint32_t mss;

	cwin = CCV(ccv, tcp_cwnd);
	mss = CCV(ccv, tcp_mss);
	ssthresh_on_loss =
	    max((CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna)) / 2 / mss, 2)
	    * mss;
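
	/*
	 * E.g. (hypothetical numbers): with mss = 1460 and 29200 bytes in
	 * flight (snxt - suna = 20 segments), ssthresh_on_loss =
	 * max(29200 / 2 / 1460, 2) * 1460 = 14600 bytes: half the flight
	 * size, rounded down to a segment boundary and floored at two
	 * segments as RFC 5681 requires.
	 */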

	/* Catch algos which mistakenly leak private signal types. */
	ASSERT((type & CC_SIGPRIVMASK) == 0);

	cwin = max(cwin / 2 / mss, 2) * mss;

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(ccv->flags)) {
			if (!IN_CONGRECOVERY(ccv->flags)) {
				CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
				CCV(ccv, tcp_cwnd) = cwin;
			}
			ENTER_RECOVERY(ccv->flags);
		}
		break;
	case CC_ECN:
		if (!IN_CONGRECOVERY(ccv->flags)) {
			CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
			CCV(ccv, tcp_cwnd) = cwin;
			ENTER_CONGRECOVERY(ccv->flags);
		}
		break;
	case CC_RTO:
		CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
		CCV(ccv, tcp_cwnd) = mss;
		break;
	}
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
newreno_post_recovery(struct cc_var *ccv)
{
	uint32_t pipe;

	if (IN_FASTRECOVERY(ccv->flags)) {
		/*
		 * Fast recovery will conclude after returning from this
		 * function. Window inflation should have left us with
		 * approximately cwnd_ssthresh outstanding data. But if we
		 * would otherwise be inclined to send a burst, better to do
		 * it via the slow start mechanism.
		 */
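		/*
		 * Illustrative case (hypothetical numbers): if only one
		 * segment is still outstanding (pipe = mss), cwnd deflates
		 * to max(mss, mss) + mss = 2 * mss below, so at most one
		 * new segment can be sent and further growth happens via
		 * slow start rather than as a line-rate burst.
		 */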
		pipe = CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna);
		if (pipe < CCV(ccv, tcp_cwnd_ssthresh)) {
			/*
			 * Ensure that cwnd does not collapse to 1 MSS under
			 * adverse conditions. Implements RFC 6582.
			 */
			CCV(ccv, tcp_cwnd) = MAX(pipe, CCV(ccv, tcp_mss)) +
			    CCV(ccv, tcp_mss);
		} else if (CCV(ccv, tcp_cwnd) > CCV(ccv, tcp_cwnd_ssthresh)) {
			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
		}
	}
}