/*
 * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of the CUBIC congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
 * Originally released as part of the NewTCP research project at Swinburne
 * University of Technology's Centre for Advanced Internet Architectures,
 * Melbourne, Australia, which was made possible in part by a grant from the
 * Cisco University Research Program Fund at Community Foundation Silicon
 * Valley. More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */
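
/*
 * A rough sketch of the window equations this implementation follows (see
 * the I-D for the authoritative definitions; the fixed-point constants and
 * the cubic_cwnd(), tf_cwnd() and cubic_k() helpers used below come from
 * <inet/cc/cc_cubic.h>):
 *
 *	W_cubic(t) = C * (t - K)^3 + max_cwnd
 *	K          = cbrt(max_cwnd * (1 - beta) / C)
 *
 * where t is the time since the last congestion event, max_cwnd is the
 * window at that event (the code passes max_cwnd / mss, i.e. segments, to
 * cubic_k()), beta is the multiplicative decrease factor and C is a scaling
 * constant. tf_cwnd() estimates the window a standard AIMD flow would have
 * reached over the same interval, which bounds CUBIC from below in the
 * TCP-friendly region.
 */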

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/time.h>

#include <inet/tcp_impl.h>
#include <inet/cc.h>
#include <inet/cc/cc_cubic.h>
#include <inet/cc/cc_module.h>

static struct modlmisc cc_cubic_modlmisc = {
	&mod_miscops,
	"Cubic Congestion Control"
};

static struct modlinkage cc_cubic_modlinkage = {
	MODREV_1,
	&cc_cubic_modlmisc,
	NULL
};

/*
 * CUBIC leans on NewReno in two places: cubic_after_idle() resets cwnd by
 * calling NewReno's after_idle, and NewReno's ack_received logic is used
 * while in slow start.
 */
static struct cc_algo *newreno_cc_algo;

static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void	cubic_cb_destroy(struct cc_var *ccv);
static int	cubic_cb_init(struct cc_var *ccv);
static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void	cubic_conn_init(struct cc_var *ccv);
static void	cubic_post_recovery(struct cc_var *ccv);
static void	cubic_record_rtt(struct cc_var *ccv);
static void	cubic_ssthresh_update(struct cc_var *ccv);
static void	cubic_after_idle(struct cc_var *ccv);

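/*
 * Per-connection CUBIC state, allocated in cubic_cb_init() and hung off
 * ccv->cc_data. The window fields are cwnd values in bytes (they are
 * assigned from tcp_cwnd); the time and RTT fields carry gethrtime()
 * nanoseconds.
 */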
struct cubic {
	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
	int64_t		K;
	/* Sum of RTT samples across an epoch in nanoseconds. */
	hrtime_t	sum_rtt_nsecs;
	/* cwnd at the most recent congestion event. */
	uint32_t	max_cwnd;
	/* cwnd at the previous congestion event. */
	uint32_t	prev_max_cwnd;
	/* Number of congestion events. */
	uint32_t	num_cong_events;
	/* Minimum observed rtt in nanoseconds. */
	hrtime_t	min_rtt_nsecs;
	/* Mean observed rtt between congestion epochs. */
	hrtime_t	mean_rtt_nsecs;
	/* ACKs since last congestion event. */
	int		epoch_ack_count;
	/* Time of last congestion event in nanoseconds. */
	hrtime_t	t_last_cong;
};

struct cc_algo cubic_cc_algo = {
	.name = "cubic",
	.ack_received = cubic_ack_received,
	.cb_destroy = cubic_cb_destroy,
	.cb_init = cubic_cb_init,
	.cong_signal = cubic_cong_signal,
	.conn_init = cubic_conn_init,
	.post_recovery = cubic_post_recovery,
	.after_idle = cubic_after_idle,
};

int
_init(void)
{
	int err;

	if ((newreno_cc_algo = cc_load_algo("newreno")) == NULL)
		return (EINVAL);

	if ((err = cc_register_algo(&cubic_cc_algo)) == 0) {
		if ((err = mod_install(&cc_cubic_modlinkage)) != 0)
			(void) cc_deregister_algo(&cubic_cc_algo);
	}

	return (err);
}

int
_fini(void)
{
	/* XXX Not unloadable for now */
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&cc_cubic_modlinkage, modinfop));
}

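/*
 * Once past slow start, each qualifying ACK computes two candidate windows:
 * w_tf, an estimate of what a standard AIMD (NewReno-style) flow would have
 * grown to since the last congestion event, and w_cubic_next, the CUBIC
 * window one mean RTT from now. cwnd is then grown toward whichever regime
 * applies. While cwnd is at or below ssthresh, or before a usable RTT sample
 * exists, NewReno's ack_received logic is used instead.
 */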
static void
cubic_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct cubic *cubic_data;
	uint32_t w_tf, w_cubic_next;
	hrtime_t nsecs_since_cong;

	cubic_data = ccv->cc_data;
	cubic_record_rtt(ccv);

	/*
	 * Process this ACK only if it is a regular ACK, we are not in
	 * congestion/fast recovery, we are cwnd limited, and one of the
	 * following holds: ABC is disabled, we are still in slow start, or
	 * ABC is enabled and a cwnd's worth of bytes has been sent.
	 */
	if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!CC_ABC(ccv) ||
	    CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
	    (CC_ABC(ccv) && (ccv->flags & CCF_ABC_SENTAWND)))) {
		/* Use the logic in NewReno ack_received() for slow start. */
		if (CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)
			newreno_cc_algo->ack_received(ccv, type);
		else {
			nsecs_since_cong = gethrtime() -
			    cubic_data->t_last_cong;

			/*
			 * The mean RTT is used to best reflect the equations in
			 * the I-D. Using min_rtt in the tf_cwnd calculation
			 * causes w_tf to grow much faster than it should if the
			 * RTT is dominated by network buffering rather than
			 * propagation delay.
			 */
			w_tf = tf_cwnd(nsecs_since_cong,
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss));

			w_cubic_next = cubic_cwnd(nsecs_since_cong +
			    cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
			    CCV(ccv, tcp_mss), cubic_data->K);

			ccv->flags &= ~CCF_ABC_SENTAWND;

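			/*
			 * A sketch of what follows (not normative): in the
			 * CUBIC (concave/convex) region, ABC sets cwnd
			 * directly to the target, while without ABC each ACK
			 * nudges cwnd toward w_cubic_next by roughly
			 * (w_cubic_next - cwnd) * mss / cwnd bytes, so the
			 * gap closes over about one cwnd's worth of ACKs.
			 */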
			if (w_cubic_next < w_tf) {
				/*
				 * TCP-friendly region, follow tf
				 * cwnd growth.
				 */
				if (CCV(ccv, tcp_cwnd) < w_tf)
					CCV(ccv, tcp_cwnd) = w_tf;
			} else if (CCV(ccv, tcp_cwnd) < w_cubic_next) {
				/*
				 * Concave or convex region, follow CUBIC
				 * cwnd growth.
				 */
				if (CC_ABC(ccv))
					CCV(ccv, tcp_cwnd) = MIN(w_cubic_next,
					    INT_MAX);
				else
					CCV(ccv, tcp_cwnd) += MAX(1,
					    ((MIN(w_cubic_next, INT_MAX) -
					    CCV(ccv, tcp_cwnd)) *
					    CCV(ccv, tcp_mss)) /
					    CCV(ccv, tcp_cwnd));
			}

			/*
			 * If we're not in slow start and we're probing for a
			 * new cwnd limit at the start of a connection
			 * (happens when hostcache has a relevant entry),
			 * keep updating our current estimate of the
			 * max_cwnd.
			 */
			if (cubic_data->num_cong_events == 0 &&
			    cubic_data->max_cwnd < CCV(ccv, tcp_cwnd)) {
				cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
				cubic_data->K = cubic_k(cubic_data->max_cwnd /
				    CCV(ccv, tcp_mss));
			}
		}
	}
}

/*
 * This is a CUBIC-specific implementation of after_idle.
 *   - Update max_cwnd (and K) to account for the cwnd reached before idle.
 *   - Reset cwnd by calling the NewReno implementation of after_idle.
 *   - Reset t_last_cong.
 */
static void
cubic_after_idle(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	cubic_data->max_cwnd = max(cubic_data->max_cwnd, CCV(ccv, tcp_cwnd));
	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss));

	newreno_cc_algo->after_idle(ccv);
	cubic_data->t_last_cong = gethrtime();
}

static void
cubic_cb_destroy(struct cc_var *ccv)
{

	if (ccv->cc_data != NULL)
		kmem_free(ccv->cc_data, sizeof (struct cubic));
}

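/*
 * Allocate and initialize per-connection state. Fields not set here are
 * expected to start at zero (e.g. num_cong_events and max_cwnd, which the
 * rest of the file treats as "no congestion event seen yet"); only the time
 * and RTT fields need explicit non-zero defaults.
 */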
static int
cubic_cb_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = kmem_zalloc(sizeof (struct cubic), KM_NOSLEEP);

	if (cubic_data == NULL)
		return (ENOMEM);

	/* Init some key variables with sensible defaults. */
	cubic_data->t_last_cong = gethrtime();
	cubic_data->min_rtt_nsecs = TCPTV_SRTTBASE;
	cubic_data->mean_rtt_nsecs = 1;

	ccv->cc_data = cubic_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
static void
cubic_cong_signal(struct cc_var *ccv, uint32_t type)
{
	struct cubic *cubic_data;
	uint32_t cwin;
	uint32_t mss;

	cubic_data = ccv->cc_data;
	cwin = CCV(ccv, tcp_cwnd);
	mss = CCV(ccv, tcp_mss);

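	/*
	 * The cases below correspond to the cc framework's congestion
	 * signals: CC_NDUPACK fires on the duplicate-ACK threshold (entry
	 * into fast recovery), CC_ECN on an ECN congestion echo, and CC_RTO
	 * on a retransmission timeout.
	 */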
	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(ccv->flags)) {
			if (!IN_CONGRECOVERY(ccv->flags)) {
				cubic_ssthresh_update(ccv);
				cubic_data->num_cong_events++;
				cubic_data->prev_max_cwnd =
				    cubic_data->max_cwnd;
				cubic_data->max_cwnd = cwin;
				CCV(ccv, tcp_cwnd) =
				    CCV(ccv, tcp_cwnd_ssthresh);
			}
			ENTER_RECOVERY(ccv->flags);
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(ccv->flags)) {
			cubic_ssthresh_update(ccv);
			cubic_data->num_cong_events++;
			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
			cubic_data->max_cwnd = cwin;
			cubic_data->t_last_cong = gethrtime();
			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
			ENTER_CONGRECOVERY(ccv->flags);
		}
		break;

	case CC_RTO:
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was.
		 */
		cubic_data->num_cong_events++;
		cubic_data->t_last_cong = gethrtime();
		cubic_ssthresh_update(ccv);
		cubic_data->max_cwnd = cwin;
		CCV(ccv, tcp_cwnd) = mss;
		break;
	}
}

static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we record a sane initial value for max_cwnd. Without it,
	 * connections whose state is seeded from the TCP hostcache can start
	 * out with a bogus max_cwnd.
	 */
	cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	uint32_t mss, pipe;

	cubic_data = ccv->cc_data;

	/*
	 * Fast convergence heuristic: if this event's max_cwnd came in below
	 * the previous one, we are likely competing with new flows, so shrink
	 * the recorded max_cwnd further (by CUBIC_FC_FACTOR) to give up
	 * bandwidth to them sooner.
	 */
	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) {
		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
		    >> CUBIC_SHIFT;
	}

	mss = CCV(ccv, tcp_mss);

	if (IN_FASTRECOVERY(ccv->flags)) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
		 */
		pipe = CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna);
		if (pipe < CCV(ccv, tcp_cwnd_ssthresh)) {
			/*
			 * Ensure that cwnd does not collapse to 1 MSS under
			 * adverse conditions. Implements RFC 6582.
			 */
			CCV(ccv, tcp_cwnd) = MAX(pipe, mss) + mss;
		} else {
			/* Update cwnd based on beta and adjusted max_cwnd. */
			CCV(ccv, tcp_cwnd) = max(1, ((CUBIC_BETA *
			    cubic_data->max_cwnd) >> CUBIC_SHIFT));
		}
	}

	cubic_data->t_last_cong = gethrtime();

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_nsecs >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_nsecs =
		    (cubic_data->sum_rtt_nsecs / cubic_data->epoch_ack_count);
	}

	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_nsecs = 0;
	cubic_data->K = cubic_k(cubic_data->max_cwnd / mss);
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int t_srtt_nsecs;

	/* Ignore srtt until a minimum number of samples has been taken. */
	if (CCV(ccv, tcp_rtt_update) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		/* tcp_rtt_sa is 8 * smoothed RTT in nanoseconds */
		t_srtt_nsecs = CCV(ccv, tcp_rtt_sa) >> 3;

		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialized
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_nsecs < cubic_data->min_rtt_nsecs ||
		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)) {
			cubic_data->min_rtt_nsecs = max(1, t_srtt_nsecs);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_nsecs with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_nsecs >
			    cubic_data->mean_rtt_nsecs)
				cubic_data->mean_rtt_nsecs =
				    cubic_data->min_rtt_nsecs;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_nsecs += t_srtt_nsecs;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5; on
	 * subsequent congestion events, set it to cwnd * beta.
	 */
	if (cubic_data->num_cong_events == 0)
		CCV(ccv, tcp_cwnd_ssthresh) = CCV(ccv, tcp_cwnd) >> 1;
	else
		CCV(ccv, tcp_cwnd_ssthresh) =
		    (CCV(ccv, tcp_cwnd) * CUBIC_BETA) >> CUBIC_SHIFT;
}
471