altq_red.c revision 090d3896f59f5c4b730f6f1bb169ad850768ca6a
1/*-
2 * Copyright (C) 1997-2003
3 *	Sony Computer Science Laboratories Inc.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 */
27/*-
28 * Copyright (c) 1990-1994 Regents of the University of California.
29 * All rights reserved.
30 *
31 * Redistribution and use in source and binary forms, with or without
32 * modification, are permitted provided that the following conditions
33 * are met:
34 * 1. Redistributions of source code must retain the above copyright
35 *    notice, this list of conditions and the following disclaimer.
36 * 2. Redistributions in binary form must reproduce the above copyright
37 *    notice, this list of conditions and the following disclaimer in the
38 *    documentation and/or other materials provided with the distribution.
39 * 3. All advertising materials mentioning features or use of this software
40 *    must display the following acknowledgement:
41 *	This product includes software developed by the Computer Systems
42 *	Engineering Group at Lawrence Berkeley Laboratory.
43 * 4. Neither the name of the University nor of the Laboratory may be used
44 *    to endorse or promote products derived from this software without
45 *    specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 *
59 * $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $
60 * $FreeBSD$
61 */
62
63#include "opt_altq.h"
64#include "opt_inet.h"
65#include "opt_inet6.h"
66#ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
67
68#include <sys/param.h>
69#include <sys/malloc.h>
70#include <sys/mbuf.h>
71#include <sys/socket.h>
72#include <sys/systm.h>
73#include <sys/errno.h>
74#if 1 /* ALTQ3_COMPAT */
75#include <sys/sockio.h>
76#include <sys/proc.h>
77#include <sys/kernel.h>
78#ifdef ALTQ_FLOWVALVE
79#include <sys/queue.h>
80#include <sys/time.h>
81#endif
82#endif /* ALTQ3_COMPAT */
83
84#include <net/if.h>
85#include <net/if_var.h>
86
87#include <netinet/in.h>
88#include <netinet/in_systm.h>
89#include <netinet/ip.h>
90#ifdef INET6
91#include <netinet/ip6.h>
92#endif
93
94#include <netpfil/pf/pf.h>
95#include <netpfil/pf/pf_altq.h>
96#include <netpfil/pf/pf_mtag.h>
97#include <net/altq/altq.h>
98#include <net/altq/altq_red.h>
99
100/*
101 * ALTQ/RED (Random Early Detection) implementation using 32-bit
102 * fixed-point calculation.
103 *
104 * written by kjc using the ns code as a reference.
105 * you can learn more about red and ns from Sally's home page at
106 * http://www-nrg.ee.lbl.gov/floyd/
107 *
108 * most of the red parameter values are fixed in this implementation
109 * to prevent fixed-point overflow/underflow.
110 * if you change the parameters, watch out for overflow/underflow!
111 *
112 * the parameters used are recommended values by Sally.
113 * the corresponding ns config looks:
114 *	q_weight=0.00195
115 *	minthresh=5 maxthresh=15 queue-size=60
116 *	linterm=30
117 *	dropmech=drop-tail
118 *	bytes=false (can't be handled by 32-bit fixed-point)
119 *	doubleq=false dqthresh=false
120 *	wait=true
121 */
122/*
123 * alternative red parameters for a slow link.
124 *
125 * assume the queue length becomes from zero to L and keeps L, it takes
126 * N packets for q_avg to reach 63% of L.
127 * when q_weight is 0.002, N is about 500 packets.
128 * for a slow link like dial-up, 500 packets takes more than 1 minute!
129 * when q_weight is 0.008, N is about 127 packets.
130 * when q_weight is 0.016, N is about 63 packets.
131 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
132 * are allowed for 0.016.
133 * see Sally's paper for more details.
134 */
135/* normal red parameters */
136#define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
137				/* q_weight = 0.00195 */
138
139/* red parameters for a slow link */
140#define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
141				/* q_weight = 0.0078125 */
142
143/* red parameters for a very slow link (e.g., dialup) */
144#define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
145				/* q_weight = 0.015625 */
146
147/* fixed-point uses 12-bit decimal places */
148#define	FP_SHIFT	12	/* fixed-point shift */
149
150/* red parameters for drop probability */
151#define	INV_P_MAX	10	/* inverse of max drop probability */
152#define	TH_MIN		5	/* min threshold */
153#define	TH_MAX		15	/* max threshold */
154
155#define	RED_LIMIT	60	/* default max queue length */
156#define	RED_STATS		/* collect statistics */
157
158/*
159 * our default policy for forced-drop is drop-tail.
160 * (in altq-1.1.2 or earlier, the default was random-drop.
161 * but it makes more sense to punish the cause of the surge.)
162 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
163 */
164
165
166/* default red parameter values */
167static int default_th_min = TH_MIN;
168static int default_th_max = TH_MAX;
169static int default_inv_pmax = INV_P_MAX;
170
171
172/*
173 * red support routines
174 */
175red_t *
176red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
177   int pkttime)
178{
179	red_t	*rp;
180	int	 w, i;
181	int	 npkts_per_sec;
182
183	rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO);
184	if (rp == NULL)
185		return (NULL);
186
187	if (weight == 0)
188		rp->red_weight = W_WEIGHT;
189	else
190		rp->red_weight = weight;
191
192	/* allocate weight table */
193	rp->red_wtab = wtab_alloc(rp->red_weight);
194	if (rp->red_wtab == NULL) {
195		free(rp, M_DEVBUF);
196		return (NULL);
197	}
198
199	rp->red_avg = 0;
200	rp->red_idle = 1;
201
202	if (inv_pmax == 0)
203		rp->red_inv_pmax = default_inv_pmax;
204	else
205		rp->red_inv_pmax = inv_pmax;
206	if (th_min == 0)
207		rp->red_thmin = default_th_min;
208	else
209		rp->red_thmin = th_min;
210	if (th_max == 0)
211		rp->red_thmax = default_th_max;
212	else
213		rp->red_thmax = th_max;
214
215	rp->red_flags = flags;
216
217	if (pkttime == 0)
218		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
219		rp->red_pkttime = 800;
220	else
221		rp->red_pkttime = pkttime;
222
223	if (weight == 0) {
224		/* when the link is very slow, adjust red parameters */
225		npkts_per_sec = 1000000 / rp->red_pkttime;
226		if (npkts_per_sec < 50) {
227			/* up to about 400Kbps */
228			rp->red_weight = W_WEIGHT_2;
229		} else if (npkts_per_sec < 300) {
230			/* up to about 2.4Mbps */
231			rp->red_weight = W_WEIGHT_1;
232		}
233	}
234
235	/* calculate wshift.  weight must be power of 2 */
236	w = rp->red_weight;
237	for (i = 0; w > 1; i++)
238		w = w >> 1;
239	rp->red_wshift = i;
240	w = 1 << rp->red_wshift;
241	if (w != rp->red_weight) {
242		printf("invalid weight value %d for red! use %d\n",
243		       rp->red_weight, w);
244		rp->red_weight = w;
245	}
246
247	/*
248	 * thmin_s and thmax_s are scaled versions of th_min and th_max
249	 * to be compared with avg.
250	 */
251	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
252	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
253
254	/*
255	 * precompute probability denominator
256	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
257	 */
258	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
259			 * rp->red_inv_pmax) << FP_SHIFT;
260
261	microtime(&rp->red_last);
262	return (rp);
263}
264
265void
266red_destroy(red_t *rp)
267{
268	wtab_destroy(rp->red_wtab);
269	free(rp, M_DEVBUF);
270}
271
272void
273red_getstats(red_t *rp, struct redstats *sp)
274{
275	sp->q_avg		= rp->red_avg >> rp->red_wshift;
276	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
277	sp->drop_cnt		= rp->red_stats.drop_cnt;
278	sp->drop_forced		= rp->red_stats.drop_forced;
279	sp->drop_unforced	= rp->red_stats.drop_unforced;
280	sp->marked_packets	= rp->red_stats.marked_packets;
281}
282
283int
284red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
285    struct altq_pktattr *pktattr)
286{
287	int avg, droptype;
288	int n;
289
290	avg = rp->red_avg;
291
292	/*
293	 * if we were idle, we pretend that n packets arrived during
294	 * the idle period.
295	 */
296	if (rp->red_idle) {
297		struct timeval now;
298		int t;
299
300		rp->red_idle = 0;
301		microtime(&now);
302		t = (now.tv_sec - rp->red_last.tv_sec);
303		if (t > 60) {
304			/*
305			 * being idle for more than 1 minute, set avg to zero.
306			 * this prevents t from overflow.
307			 */
308			avg = 0;
309		} else {
310			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
311			n = t / rp->red_pkttime - 1;
312
313			/* the following line does (avg = (1 - Wq)^n * avg) */
314			if (n > 0)
315				avg = (avg >> FP_SHIFT) *
316				    pow_w(rp->red_wtab, n);
317		}
318	}
319
320	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
321	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
322	rp->red_avg = avg;		/* save the new value */
323
324	/*
325	 * red_count keeps a tally of arriving traffic that has not
326	 * been dropped.
327	 */
328	rp->red_count++;
329
330	/* see if we drop early */
331	droptype = DTYPE_NODROP;
332	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
333		if (avg >= rp->red_thmax_s) {
334			/* avg >= th_max: forced drop */
335			droptype = DTYPE_FORCED;
336		} else if (rp->red_old == 0) {
337			/* first exceeds th_min */
338			rp->red_count = 1;
339			rp->red_old = 1;
340		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
341				      rp->red_probd, rp->red_count)) {
342			/* mark or drop by red */
343			if ((rp->red_flags & REDF_ECN) &&
344			    mark_ecn(m, pktattr, rp->red_flags)) {
345				/* successfully marked.  do not drop. */
346				rp->red_count = 0;
347#ifdef RED_STATS
348				rp->red_stats.marked_packets++;
349#endif
350			} else {
351				/* unforced drop by red */
352				droptype = DTYPE_EARLY;
353			}
354		}
355	} else {
356		/* avg < th_min */
357		rp->red_old = 0;
358	}
359
360	/*
361	 * if the queue length hits the hard limit, it's a forced drop.
362	 */
363	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
364		droptype = DTYPE_FORCED;
365
366#ifdef RED_RANDOM_DROP
367	/* if successful or forced drop, enqueue this packet. */
368	if (droptype != DTYPE_EARLY)
369		_addq(q, m);
370#else
371	/* if successful, enqueue this packet. */
372	if (droptype == DTYPE_NODROP)
373		_addq(q, m);
374#endif
375	if (droptype != DTYPE_NODROP) {
376		if (droptype == DTYPE_EARLY) {
377			/* drop the incoming packet */
378#ifdef RED_STATS
379			rp->red_stats.drop_unforced++;
380#endif
381		} else {
382			/* forced drop, select a victim packet in the queue. */
383#ifdef RED_RANDOM_DROP
384			m = _getq_random(q);
385#endif
386#ifdef RED_STATS
387			rp->red_stats.drop_forced++;
388#endif
389		}
390#ifdef RED_STATS
391		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
392#endif
393		rp->red_count = 0;
394		m_freem(m);
395		return (-1);
396	}
397	/* successfully queued */
398#ifdef RED_STATS
399	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
400#endif
401	return (0);
402}
403
404/*
405 * early-drop probability is calculated as follows:
406 *   prob = p_max * (avg - th_min) / (th_max - th_min)
407 *   prob_a = prob / (2 - count*prob)
408 *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
409 * here prob_a increases as successive undrop count increases.
410 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
411 * becomes 1 when (count >= (2 / prob))).
412 */
413int
414drop_early(int fp_len, int fp_probd, int count)
415{
416	int	d;		/* denominator of drop-probability */
417
418	d = fp_probd - count * fp_len;
419	if (d <= 0)
420		/* count exceeds the hard limit: drop or mark */
421		return (1);
422
423	/*
424	 * now the range of d is [1..600] in fixed-point. (when
425	 * th_max-th_min=10 and p_max=1/30)
426	 * drop probability = (avg - TH_MIN) / d
427	 */
428
429	if ((arc4random() % d) < fp_len) {
430		/* drop or mark */
431		return (1);
432	}
433	/* no drop/mark */
434	return (0);
435}
436
437/*
438 * try to mark CE bit to the packet.
439 *    returns 1 if successfully marked, 0 otherwise.
440 */
441int
442mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
443{
444	struct mbuf	*m0;
445	struct pf_mtag	*at;
446	void		*hdr;
447
448	at = pf_find_mtag(m);
449	if (at != NULL) {
450		hdr = at->hdr;
451	} else
452		return (0);
453
454	/* verify that pattr_hdr is within the mbuf data */
455	for (m0 = m; m0 != NULL; m0 = m0->m_next)
456		if (((caddr_t)hdr >= m0->m_data) &&
457		    ((caddr_t)hdr < m0->m_data + m0->m_len))
458			break;
459	if (m0 == NULL) {
460		/* ick, tag info is stale */
461		return (0);
462	}
463
464	switch (((struct ip *)hdr)->ip_v) {
465	case IPVERSION:
466		if (flags & REDF_ECN4) {
467			struct ip *ip = hdr;
468			u_int8_t otos;
469			int sum;
470
471			if (ip->ip_v != 4)
472				return (0);	/* version mismatch! */
473
474			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
475				return (0);	/* not-ECT */
476			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
477				return (1);	/* already marked */
478
479			/*
480			 * ecn-capable but not marked,
481			 * mark CE and update checksum
482			 */
483			otos = ip->ip_tos;
484			ip->ip_tos |= IPTOS_ECN_CE;
485			/*
486			 * update checksum (from RFC1624)
487			 *	   HC' = ~(~HC + ~m + m')
488			 */
489			sum = ~ntohs(ip->ip_sum) & 0xffff;
490			sum += (~otos & 0xffff) + ip->ip_tos;
491			sum = (sum >> 16) + (sum & 0xffff);
492			sum += (sum >> 16);  /* add carry */
493			ip->ip_sum = htons(~sum & 0xffff);
494			return (1);
495		}
496		break;
497#ifdef INET6
498	case (IPV6_VERSION >> 4):
499		if (flags & REDF_ECN6) {
500			struct ip6_hdr *ip6 = hdr;
501			u_int32_t flowlabel;
502
503			flowlabel = ntohl(ip6->ip6_flow);
504			if ((flowlabel >> 28) != 6)
505				return (0);	/* version mismatch! */
506			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
507			    (IPTOS_ECN_NOTECT << 20))
508				return (0);	/* not-ECT */
509			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
510			    (IPTOS_ECN_CE << 20))
511				return (1);	/* already marked */
512			/*
513			 * ecn-capable but not marked,  mark CE
514			 */
515			flowlabel |= (IPTOS_ECN_CE << 20);
516			ip6->ip6_flow = htonl(flowlabel);
517			return (1);
518		}
519		break;
520#endif  /* INET6 */
521	}
522
523	/* not marked */
524	return (0);
525}
526
527struct mbuf *
528red_getq(rp, q)
529	red_t *rp;
530	class_queue_t *q;
531{
532	struct mbuf *m;
533
534	if ((m = _getq(q)) == NULL) {
535		if (rp->red_idle == 0) {
536			rp->red_idle = 1;
537			microtime(&rp->red_last);
538		}
539		return NULL;
540	}
541
542	rp->red_idle = 0;
543	return (m);
544}
545
546/*
547 * helper routine to calibrate avg during idle.
548 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
549 * here Wq = 1/weight and the code assumes Wq is close to zero.
550 *
551 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
552 */
553static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
554
555struct wtab *
556wtab_alloc(int weight)
557{
558	struct wtab	*w;
559	int		 i;
560
561	for (w = wtab_list; w != NULL; w = w->w_next)
562		if (w->w_weight == weight) {
563			w->w_refcount++;
564			return (w);
565		}
566
567	w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO);
568	if (w == NULL)
569		return (NULL);
570	w->w_weight = weight;
571	w->w_refcount = 1;
572	w->w_next = wtab_list;
573	wtab_list = w;
574
575	/* initialize the weight table */
576	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
577	for (i = 1; i < 32; i++) {
578		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
579		if (w->w_tab[i] == 0 && w->w_param_max == 0)
580			w->w_param_max = 1 << i;
581	}
582
583	return (w);
584}
585
586int
587wtab_destroy(struct wtab *w)
588{
589	struct wtab	*prev;
590
591	if (--w->w_refcount > 0)
592		return (0);
593
594	if (wtab_list == w)
595		wtab_list = w->w_next;
596	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
597		if (prev->w_next == w) {
598			prev->w_next = w->w_next;
599			break;
600		}
601
602	free(w, M_DEVBUF);
603	return (0);
604}
605
606int32_t
607pow_w(struct wtab *w, int n)
608{
609	int	i, bit;
610	int32_t	val;
611
612	if (n >= w->w_param_max)
613		return (0);
614
615	val = 1 << FP_SHIFT;
616	if (n <= 0)
617		return (val);
618
619	bit = 1;
620	i = 0;
621	while (n) {
622		if (n & bit) {
623			val = (val * w->w_tab[i]) >> FP_SHIFT;
624			n &= ~bit;
625		}
626		i++;
627		bit <<=  1;
628	}
629	return (val);
630}
631
632
633#endif /* ALTQ_RED */
634