blob: a1cec18dd49c61c20df93f289ddad9d562b63a42 [file] [log] [blame]
stephen hemminger45e14432011-02-02 15:21:10 +00001/*
2 * net/sched/sch_choke.c CHOKE scheduler
3 *
4 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
5 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/skbuff.h>
17#include <linux/reciprocal_div.h>
18#include <net/pkt_sched.h>
19#include <net/inet_ecn.h>
20#include <net/red.h>
21#include <linux/ip.h>
22#include <net/ip.h>
23#include <linux/ipv6.h>
24#include <net/ipv6.h>
25
26/*
27 CHOKe stateless AQM for fair bandwidth allocation
28 =================================================
29
30 CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
31 unresponsive flows) is a variant of RED that penalizes misbehaving flows but
32 maintains no flow state. The difference from RED is an additional step
33 during the enqueuing process. If average queue size is over the
34 low threshold (qmin), a packet is chosen at random from the queue.
35 If both the new and chosen packet are from the same flow, both
36 are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
37 needs to access packets in queue randomly. It has a minimal class
38 interface to allow overriding the builtin flow classifier with
39 filters.
40
41 Source:
42 R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
43 Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
44 IEEE INFOCOM, 2000.
45
46 A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
47 Characteristics", IEEE/ACM Transactions on Networking, 2004
48
49 */
50
51/* Upper bound on size of sk_buff table (packets) */
52#define CHOKE_MAX_QUEUE (128*1024 - 1)
53
54struct choke_sched_data {
55/* Parameters */
56 u32 limit;
57 unsigned char flags;
58
59 struct red_parms parms;
60
61/* Variables */
62 struct tcf_proto *filter_list;
63 struct {
64 u32 prob_drop; /* Early probability drops */
65 u32 prob_mark; /* Early probability marks */
66 u32 forced_drop; /* Forced drops, qavg > max_thresh */
67 u32 forced_mark; /* Forced marks, qavg > max_thresh */
68 u32 pdrop; /* Drops due to queue limits */
69 u32 other; /* Drops due to drop() calls */
70 u32 matched; /* Drops to flow match */
71 } stats;
72
73 unsigned int head;
74 unsigned int tail;
75
76 unsigned int tab_mask; /* size - 1 */
77
78 struct sk_buff **tab;
79};
80
81/* deliver a random number between 0 and N - 1 */
82static u32 random_N(unsigned int N)
83{
84 return reciprocal_divide(random32(), N);
85}
86
87/* number of elements in queue including holes */
88static unsigned int choke_len(const struct choke_sched_data *q)
89{
90 return (q->tail - q->head) & q->tab_mask;
91}
92
93/* Is ECN parameter configured */
94static int use_ecn(const struct choke_sched_data *q)
95{
96 return q->flags & TC_RED_ECN;
97}
98
99/* Should packets over max just be dropped (versus marked) */
100static int use_harddrop(const struct choke_sched_data *q)
101{
102 return q->flags & TC_RED_HARDDROP;
103}
104
105/* Move head pointer forward to skip over holes */
106static void choke_zap_head_holes(struct choke_sched_data *q)
107{
108 do {
109 q->head = (q->head + 1) & q->tab_mask;
110 if (q->head == q->tail)
111 break;
112 } while (q->tab[q->head] == NULL);
113}
114
115/* Move tail pointer backwards to reuse holes */
116static void choke_zap_tail_holes(struct choke_sched_data *q)
117{
118 do {
119 q->tail = (q->tail - 1) & q->tab_mask;
120 if (q->head == q->tail)
121 break;
122 } while (q->tab[q->tail] == NULL);
123}
124
125/* Drop packet from queue array by creating a "hole" */
126static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
127{
128 struct choke_sched_data *q = qdisc_priv(sch);
129 struct sk_buff *skb = q->tab[idx];
130
131 q->tab[idx] = NULL;
132
133 if (idx == q->head)
134 choke_zap_head_holes(q);
135 if (idx == q->tail)
136 choke_zap_tail_holes(q);
137
138 sch->qstats.backlog -= qdisc_pkt_len(skb);
139 qdisc_drop(skb, sch);
140 qdisc_tree_decrease_qlen(sch, 1);
141 --sch->q.qlen;
142}
143
144/*
145 * Compare flow of two packets
146 * Returns true only if source and destination address and port match.
147 * false for special cases
148 */
149static bool choke_match_flow(struct sk_buff *skb1,
150 struct sk_buff *skb2)
151{
152 int off1, off2, poff;
153 const u32 *ports1, *ports2;
154 u8 ip_proto;
155 __u32 hash1;
156
157 if (skb1->protocol != skb2->protocol)
158 return false;
159
160 /* Use hash value as quick check
161 * Assumes that __skb_get_rxhash makes IP header and ports linear
162 */
163 hash1 = skb_get_rxhash(skb1);
164 if (!hash1 || hash1 != skb_get_rxhash(skb2))
165 return false;
166
167 /* Probably match, but be sure to avoid hash collisions */
168 off1 = skb_network_offset(skb1);
169 off2 = skb_network_offset(skb2);
170
171 switch (skb1->protocol) {
172 case __constant_htons(ETH_P_IP): {
173 const struct iphdr *ip1, *ip2;
174
175 ip1 = (const struct iphdr *) (skb1->data + off1);
176 ip2 = (const struct iphdr *) (skb2->data + off2);
177
178 ip_proto = ip1->protocol;
179 if (ip_proto != ip2->protocol ||
180 ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
181 return false;
182
183 if ((ip1->frag_off | ip2->frag_off) & htons(IP_MF | IP_OFFSET))
184 ip_proto = 0;
185 off1 += ip1->ihl * 4;
186 off2 += ip2->ihl * 4;
187 break;
188 }
189
190 case __constant_htons(ETH_P_IPV6): {
191 const struct ipv6hdr *ip1, *ip2;
192
193 ip1 = (const struct ipv6hdr *) (skb1->data + off1);
194 ip2 = (const struct ipv6hdr *) (skb2->data + off2);
195
196 ip_proto = ip1->nexthdr;
197 if (ip_proto != ip2->nexthdr ||
198 ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) ||
199 ipv6_addr_cmp(&ip1->daddr, &ip2->daddr))
200 return false;
201 off1 += 40;
202 off2 += 40;
203 }
204
205 default: /* Maybe compare MAC header here? */
206 return false;
207 }
208
209 poff = proto_ports_offset(ip_proto);
210 if (poff < 0)
211 return true;
212
213 off1 += poff;
214 off2 += poff;
215
216 ports1 = (__force u32 *)(skb1->data + off1);
217 ports2 = (__force u32 *)(skb2->data + off2);
218 return *ports1 == *ports2;
219}
220
221static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
222{
223 *(unsigned int *)(qdisc_skb_cb(skb)->data) = classid;
224}
225
226static u16 choke_get_classid(const struct sk_buff *skb)
227{
228 return *(unsigned int *)(qdisc_skb_cb(skb)->data);
229}
230
231/*
232 * Classify flow using either:
233 * 1. pre-existing classification result in skb
234 * 2. fast internal classification
235 * 3. use TC filter based classification
236 */
237static bool choke_classify(struct sk_buff *skb,
238 struct Qdisc *sch, int *qerr)
239
240{
241 struct choke_sched_data *q = qdisc_priv(sch);
242 struct tcf_result res;
243 int result;
244
245 result = tc_classify(skb, q->filter_list, &res);
246 if (result >= 0) {
247#ifdef CONFIG_NET_CLS_ACT
248 switch (result) {
249 case TC_ACT_STOLEN:
250 case TC_ACT_QUEUED:
251 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
252 case TC_ACT_SHOT:
253 return false;
254 }
255#endif
256 choke_set_classid(skb, TC_H_MIN(res.classid));
257 return true;
258 }
259
260 return false;
261}
262
263/*
264 * Select a packet at random from queue
265 * HACK: since queue can have holes from previous deletion; retry several
266 * times to find a random skb but then just give up and return the head
267 * Will return NULL if queue is empty (q->head == q->tail)
268 */
269static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
270 unsigned int *pidx)
271{
272 struct sk_buff *skb;
273 int retrys = 3;
274
275 do {
276 *pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
277 skb = q->tab[*pidx];
278 if (skb)
279 return skb;
280 } while (--retrys > 0);
281
282 return q->tab[*pidx = q->head];
283}
284
285/*
286 * Compare new packet with random packet in queue
287 * returns true if matched and sets *pidx
288 */
289static bool choke_match_random(const struct choke_sched_data *q,
290 struct sk_buff *nskb,
291 unsigned int *pidx)
292{
293 struct sk_buff *oskb;
294
295 if (q->head == q->tail)
296 return false;
297
298 oskb = choke_peek_random(q, pidx);
299 if (q->filter_list)
300 return choke_get_classid(nskb) == choke_get_classid(oskb);
301
302 return choke_match_flow(oskb, nskb);
303}
304
305static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
306{
307 struct choke_sched_data *q = qdisc_priv(sch);
308 struct red_parms *p = &q->parms;
309 int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
310
311 if (q->filter_list) {
312 /* If using external classifiers, get result and record it. */
313 if (!choke_classify(skb, sch, &ret))
314 goto other_drop; /* Packet was eaten by filter */
315 }
316
317 /* Compute average queue usage (see RED) */
318 p->qavg = red_calc_qavg(p, sch->q.qlen);
319 if (red_is_idling(p))
320 red_end_of_idle_period(p);
321
322 /* Is queue small? */
323 if (p->qavg <= p->qth_min)
324 p->qcount = -1;
325 else {
326 unsigned int idx;
327
328 /* Draw a packet at random from queue and compare flow */
329 if (choke_match_random(q, skb, &idx)) {
330 q->stats.matched++;
331 choke_drop_by_idx(sch, idx);
332 goto congestion_drop;
333 }
334
335 /* Queue is large, always mark/drop */
336 if (p->qavg > p->qth_max) {
337 p->qcount = -1;
338
339 sch->qstats.overlimits++;
340 if (use_harddrop(q) || !use_ecn(q) ||
341 !INET_ECN_set_ce(skb)) {
342 q->stats.forced_drop++;
343 goto congestion_drop;
344 }
345
346 q->stats.forced_mark++;
347 } else if (++p->qcount) {
348 if (red_mark_probability(p, p->qavg)) {
349 p->qcount = 0;
350 p->qR = red_random(p);
351
352 sch->qstats.overlimits++;
353 if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
354 q->stats.prob_drop++;
355 goto congestion_drop;
356 }
357
358 q->stats.prob_mark++;
359 }
360 } else
361 p->qR = red_random(p);
362 }
363
364 /* Admit new packet */
365 if (sch->q.qlen < q->limit) {
366 q->tab[q->tail] = skb;
367 q->tail = (q->tail + 1) & q->tab_mask;
368 ++sch->q.qlen;
369 sch->qstats.backlog += qdisc_pkt_len(skb);
370 return NET_XMIT_SUCCESS;
371 }
372
373 q->stats.pdrop++;
374 sch->qstats.drops++;
375 kfree_skb(skb);
376 return NET_XMIT_DROP;
377
378 congestion_drop:
379 qdisc_drop(skb, sch);
380 return NET_XMIT_CN;
381
382 other_drop:
383 if (ret & __NET_XMIT_BYPASS)
384 sch->qstats.drops++;
385 kfree_skb(skb);
386 return ret;
387}
388
389static struct sk_buff *choke_dequeue(struct Qdisc *sch)
390{
391 struct choke_sched_data *q = qdisc_priv(sch);
392 struct sk_buff *skb;
393
394 if (q->head == q->tail) {
395 if (!red_is_idling(&q->parms))
396 red_start_of_idle_period(&q->parms);
397 return NULL;
398 }
399
400 skb = q->tab[q->head];
401 q->tab[q->head] = NULL;
402 choke_zap_head_holes(q);
403 --sch->q.qlen;
404 sch->qstats.backlog -= qdisc_pkt_len(skb);
405 qdisc_bstats_update(sch, skb);
406
407 return skb;
408}
409
410static unsigned int choke_drop(struct Qdisc *sch)
411{
412 struct choke_sched_data *q = qdisc_priv(sch);
413 unsigned int len;
414
415 len = qdisc_queue_drop(sch);
416 if (len > 0)
417 q->stats.other++;
418 else {
419 if (!red_is_idling(&q->parms))
420 red_start_of_idle_period(&q->parms);
421 }
422
423 return len;
424}
425
426static void choke_reset(struct Qdisc *sch)
427{
428 struct choke_sched_data *q = qdisc_priv(sch);
429
430 red_restart(&q->parms);
431}
432
433static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
434 [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) },
435 [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE },
436};
437
438
439static void choke_free(void *addr)
440{
441 if (addr) {
442 if (is_vmalloc_addr(addr))
443 vfree(addr);
444 else
445 kfree(addr);
446 }
447}
448
449static int choke_change(struct Qdisc *sch, struct nlattr *opt)
450{
451 struct choke_sched_data *q = qdisc_priv(sch);
452 struct nlattr *tb[TCA_CHOKE_MAX + 1];
453 const struct tc_red_qopt *ctl;
454 int err;
455 struct sk_buff **old = NULL;
456 unsigned int mask;
457
458 if (opt == NULL)
459 return -EINVAL;
460
461 err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
462 if (err < 0)
463 return err;
464
465 if (tb[TCA_CHOKE_PARMS] == NULL ||
466 tb[TCA_CHOKE_STAB] == NULL)
467 return -EINVAL;
468
469 ctl = nla_data(tb[TCA_CHOKE_PARMS]);
470
471 if (ctl->limit > CHOKE_MAX_QUEUE)
472 return -EINVAL;
473
474 mask = roundup_pow_of_two(ctl->limit + 1) - 1;
475 if (mask != q->tab_mask) {
476 struct sk_buff **ntab;
477
478 ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
479 if (!ntab)
480 ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
481 if (!ntab)
482 return -ENOMEM;
483
484 sch_tree_lock(sch);
485 old = q->tab;
486 if (old) {
487 unsigned int oqlen = sch->q.qlen, tail = 0;
488
489 while (q->head != q->tail) {
490 struct sk_buff *skb = q->tab[q->head];
491
492 q->head = (q->head + 1) & q->tab_mask;
493 if (!skb)
494 continue;
495 if (tail < mask) {
496 ntab[tail++] = skb;
497 continue;
498 }
499 sch->qstats.backlog -= qdisc_pkt_len(skb);
500 --sch->q.qlen;
501 qdisc_drop(skb, sch);
502 }
503 qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
504 q->head = 0;
505 q->tail = tail;
506 }
507
508 q->tab_mask = mask;
509 q->tab = ntab;
510 } else
511 sch_tree_lock(sch);
512
513 q->flags = ctl->flags;
514 q->limit = ctl->limit;
515
516 red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
517 ctl->Plog, ctl->Scell_log,
518 nla_data(tb[TCA_CHOKE_STAB]));
519
520 if (q->head == q->tail)
521 red_end_of_idle_period(&q->parms);
522
523 sch_tree_unlock(sch);
524 choke_free(old);
525 return 0;
526}
527
528static int choke_init(struct Qdisc *sch, struct nlattr *opt)
529{
530 return choke_change(sch, opt);
531}
532
533static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
534{
535 struct choke_sched_data *q = qdisc_priv(sch);
536 struct nlattr *opts = NULL;
537 struct tc_red_qopt opt = {
538 .limit = q->limit,
539 .flags = q->flags,
540 .qth_min = q->parms.qth_min >> q->parms.Wlog,
541 .qth_max = q->parms.qth_max >> q->parms.Wlog,
542 .Wlog = q->parms.Wlog,
543 .Plog = q->parms.Plog,
544 .Scell_log = q->parms.Scell_log,
545 };
546
547 opts = nla_nest_start(skb, TCA_OPTIONS);
548 if (opts == NULL)
549 goto nla_put_failure;
550
551 NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
552 return nla_nest_end(skb, opts);
553
554nla_put_failure:
555 nla_nest_cancel(skb, opts);
556 return -EMSGSIZE;
557}
558
559static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
560{
561 struct choke_sched_data *q = qdisc_priv(sch);
562 struct tc_choke_xstats st = {
563 .early = q->stats.prob_drop + q->stats.forced_drop,
564 .marked = q->stats.prob_mark + q->stats.forced_mark,
565 .pdrop = q->stats.pdrop,
566 .other = q->stats.other,
567 .matched = q->stats.matched,
568 };
569
570 return gnet_stats_copy_app(d, &st, sizeof(st));
571}
572
573static void choke_destroy(struct Qdisc *sch)
574{
575 struct choke_sched_data *q = qdisc_priv(sch);
576
577 tcf_destroy_chain(&q->filter_list);
578 choke_free(q->tab);
579}
580
581static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
582{
583 return NULL;
584}
585
586static unsigned long choke_get(struct Qdisc *sch, u32 classid)
587{
588 return 0;
589}
590
591static void choke_put(struct Qdisc *q, unsigned long cl)
592{
593}
594
595static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
596 u32 classid)
597{
598 return 0;
599}
600
601static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
602{
603 struct choke_sched_data *q = qdisc_priv(sch);
604
605 if (cl)
606 return NULL;
607 return &q->filter_list;
608}
609
610static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
611 struct sk_buff *skb, struct tcmsg *tcm)
612{
613 tcm->tcm_handle |= TC_H_MIN(cl);
614 return 0;
615}
616
617static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
618{
619 if (!arg->stop) {
620 if (arg->fn(sch, 1, arg) < 0) {
621 arg->stop = 1;
622 return;
623 }
624 arg->count++;
625 }
626}
627
628static const struct Qdisc_class_ops choke_class_ops = {
629 .leaf = choke_leaf,
630 .get = choke_get,
631 .put = choke_put,
632 .tcf_chain = choke_find_tcf,
633 .bind_tcf = choke_bind,
634 .unbind_tcf = choke_put,
635 .dump = choke_dump_class,
636 .walk = choke_walk,
637};
638
639static struct sk_buff *choke_peek_head(struct Qdisc *sch)
640{
641 struct choke_sched_data *q = qdisc_priv(sch);
642
643 return (q->head != q->tail) ? q->tab[q->head] : NULL;
644}
645
646static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
647 .id = "choke",
648 .priv_size = sizeof(struct choke_sched_data),
649
650 .enqueue = choke_enqueue,
651 .dequeue = choke_dequeue,
652 .peek = choke_peek_head,
653 .drop = choke_drop,
654 .init = choke_init,
655 .destroy = choke_destroy,
656 .reset = choke_reset,
657 .change = choke_change,
658 .dump = choke_dump,
659 .dump_stats = choke_dump_stats,
660 .owner = THIS_MODULE,
661};
662
663static int __init choke_module_init(void)
664{
665 return register_qdisc(&choke_qdisc_ops);
666}
667
668static void __exit choke_module_exit(void)
669{
670 unregister_qdisc(&choke_qdisc_ops);
671}
672
673module_init(choke_module_init)
674module_exit(choke_module_exit)
675
676MODULE_LICENSE("GPL");