1/*
2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13#include <linux/module.h>
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/in.h>
24#include <linux/errno.h>
25#include <linux/interrupt.h>
26#include <linux/if_ether.h>
27#include <linux/inet.h>
28#include <linux/netdevice.h>
29#include <linux/etherdevice.h>
30#include <linux/notifier.h>
31#include <net/ip.h>
32#include <net/route.h>
33#include <linux/skbuff.h>
34#include <net/sock.h>
35#include <net/pkt_sched.h>
36
37
38/* Class-Based Queueing (CBQ) algorithm.
39 =======================================
40
41 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
42		 Management Models for Packet Networks",
43		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
44
45		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
46
47		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
48		     Parameters", 1996
49
50 [4] Sally Floyd and Michael Speer, "Experimental Results
51 for Class-Based Queueing", 1998, not published.
52
53 -----------------------------------------------------------------------
54
55 Algorithm skeleton was taken from NS simulator cbq.cc.
56 If someone wants to check this code against the LBL version,
57 he should take into account that ONLY the skeleton was borrowed;
58 the implementation is different. In particular:
59
60 --- The WRR algorithm is different. Our version looks more
61	 reasonable (I hope) and works when quanta are allowed to be
62 less than MTU, which is always the case when real time classes
63	 have small rates. Note that the statement of [3] is
64	 incomplete; delay may actually be estimated even if the class
65 per-round allotment is less than MTU. Namely, if per-round
66 allotment is W*r_i, and r_1+...+r_k = r < 1
67
68 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
69
70 In the worst case we have IntServ estimate with D = W*r+k*MTU
71 and C = MTU*r. The proof (if correct at all) is trivial.
72
73
74 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
75 interpret some places, which look like wrong translations
76 from NS. Anyone is advised to find these differences
77	 and explain to me why I am wrong 8).
78
79 --- Linux has no EOI event, so that we cannot estimate true class
80	 idle time. The workaround is to consider the next dequeue event
81	 as a sign that the previous packet has finished. This is wrong because of
82 internal device queueing, but on a permanently loaded link it is true.
83 Moreover, combined with clock integrator, this scheme looks
84 very close to an ideal solution. */
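/* Illustrative sketch, not part of the original file: the worst-case delay
 * bound quoted above, written out as a small user-space helper.  Assumptions:
 * W is the per-round WRR allotment scale, r_i the relative rate of class i,
 * r = r_1 + ... + r_k < 1 the sum over the k real-time classes, B the link
 * bandwidth in bytes/sec, MTU in bytes, and "[x]" is read as rounding up.
 * All names below are editorial, not kernel symbols.
 */
#include <math.h>

static double cbq_wrr_delay_bound(double mtu, double W, double r_i,
				  double r, int k, double B)
{
	double rounds = ceil(mtu / (W * r_i));	/* WRR rounds to send one MTU */

	/* delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B */
	return (rounds * W * r + W * r + k * mtu) / B;
}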
85
86struct cbq_sched_data;
87
88
89struct cbq_class
90{
91 struct cbq_class *next; /* hash table link */
92 struct cbq_class *next_alive; /* next class with backlog in this priority band */
93
94/* Parameters */
95 u32 classid;
96 unsigned char priority; /* class priority */
97 unsigned char priority2; /* priority to be used after overlimit */
98 unsigned char ewma_log; /* time constant for idle time calculation */
99 unsigned char ovl_strategy;
100#ifdef CONFIG_NET_CLS_POLICE
101 unsigned char police;
102#endif
103
104 u32 defmap;
105
106 /* Link-sharing scheduler parameters */
107 long maxidle; /* Class parameters: see below. */
108 long offtime;
109 long minidle;
110 u32 avpkt;
111 struct qdisc_rate_table *R_tab;
112
113 /* Overlimit strategy parameters */
114 void (*overlimit)(struct cbq_class *cl);
115	psched_tdiff_t		penalty;
116
117 /* General scheduler (WRR) parameters */
118 long allot;
119 long quantum; /* Allotment per WRR round */
120 long weight; /* Relative allotment: see below */
121
122 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
123 struct cbq_class *split; /* Ptr to split node */
124 struct cbq_class *share; /* Ptr to LS parent in the class tree */
125 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
126 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
127 parent otherwise */
128 struct cbq_class *sibling; /* Sibling chain */
129 struct cbq_class *children; /* Pointer to children chain */
130
131 struct Qdisc *q; /* Elementary queueing discipline */
132
133
134/* Variables */
135 unsigned char cpriority; /* Effective priority */
136 unsigned char delayed;
137 unsigned char level; /* level of the class in hierarchy:
138 0 for leaf classes, and maximal
139 level of children + 1 for nodes.
140 */
141
142 psched_time_t last; /* Last end of service */
143 psched_time_t undertime;
144 long avgidle;
145 long deficit; /* Saved deficit for WRR */
146	psched_time_t		penalized;
147	struct gnet_stats_basic bstats;
148 struct gnet_stats_queue qstats;
149 struct gnet_stats_rate_est rate_est;
150 spinlock_t *stats_lock;
151 struct tc_cbq_xstats xstats;
152
153 struct tcf_proto *filter_list;
154
155 int refcnt;
156 int filters;
157
158 struct cbq_class *defaults[TC_PRIO_MAX+1];
159};
160
161struct cbq_sched_data
162{
163 struct cbq_class *classes[16]; /* Hash table of all classes */
164 int nclasses[TC_CBQ_MAXPRIO+1];
165 unsigned quanta[TC_CBQ_MAXPRIO+1];
166
167 struct cbq_class link;
168
169 unsigned activemask;
170 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes
171 with backlog */
172
173#ifdef CONFIG_NET_CLS_POLICE
174 struct cbq_class *rx_class;
175#endif
176 struct cbq_class *tx_class;
177 struct cbq_class *tx_borrowed;
178 int tx_len;
179 psched_time_t now; /* Cached timestamp */
180 psched_time_t now_rt; /* Cached real time */
181 unsigned pmask;
182
183	struct hrtimer		delay_timer;
184	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
185						   started when CBQ has
186 backlog, but cannot
187 transmit just now */
188	psched_tdiff_t		wd_expires;
189	int			toplevel;
190 u32 hgenerator;
191};
192
193
194#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
195
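/* Illustrative sketch, not part of the original file: what the L2T() macro
 * above computes.  A qdisc rate table maps a packet length to its
 * transmission time by indexing a 256-entry table with the length shifted
 * right by cell_log, so each table "cell" covers 2^cell_log bytes.  The
 * table contents and cell_log value here are stand-ins, not real kernel data.
 */
static unsigned int example_len_to_time(const unsigned int time_table[256],
					unsigned int cell_log,
					unsigned int len)
{
	return time_table[len >> cell_log];	/* same lookup as L2T(cl, len) */
}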
196
197static __inline__ unsigned cbq_hash(u32 h)
198{
199 h ^= h>>8;
200 h ^= h>>4;
201 return h&0xF;
202}
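/* Illustrative sketch, not part of the original file: cbq_hash() above
 * XOR-folds a 32-bit class handle down to 4 bits, selecting one of the 16
 * buckets of q->classes[].  The handle below is just an example value.
 */
static unsigned example_classid_bucket(void)
{
	unsigned int h = 0x00010014;	/* tc handle 1:14 */

	h ^= h >> 8;			/* 0x00010114 */
	h ^= h >> 4;			/* 0x00011105 */
	return h & 0xF;			/* bucket 5   */
}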
203
204static __inline__ struct cbq_class *
205cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
206{
207 struct cbq_class *cl;
208
209 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
210 if (cl->classid == classid)
211 return cl;
212 return NULL;
213}
214
215#ifdef CONFIG_NET_CLS_POLICE
216
217static struct cbq_class *
218cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
219{
220 struct cbq_class *cl, *new;
221
222 for (cl = this->tparent; cl; cl = cl->tparent)
223 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
224 return new;
225
226 return NULL;
227}
228
229#endif
230
231/* Classify packet. The procedure is pretty complicated, but
232 it allows us to combine link sharing and priority scheduling
233 transparently.
234
235   Namely, you can put link sharing rules (f.e. route based) at the root of CBQ,
236 so that it resolves to split nodes. Then packets are classified
237 by logical priority, or a more specific classifier may be attached
238 to the split node.
239 */
240
241static struct cbq_class *
242cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
243{
244 struct cbq_sched_data *q = qdisc_priv(sch);
245 struct cbq_class *head = &q->link;
246 struct cbq_class **defmap;
247 struct cbq_class *cl = NULL;
248 u32 prio = skb->priority;
249 struct tcf_result res;
250
251 /*
252 * Step 1. If skb->priority points to one of our classes, use it.
253 */
254 if (TC_H_MAJ(prio^sch->handle) == 0 &&
255 (cl = cbq_class_lookup(q, prio)) != NULL)
256 return cl;
257
258	*qerr = NET_XMIT_BYPASS;
259	for (;;) {
260 int result = 0;
261 defmap = head->defaults;
262
263 /*
264 * Step 2+n. Apply classifier.
265 */
266 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
267 goto fallback;
268
269 if ((cl = (void*)res.class) == NULL) {
270 if (TC_H_MAJ(res.classid))
271 cl = cbq_class_lookup(q, res.classid);
272 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
273 cl = defmap[TC_PRIO_BESTEFFORT];
274
275 if (cl == NULL || cl->level >= head->level)
276 goto fallback;
277 }
278
279#ifdef CONFIG_NET_CLS_ACT
280 switch (result) {
281 case TC_ACT_QUEUED:
282		case TC_ACT_STOLEN:
283			*qerr = NET_XMIT_SUCCESS;
284 case TC_ACT_SHOT:
285 return NULL;
286 }
287#elif defined(CONFIG_NET_CLS_POLICE)
288 switch (result) {
289 case TC_POLICE_RECLASSIFY:
290 return cbq_reclassify(skb, cl);
291 case TC_POLICE_SHOT:
292 return NULL;
293 default:
294 break;
295 }
296#endif
297 if (cl->level == 0)
298 return cl;
299
300 /*
301 * Step 3+n. If classifier selected a link sharing class,
302 * apply agency specific classifier.
303	 * Repeat this procedure until we hit a leaf node.
304 */
305 head = cl;
306 }
307
308fallback:
309 cl = head;
310
311 /*
312 * Step 4. No success...
313 */
314 if (TC_H_MAJ(prio) == 0 &&
315 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
316 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
317 return head;
318
319 return cl;
320}
321
322/*
323 A packet has just been enqueued on the empty class.
324 cbq_activate_class adds it to the tail of active class list
325 of its priority band.
326 */
327
328static __inline__ void cbq_activate_class(struct cbq_class *cl)
329{
330 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
331 int prio = cl->cpriority;
332 struct cbq_class *cl_tail;
333
334 cl_tail = q->active[prio];
335 q->active[prio] = cl;
336
337 if (cl_tail != NULL) {
338 cl->next_alive = cl_tail->next_alive;
339 cl_tail->next_alive = cl;
340 } else {
341 cl->next_alive = cl;
342 q->activemask |= (1<<prio);
343 }
344}
345
346/*
347 Unlink class from active chain.
348 Note that this same procedure is done directly in cbq_dequeue*
349 during round-robin procedure.
350 */
351
352static void cbq_deactivate_class(struct cbq_class *this)
353{
354 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
355 int prio = this->cpriority;
356 struct cbq_class *cl;
357 struct cbq_class *cl_prev = q->active[prio];
358
359 do {
360 cl = cl_prev->next_alive;
361 if (cl == this) {
362 cl_prev->next_alive = cl->next_alive;
363 cl->next_alive = NULL;
364
365 if (cl == q->active[prio]) {
366 q->active[prio] = cl_prev;
367 if (cl == q->active[prio]) {
368 q->active[prio] = NULL;
369 q->activemask &= ~(1<<prio);
370 return;
371 }
372 }
373			return;
374 }
375 } while ((cl_prev = cl) != q->active[prio]);
376}
377
378static void
379cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
380{
381 int toplevel = q->toplevel;
382
383 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
384 psched_time_t now;
385 psched_tdiff_t incr;
386
387 PSCHED_GET_TIME(now);
388 incr = PSCHED_TDIFF(now, q->now_rt);
389 PSCHED_TADD2(q->now, incr, now);
390
391 do {
392 if (PSCHED_TLESS(cl->undertime, now)) {
393 q->toplevel = cl->level;
394 return;
395 }
396 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
397 }
398}
399
400static int
401cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
402{
403 struct cbq_sched_data *q = qdisc_priv(sch);
404 int len = skb->len;
405 int ret;
406 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
407
408#ifdef CONFIG_NET_CLS_POLICE
409 q->rx_class = cl;
410#endif
411 if (cl == NULL) {
412		if (ret == NET_XMIT_BYPASS)
413			sch->qstats.drops++;
414 kfree_skb(skb);
415 return ret;
416 }
417
418#ifdef CONFIG_NET_CLS_POLICE
419 cl->q->__parent = sch;
420#endif
421 if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
422 sch->q.qlen++;
423 sch->bstats.packets++;
424 sch->bstats.bytes+=len;
425 cbq_mark_toplevel(q, cl);
426 if (!cl->next_alive)
427 cbq_activate_class(cl);
428 return ret;
429 }
430
431 sch->qstats.drops++;
432 cbq_mark_toplevel(q, cl);
433 cl->qstats.drops++;
434 return ret;
435}
436
437static int
438cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
439{
440 struct cbq_sched_data *q = qdisc_priv(sch);
441 struct cbq_class *cl;
442 int ret;
443
444 if ((cl = q->tx_class) == NULL) {
445 kfree_skb(skb);
446 sch->qstats.drops++;
447 return NET_XMIT_CN;
448 }
449 q->tx_class = NULL;
450
451 cbq_mark_toplevel(q, cl);
452
453#ifdef CONFIG_NET_CLS_POLICE
454 q->rx_class = cl;
455 cl->q->__parent = sch;
456#endif
457 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
458 sch->q.qlen++;
459 sch->qstats.requeues++;
460 if (!cl->next_alive)
461 cbq_activate_class(cl);
462 return 0;
463 }
464 sch->qstats.drops++;
465 cl->qstats.drops++;
466 return ret;
467}
468
469/* Overlimit actions */
470
471/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
472
473static void cbq_ovl_classic(struct cbq_class *cl)
474{
475 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
476 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
477
478 if (!cl->delayed) {
479 delay += cl->offtime;
480
481		/*
482		   Class goes to sleep, so that it will have no
483 chance to work avgidle. Let's forgive it 8)
484
485		   BTW cbq-2.0 has a bug in this
486		   place; apparently they forgot to shift it by cl->ewma_log.
487 */
488 if (cl->avgidle < 0)
489 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
490 if (cl->avgidle < cl->minidle)
491 cl->avgidle = cl->minidle;
492 if (delay <= 0)
493 delay = 1;
494 PSCHED_TADD2(q->now, delay, cl->undertime);
495
496 cl->xstats.overactions++;
497 cl->delayed = 1;
498 }
499 if (q->wd_expires == 0 || q->wd_expires > delay)
500 q->wd_expires = delay;
501
502 /* Dirty work! We must schedule wakeups based on
503 real available rate, rather than leaf rate,
504 which may be tiny (even zero).
505 */
506 if (q->toplevel == TC_CBQ_MAXLEVEL) {
507 struct cbq_class *b;
508 psched_tdiff_t base_delay = q->wd_expires;
509
510 for (b = cl->borrow; b; b = b->borrow) {
511 delay = PSCHED_TDIFF(b->undertime, q->now);
512 if (delay < base_delay) {
513 if (delay <= 0)
514 delay = 1;
515 base_delay = delay;
516 }
517 }
518
519 q->wd_expires = base_delay;
520 }
521}
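/* Illustrative sketch, not part of the original file: the sleep delay
 * computed by cbq_ovl_classic() above, with psched arithmetic reduced to
 * plain longs.  "undertime_minus_now" is how far the class is ahead of its
 * allowed schedule; offtime, ewma_log and the scaling of a negative avgidle
 * follow the code above.  Names are editorial.
 */
static long example_classic_overlimit_delay(long undertime_minus_now,
					    long offtime, long avgidle,
					    unsigned char ewma_log)
{
	long delay = undertime_minus_now + offtime;

	/* "forgive" accumulated negative idle, scaled by (1 - 2^-ewma_log) */
	if (avgidle < 0)
		delay -= (-avgidle) - ((-avgidle) >> ewma_log);

	return delay > 0 ? delay : 1;	/* the code clamps the delay to >= 1 */
}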
522
523/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
524 they go overlimit
525 */
526
527static void cbq_ovl_rclassic(struct cbq_class *cl)
528{
529 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
530 struct cbq_class *this = cl;
531
532 do {
533 if (cl->level > q->toplevel) {
534 cl = NULL;
535 break;
536 }
537 } while ((cl = cl->borrow) != NULL);
538
539 if (cl == NULL)
540 cl = this;
541 cbq_ovl_classic(cl);
542}
543
544/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
545
546static void cbq_ovl_delay(struct cbq_class *cl)
547{
548 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
549 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
550
551 if (!cl->delayed) {
552		psched_time_t sched = q->now;
553 ktime_t expires;
554
555 delay += cl->offtime;
556 if (cl->avgidle < 0)
557 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
558 if (cl->avgidle < cl->minidle)
559 cl->avgidle = cl->minidle;
560 PSCHED_TADD2(q->now, delay, cl->undertime);
561
562 if (delay > 0) {
563			sched += delay + cl->penalty;
564			cl->penalized = sched;
565 cl->cpriority = TC_CBQ_MAXPRIO;
566 q->pmask |= (1<<TC_CBQ_MAXPRIO);
567
568 expires = ktime_set(0, 0);
569 expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
570 if (hrtimer_try_to_cancel(&q->delay_timer) &&
571 ktime_to_ns(ktime_sub(q->delay_timer.expires,
572 expires)) > 0)
573 q->delay_timer.expires = expires;
574 hrtimer_restart(&q->delay_timer);
575			cl->delayed = 1;
576 cl->xstats.overactions++;
577 return;
578 }
579 delay = 1;
580 }
581 if (q->wd_expires == 0 || q->wd_expires > delay)
582 q->wd_expires = delay;
583}
584
585/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
586
587static void cbq_ovl_lowprio(struct cbq_class *cl)
588{
589 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
590
591	cl->penalized = q->now + cl->penalty;
592
593 if (cl->cpriority != cl->priority2) {
594 cl->cpriority = cl->priority2;
595 q->pmask |= (1<<cl->cpriority);
596 cl->xstats.overactions++;
597 }
598 cbq_ovl_classic(cl);
599}
600
601/* TC_CBQ_OVL_DROP: penalize class by dropping */
602
603static void cbq_ovl_drop(struct cbq_class *cl)
604{
605 if (cl->q->ops->drop)
606 if (cl->q->ops->drop(cl->q))
607 cl->qdisc->q.qlen--;
608 cl->xstats.overactions++;
609 cbq_ovl_classic(cl);
610}
611
612static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
613 psched_time_t now)
614{
615 struct cbq_class *cl;
616 struct cbq_class *cl_prev = q->active[prio];
617	psched_time_t sched = now;
618
619 if (cl_prev == NULL)
620		return 0;
621
622 do {
623 cl = cl_prev->next_alive;
624		if (now - cl->penalized > 0) {
625			cl_prev->next_alive = cl->next_alive;
626 cl->next_alive = NULL;
627 cl->cpriority = cl->priority;
628 cl->delayed = 0;
629 cbq_activate_class(cl);
630
631 if (cl == q->active[prio]) {
632 q->active[prio] = cl_prev;
633 if (cl == q->active[prio]) {
634 q->active[prio] = NULL;
635 return 0;
636 }
637 }
638
639 cl = cl_prev->next_alive;
640		} else if (sched - cl->penalized > 0)
641			sched = cl->penalized;
642 } while ((cl_prev = cl) != q->active[prio]);
643
644	return sched - now;
645}
646
647static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
648{
649	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
650 delay_timer);
651 struct Qdisc *sch = q->watchdog.qdisc;
652 psched_time_t now;
653 psched_tdiff_t delay = 0;
654	unsigned pmask;
655
656	PSCHED_GET_TIME(now);
657
658	pmask = q->pmask;
659 q->pmask = 0;
660
661 while (pmask) {
662 int prio = ffz(~pmask);
663		psched_tdiff_t tmp;
664
665 pmask &= ~(1<<prio);
666
667		tmp = cbq_undelay_prio(q, prio, now);
668		if (tmp > 0) {
669 q->pmask |= 1<<prio;
670 if (tmp < delay || delay == 0)
671 delay = tmp;
672 }
673 }
674
675 if (delay) {
676		ktime_t time;
677
678 time = ktime_set(0, 0);
679 time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
680 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
681	}
682
683 sch->flags &= ~TCQ_F_THROTTLED;
684 netif_schedule(sch->dev);
685	return HRTIMER_NORESTART;
686}
687
688
689#ifdef CONFIG_NET_CLS_POLICE
690
691static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
692{
693 int len = skb->len;
694 struct Qdisc *sch = child->__parent;
695 struct cbq_sched_data *q = qdisc_priv(sch);
696 struct cbq_class *cl = q->rx_class;
697
698 q->rx_class = NULL;
699
700 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
701
702 cbq_mark_toplevel(q, cl);
703
704 q->rx_class = cl;
705 cl->q->__parent = sch;
706
707 if (cl->q->enqueue(skb, cl->q) == 0) {
708 sch->q.qlen++;
709 sch->bstats.packets++;
710 sch->bstats.bytes+=len;
711 if (!cl->next_alive)
712 cbq_activate_class(cl);
713 return 0;
714 }
715 sch->qstats.drops++;
716 return 0;
717 }
718
719 sch->qstats.drops++;
720 return -1;
721}
722#endif
723
724/*
725   This is a mission-critical procedure.
726
727   We "regenerate" the top-level cutoff if the transmitting class
728   has backlog and is not regulated. It is not part of the
729   original CBQ description, but looks more reasonable.
730   It may well be wrong; this question needs further investigation.
731*/
732
733static __inline__ void
734cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
735 struct cbq_class *borrowed)
736{
737 if (cl && q->toplevel >= borrowed->level) {
738 if (cl->q->q.qlen > 1) {
739 do {
740 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
741 q->toplevel = borrowed->level;
742 return;
743 }
744 } while ((borrowed=borrowed->borrow) != NULL);
745 }
746#if 0
747	/* It is not necessary now. Uncommenting it
748 will save CPU cycles, but decrease fairness.
749 */
750 q->toplevel = TC_CBQ_MAXLEVEL;
751#endif
752 }
753}
754
755static void
756cbq_update(struct cbq_sched_data *q)
757{
758 struct cbq_class *this = q->tx_class;
759 struct cbq_class *cl = this;
760 int len = q->tx_len;
761
762 q->tx_class = NULL;
763
764 for ( ; cl; cl = cl->share) {
765 long avgidle = cl->avgidle;
766 long idle;
767
768 cl->bstats.packets++;
769 cl->bstats.bytes += len;
770
771 /*
772 (now - last) is total time between packet right edges.
773 (last_pktlen/rate) is "virtual" busy time, so that
774
775		   idle = (now - last) - last_pktlen/rate
776		 */
777
778 idle = PSCHED_TDIFF(q->now, cl->last);
779 if ((unsigned long)idle > 128*1024*1024) {
780 avgidle = cl->maxidle;
781 } else {
782 idle -= L2T(cl, len);
783
784 /* true_avgidle := (1-W)*true_avgidle + W*idle,
785 where W=2^{-ewma_log}. But cl->avgidle is scaled:
786 cl->avgidle == true_avgidle/W,
787 hence:
788 */
789 avgidle += idle - (avgidle>>cl->ewma_log);
790 }
791
792 if (avgidle <= 0) {
793 /* Overlimit or at-limit */
794
795 if (avgidle < cl->minidle)
796 avgidle = cl->minidle;
797
798 cl->avgidle = avgidle;
799
800 /* Calculate expected time, when this class
801 will be allowed to send.
802 It will occur, when:
803 (1-W)*true_avgidle + W*delay = 0, i.e.
804 idle = (1/W - 1)*(-true_avgidle)
805 or
806 idle = (1 - W)*(-cl->avgidle);
807 */
808 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
809
810 /*
811 That is not all.
812 To maintain the rate allocated to the class,
813			   we add to undertime the amount of virtual clock
814			   time needed to complete the transmitted packet.
815			   (len/phys_bandwidth has already elapsed by
816			   the time cbq_update runs.)
817 */
818
819 idle -= L2T(&q->link, len);
820 idle += L2T(cl, len);
821
822 PSCHED_AUDIT_TDIFF(idle);
823
824 PSCHED_TADD2(q->now, idle, cl->undertime);
825 } else {
826 /* Underlimit */
827
828 PSCHED_SET_PASTPERFECT(cl->undertime);
829 if (avgidle > cl->maxidle)
830 cl->avgidle = cl->maxidle;
831 else
832 cl->avgidle = avgidle;
833 }
834 cl->last = q->now;
835 }
836
837 cbq_update_toplevel(q, this, q->tx_borrowed);
838}
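/* Illustrative sketch, not part of the original file: the scaled EWMA used
 * in cbq_update() above.  The true filter is
 *     true_avgidle = (1 - W) * true_avgidle + W * idle,   W = 2^-ewma_log,
 * and the code stores avgidle == true_avgidle / W, so a single update step
 * reduces to the shift form below.
 */
static long example_avgidle_update(long avgidle, long idle,
				   unsigned char ewma_log)
{
	/* identical to "avgidle += idle - (avgidle >> cl->ewma_log)" above */
	return avgidle + idle - (avgidle >> ewma_log);
}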
839
840static __inline__ struct cbq_class *
841cbq_under_limit(struct cbq_class *cl)
842{
843 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
844 struct cbq_class *this_cl = cl;
845
846 if (cl->tparent == NULL)
847 return cl;
848
849 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
850 !PSCHED_TLESS(q->now, cl->undertime)) {
851 cl->delayed = 0;
852 return cl;
853 }
854
855 do {
856		/* This is a very suspicious place. Currently the overlimit
857		   action is generated for non-bounded classes
858		   only if the link is completely congested.
859		   Though this agrees with the ancestor-only paradigm,
860		   it looks very stupid. In particular,
861		   it means that this chunk of code will either
862		   never be called or result in strong amplification
863		   of burstiness. Dangerous, silly, and yet
864		   no other solution exists.
865		 */
866 if ((cl = cl->borrow) == NULL) {
867 this_cl->qstats.overlimits++;
868 this_cl->overlimit(this_cl);
869 return NULL;
870 }
871 if (cl->level > q->toplevel)
872 return NULL;
873 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
874 PSCHED_TLESS(q->now, cl->undertime));
875
876 cl->delayed = 0;
877 return cl;
878}
879
880static __inline__ struct sk_buff *
881cbq_dequeue_prio(struct Qdisc *sch, int prio)
882{
883 struct cbq_sched_data *q = qdisc_priv(sch);
884 struct cbq_class *cl_tail, *cl_prev, *cl;
885 struct sk_buff *skb;
886 int deficit;
887
888 cl_tail = cl_prev = q->active[prio];
889 cl = cl_prev->next_alive;
890
891 do {
892 deficit = 0;
893
894 /* Start round */
895 do {
896 struct cbq_class *borrow = cl;
897
898 if (cl->q->q.qlen &&
899 (borrow = cbq_under_limit(cl)) == NULL)
900 goto skip_class;
901
902 if (cl->deficit <= 0) {
903 /* Class exhausted its allotment per
904 this round. Switch to the next one.
905 */
906 deficit = 1;
907 cl->deficit += cl->quantum;
908 goto next_class;
909 }
910
911 skb = cl->q->dequeue(cl->q);
912
913 /* Class did not give us any skb :-(
914			   It could occur even if cl->q->q.qlen != 0
915			   f.e. if cl->q == "tbf"
916 */
917 if (skb == NULL)
918 goto skip_class;
919
920 cl->deficit -= skb->len;
921 q->tx_class = cl;
922 q->tx_borrowed = borrow;
923 if (borrow != cl) {
924#ifndef CBQ_XSTATS_BORROWS_BYTES
925 borrow->xstats.borrows++;
926 cl->xstats.borrows++;
927#else
928 borrow->xstats.borrows += skb->len;
929 cl->xstats.borrows += skb->len;
930#endif
931 }
932 q->tx_len = skb->len;
933
934 if (cl->deficit <= 0) {
935 q->active[prio] = cl;
936 cl = cl->next_alive;
937 cl->deficit += cl->quantum;
938 }
939 return skb;
940
941skip_class:
942 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
943 /* Class is empty or penalized.
944 Unlink it from active chain.
945 */
946 cl_prev->next_alive = cl->next_alive;
947 cl->next_alive = NULL;
948
949 /* Did cl_tail point to it? */
950 if (cl == cl_tail) {
951 /* Repair it! */
952 cl_tail = cl_prev;
953
954 /* Was it the last class in this band? */
955 if (cl == cl_tail) {
956 /* Kill the band! */
957 q->active[prio] = NULL;
958 q->activemask &= ~(1<<prio);
959 if (cl->q->q.qlen)
960 cbq_activate_class(cl);
961 return NULL;
962 }
963
964 q->active[prio] = cl_tail;
965 }
966 if (cl->q->q.qlen)
967 cbq_activate_class(cl);
968
969 cl = cl_prev;
970 }
971
972next_class:
973 cl_prev = cl;
974 cl = cl->next_alive;
975 } while (cl_prev != cl_tail);
976 } while (deficit);
977
978 q->active[prio] = cl_prev;
979
980 return NULL;
981}
982
983static __inline__ struct sk_buff *
984cbq_dequeue_1(struct Qdisc *sch)
985{
986 struct cbq_sched_data *q = qdisc_priv(sch);
987 struct sk_buff *skb;
988 unsigned activemask;
989
990 activemask = q->activemask&0xFF;
991 while (activemask) {
992 int prio = ffz(~activemask);
993 activemask &= ~(1<<prio);
994 skb = cbq_dequeue_prio(sch, prio);
995 if (skb)
996 return skb;
997 }
998 return NULL;
999}
1000
1001static struct sk_buff *
1002cbq_dequeue(struct Qdisc *sch)
1003{
1004 struct sk_buff *skb;
1005 struct cbq_sched_data *q = qdisc_priv(sch);
1006 psched_time_t now;
1007 psched_tdiff_t incr;
1008
1009 PSCHED_GET_TIME(now);
1010 incr = PSCHED_TDIFF(now, q->now_rt);
1011
1012 if (q->tx_class) {
1013 psched_tdiff_t incr2;
1014 /* Time integrator. We calculate EOS time
1015 by adding expected packet transmission time.
1016 If real time is greater, we warp artificial clock,
1017 so that:
1018
1019 cbq_time = max(real_time, work);
1020 */
1021 incr2 = L2T(&q->link, q->tx_len);
1022 PSCHED_TADD(q->now, incr2);
1023 cbq_update(q);
1024 if ((incr -= incr2) < 0)
1025 incr = 0;
1026 }
1027 PSCHED_TADD(q->now, incr);
1028 q->now_rt = now;
1029
1030 for (;;) {
1031 q->wd_expires = 0;
1032
1033 skb = cbq_dequeue_1(sch);
1034 if (skb) {
1035 sch->q.qlen--;
1036 sch->flags &= ~TCQ_F_THROTTLED;
1037 return skb;
1038 }
1039
1040 /* All the classes are overlimit.
1041
1042 It is possible, if:
1043
1044 1. Scheduler is empty.
1045 2. Toplevel cutoff inhibited borrowing.
1046 3. Root class is overlimit.
1047
1048		   Reset the 2nd and 3rd conditions and retry.
1049
1050		   Note that NS and cbq-2.0 are buggy here: peeking
1051		   an arbitrary class is appropriate for ancestor-only
1052		   sharing, but not for the top-level algorithm.
1053
1054		   Our version is better but slower, because it requires
1055		   two passes; this is unavoidable with top-level sharing.
1056 */
1057
1058 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1059 PSCHED_IS_PASTPERFECT(q->link.undertime))
1060 break;
1061
1062 q->toplevel = TC_CBQ_MAXLEVEL;
1063 PSCHED_SET_PASTPERFECT(q->link.undertime);
1064 }
1065
1066 /* No packets in scheduler or nobody wants to give them to us :-(
1067 Sigh... start watchdog timer in the last case. */
1068
1069 if (sch->q.qlen) {
1070 sch->qstats.overlimits++;
1071		if (q->wd_expires)
1072 qdisc_watchdog_schedule(&q->watchdog,
1073						    now + q->wd_expires);
1074	}
1075 return NULL;
1076}
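/* Illustrative sketch, not part of the original file: the clock integrator
 * at the top of cbq_dequeue() above.  The logical clock is advanced by the
 * expected transmission time of the last packet ("work"), and only the
 * real-time increment beyond that is added on top, so effectively
 *     cbq_time = max(real-time based clock, work based clock).
 * All variable names here are editorial.
 */
static long example_cbq_clock(long cbq_now, long real_now,
			      long last_real_now, long tx_time)
{
	long incr = real_now - last_real_now;	/* real time elapsed          */

	cbq_now += tx_time;			/* expected end of service    */
	incr -= tx_time;
	if (incr < 0)
		incr = 0;			/* never run behind work done */

	return cbq_now + incr;
}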
1077
1078/* CBQ class maintenance routines */
1079
1080static void cbq_adjust_levels(struct cbq_class *this)
1081{
1082 if (this == NULL)
1083 return;
1084
1085 do {
1086 int level = 0;
1087 struct cbq_class *cl;
1088
1089 if ((cl = this->children) != NULL) {
1090 do {
1091 if (cl->level > level)
1092 level = cl->level;
1093 } while ((cl = cl->sibling) != this->children);
1094 }
1095 this->level = level+1;
1096 } while ((this = this->tparent) != NULL);
1097}
1098
1099static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1100{
1101 struct cbq_class *cl;
1102 unsigned h;
1103
1104 if (q->quanta[prio] == 0)
1105 return;
1106
1107 for (h=0; h<16; h++) {
1108 for (cl = q->classes[h]; cl; cl = cl->next) {
1109			/* BUGGGG... Beware! This expression suffers from
1110			   arithmetic overflows!
1111 */
1112 if (cl->priority == prio) {
1113 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1114 q->quanta[prio];
1115 }
1116 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1117 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1118 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
1119 }
1120 }
1121 }
1122}
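/* Illustrative arithmetic only, not part of the original file.  In
 * cbq_normalize_quanta() above, q->quanta[prio] is the sum of the weights in
 * the band and q->nclasses[prio] their count, so each class gets a per-round
 * quantum of roughly allot * weight / (average weight in the band).  With
 * three classes of weights 10, 20 and 30 and allot 1500, the weights sum to
 * 60 and the quanta come out as 750, 1500 and 2250 bytes.
 */
static long example_cbq_quantum(long weight, long allot,
				int nclasses, long sum_weights)
{
	return weight * allot * nclasses / sum_weights;
}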
1123
1124static void cbq_sync_defmap(struct cbq_class *cl)
1125{
1126 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1127 struct cbq_class *split = cl->split;
1128 unsigned h;
1129 int i;
1130
1131 if (split == NULL)
1132 return;
1133
1134 for (i=0; i<=TC_PRIO_MAX; i++) {
1135 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1136 split->defaults[i] = NULL;
1137 }
1138
1139 for (i=0; i<=TC_PRIO_MAX; i++) {
1140 int level = split->level;
1141
1142 if (split->defaults[i])
1143 continue;
1144
1145 for (h=0; h<16; h++) {
1146 struct cbq_class *c;
1147
1148 for (c = q->classes[h]; c; c = c->next) {
1149 if (c->split == split && c->level < level &&
1150 c->defmap&(1<<i)) {
1151 split->defaults[i] = c;
1152 level = c->level;
1153 }
1154 }
1155 }
1156 }
1157}
1158
1159static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1160{
1161 struct cbq_class *split = NULL;
1162
1163 if (splitid == 0) {
1164 if ((split = cl->split) == NULL)
1165 return;
1166 splitid = split->classid;
1167 }
1168
1169 if (split == NULL || split->classid != splitid) {
1170 for (split = cl->tparent; split; split = split->tparent)
1171 if (split->classid == splitid)
1172 break;
1173 }
1174
1175 if (split == NULL)
1176 return;
1177
1178 if (cl->split != split) {
1179 cl->defmap = 0;
1180 cbq_sync_defmap(cl);
1181 cl->split = split;
1182 cl->defmap = def&mask;
1183 } else
1184 cl->defmap = (cl->defmap&~mask)|(def&mask);
1185
1186 cbq_sync_defmap(cl);
1187}
1188
1189static void cbq_unlink_class(struct cbq_class *this)
1190{
1191 struct cbq_class *cl, **clp;
1192 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1193
1194 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1195 if (cl == this) {
1196 *clp = cl->next;
1197 cl->next = NULL;
1198 break;
1199 }
1200 }
1201
1202 if (this->tparent) {
1203 clp=&this->sibling;
1204 cl = *clp;
1205 do {
1206 if (cl == this) {
1207 *clp = cl->sibling;
1208 break;
1209 }
1210 clp = &cl->sibling;
1211 } while ((cl = *clp) != this->sibling);
1212
1213 if (this->tparent->children == this) {
1214 this->tparent->children = this->sibling;
1215 if (this->sibling == this)
1216 this->tparent->children = NULL;
1217 }
1218 } else {
1219 BUG_TRAP(this->sibling == this);
1220 }
1221}
1222
1223static void cbq_link_class(struct cbq_class *this)
1224{
1225 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1226 unsigned h = cbq_hash(this->classid);
1227 struct cbq_class *parent = this->tparent;
1228
1229 this->sibling = this;
1230 this->next = q->classes[h];
1231 q->classes[h] = this;
1232
1233 if (parent == NULL)
1234 return;
1235
1236 if (parent->children == NULL) {
1237 parent->children = this;
1238 } else {
1239 this->sibling = parent->children->sibling;
1240 parent->children->sibling = this;
1241 }
1242}
1243
1244static unsigned int cbq_drop(struct Qdisc* sch)
1245{
1246 struct cbq_sched_data *q = qdisc_priv(sch);
1247 struct cbq_class *cl, *cl_head;
1248 int prio;
1249 unsigned int len;
1250
1251 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1252 if ((cl_head = q->active[prio]) == NULL)
1253 continue;
1254
1255 cl = cl_head;
1256 do {
1257 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1258 sch->q.qlen--;
1259				if (!cl->q->q.qlen)
1260 cbq_deactivate_class(cl);
1261				return len;
1262 }
1263 } while ((cl = cl->next_alive) != cl_head);
1264 }
1265 return 0;
1266}
1267
1268static void
1269cbq_reset(struct Qdisc* sch)
1270{
1271 struct cbq_sched_data *q = qdisc_priv(sch);
1272 struct cbq_class *cl;
1273 int prio;
1274 unsigned h;
1275
1276 q->activemask = 0;
1277 q->pmask = 0;
1278 q->tx_class = NULL;
1279 q->tx_borrowed = NULL;
1280	qdisc_watchdog_cancel(&q->watchdog);
1281	hrtimer_cancel(&q->delay_timer);
1282	q->toplevel = TC_CBQ_MAXLEVEL;
1283 PSCHED_GET_TIME(q->now);
1284 q->now_rt = q->now;
1285
1286 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1287 q->active[prio] = NULL;
1288
1289 for (h = 0; h < 16; h++) {
1290 for (cl = q->classes[h]; cl; cl = cl->next) {
1291 qdisc_reset(cl->q);
1292
1293 cl->next_alive = NULL;
1294 PSCHED_SET_PASTPERFECT(cl->undertime);
1295 cl->avgidle = cl->maxidle;
1296 cl->deficit = cl->quantum;
1297 cl->cpriority = cl->priority;
1298 }
1299 }
1300 sch->q.qlen = 0;
1301}
1302
1303
1304static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1305{
1306 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1307 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1308 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1309 }
1310 if (lss->change&TCF_CBQ_LSS_EWMA)
1311 cl->ewma_log = lss->ewma_log;
1312 if (lss->change&TCF_CBQ_LSS_AVPKT)
1313 cl->avpkt = lss->avpkt;
1314 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1315 cl->minidle = -(long)lss->minidle;
1316 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1317 cl->maxidle = lss->maxidle;
1318 cl->avgidle = lss->maxidle;
1319 }
1320 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1321 cl->offtime = lss->offtime;
1322 return 0;
1323}
1324
1325static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1326{
1327 q->nclasses[cl->priority]--;
1328 q->quanta[cl->priority] -= cl->weight;
1329 cbq_normalize_quanta(q, cl->priority);
1330}
1331
1332static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1333{
1334 q->nclasses[cl->priority]++;
1335 q->quanta[cl->priority] += cl->weight;
1336 cbq_normalize_quanta(q, cl->priority);
1337}
1338
1339static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1340{
1341 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1342
1343 if (wrr->allot)
1344 cl->allot = wrr->allot;
1345 if (wrr->weight)
1346 cl->weight = wrr->weight;
1347 if (wrr->priority) {
1348 cl->priority = wrr->priority-1;
1349 cl->cpriority = cl->priority;
1350 if (cl->priority >= cl->priority2)
1351 cl->priority2 = TC_CBQ_MAXPRIO-1;
1352 }
1353
1354 cbq_addprio(q, cl);
1355 return 0;
1356}
1357
1358static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1359{
1360 switch (ovl->strategy) {
1361 case TC_CBQ_OVL_CLASSIC:
1362 cl->overlimit = cbq_ovl_classic;
1363 break;
1364 case TC_CBQ_OVL_DELAY:
1365 cl->overlimit = cbq_ovl_delay;
1366 break;
1367 case TC_CBQ_OVL_LOWPRIO:
1368 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1369 ovl->priority2-1 <= cl->priority)
1370 return -EINVAL;
1371 cl->priority2 = ovl->priority2-1;
1372 cl->overlimit = cbq_ovl_lowprio;
1373 break;
1374 case TC_CBQ_OVL_DROP:
1375 cl->overlimit = cbq_ovl_drop;
1376 break;
1377 case TC_CBQ_OVL_RCLASSIC:
1378 cl->overlimit = cbq_ovl_rclassic;
1379 break;
1380 default:
1381 return -EINVAL;
1382 }
1383	cl->penalty = ovl->penalty;
1384	return 0;
1385}
1386
1387#ifdef CONFIG_NET_CLS_POLICE
1388static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1389{
1390 cl->police = p->police;
1391
1392 if (cl->q->handle) {
1393 if (p->police == TC_POLICE_RECLASSIFY)
1394 cl->q->reshape_fail = cbq_reshape_fail;
1395 else
1396 cl->q->reshape_fail = NULL;
1397 }
1398 return 0;
1399}
1400#endif
1401
1402static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1403{
1404 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1405 return 0;
1406}
1407
1408static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1409{
1410 struct cbq_sched_data *q = qdisc_priv(sch);
1411 struct rtattr *tb[TCA_CBQ_MAX];
1412 struct tc_ratespec *r;
1413
1414 if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 ||
1415 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1416 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1417 return -EINVAL;
1418
1419 if (tb[TCA_CBQ_LSSOPT-1] &&
1420 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1421 return -EINVAL;
1422
1423 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1424
1425 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
1426 return -EINVAL;
1427
1428 q->link.refcnt = 1;
1429 q->link.sibling = &q->link;
1430 q->link.classid = sch->handle;
1431 q->link.qdisc = sch;
1432	if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1433 sch->handle)))
1434		q->link.q = &noop_qdisc;
1435
1436 q->link.priority = TC_CBQ_MAXPRIO-1;
1437 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1438 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1439 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1440 q->link.overlimit = cbq_ovl_classic;
1441 q->link.allot = psched_mtu(sch->dev);
1442 q->link.quantum = q->link.allot;
1443 q->link.weight = q->link.R_tab->rate.rate;
1444
1445 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1446 q->link.avpkt = q->link.allot/2;
1447 q->link.minidle = -0x7FFFFFFF;
1448 q->link.stats_lock = &sch->dev->queue_lock;
1449
1450	qdisc_watchdog_init(&q->watchdog, sch);
1451	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1452	q->delay_timer.function = cbq_undelay;
1453 q->toplevel = TC_CBQ_MAXLEVEL;
1454 PSCHED_GET_TIME(q->now);
1455 q->now_rt = q->now;
1456
1457 cbq_link_class(&q->link);
1458
1459 if (tb[TCA_CBQ_LSSOPT-1])
1460 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1461
1462 cbq_addprio(q, &q->link);
1463 return 0;
1464}
1465
1466static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1467{
1468	unsigned char *b = skb_tail_pointer(skb);
1469
1470 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1471 return skb->len;
1472
1473rtattr_failure:
1474 skb_trim(skb, b - skb->data);
1475 return -1;
1476}
1477
1478static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1479{
1480	unsigned char *b = skb_tail_pointer(skb);
1481	struct tc_cbq_lssopt opt;
1482
1483 opt.flags = 0;
1484 if (cl->borrow == NULL)
1485 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1486 if (cl->share == NULL)
1487 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1488 opt.ewma_log = cl->ewma_log;
1489 opt.level = cl->level;
1490 opt.avpkt = cl->avpkt;
1491 opt.maxidle = cl->maxidle;
1492 opt.minidle = (u32)(-cl->minidle);
1493 opt.offtime = cl->offtime;
1494 opt.change = ~0;
1495 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1496 return skb->len;
1497
1498rtattr_failure:
1499 skb_trim(skb, b - skb->data);
1500 return -1;
1501}
1502
1503static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1504{
1505	unsigned char *b = skb_tail_pointer(skb);
1506	struct tc_cbq_wrropt opt;
1507
1508 opt.flags = 0;
1509 opt.allot = cl->allot;
1510 opt.priority = cl->priority+1;
1511 opt.cpriority = cl->cpriority+1;
1512 opt.weight = cl->weight;
1513 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1514 return skb->len;
1515
1516rtattr_failure:
1517 skb_trim(skb, b - skb->data);
1518 return -1;
1519}
1520
1521static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1522{
1523	unsigned char *b = skb_tail_pointer(skb);
1524	struct tc_cbq_ovl opt;
1525
1526 opt.strategy = cl->ovl_strategy;
1527 opt.priority2 = cl->priority2+1;
1528	opt.pad = 0;
1529	opt.penalty = cl->penalty;
1530	RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1531 return skb->len;
1532
1533rtattr_failure:
1534 skb_trim(skb, b - skb->data);
1535 return -1;
1536}
1537
1538static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1539{
1540	unsigned char *b = skb_tail_pointer(skb);
1541	struct tc_cbq_fopt opt;
1542
1543 if (cl->split || cl->defmap) {
1544 opt.split = cl->split ? cl->split->classid : 0;
1545 opt.defmap = cl->defmap;
1546 opt.defchange = ~0;
1547 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1548 }
1549 return skb->len;
1550
1551rtattr_failure:
1552 skb_trim(skb, b - skb->data);
1553 return -1;
1554}
1555
1556#ifdef CONFIG_NET_CLS_POLICE
1557static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1558{
1559	unsigned char *b = skb_tail_pointer(skb);
1560	struct tc_cbq_police opt;
1561
1562 if (cl->police) {
1563 opt.police = cl->police;
1564		opt.__res1 = 0;
1565 opt.__res2 = 0;
1566		RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1567 }
1568 return skb->len;
1569
1570rtattr_failure:
1571 skb_trim(skb, b - skb->data);
1572 return -1;
1573}
1574#endif
1575
1576static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1577{
1578 if (cbq_dump_lss(skb, cl) < 0 ||
1579 cbq_dump_rate(skb, cl) < 0 ||
1580 cbq_dump_wrr(skb, cl) < 0 ||
1581 cbq_dump_ovl(skb, cl) < 0 ||
1582#ifdef CONFIG_NET_CLS_POLICE
1583 cbq_dump_police(skb, cl) < 0 ||
1584#endif
1585 cbq_dump_fopt(skb, cl) < 0)
1586 return -1;
1587 return 0;
1588}
1589
1590static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1591{
1592 struct cbq_sched_data *q = qdisc_priv(sch);
1593	unsigned char *b = skb_tail_pointer(skb);
1594	struct rtattr *rta;
1595
1596 rta = (struct rtattr*)b;
1597 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1598 if (cbq_dump_attr(skb, &q->link) < 0)
1599 goto rtattr_failure;
1600	rta->rta_len = skb_tail_pointer(skb) - b;
1601	return skb->len;
1602
1603rtattr_failure:
1604 skb_trim(skb, b - skb->data);
1605 return -1;
1606}
1607
1608static int
1609cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1610{
1611 struct cbq_sched_data *q = qdisc_priv(sch);
1612
1613 q->link.xstats.avgidle = q->link.avgidle;
1614 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1615}
1616
1617static int
1618cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1619 struct sk_buff *skb, struct tcmsg *tcm)
1620{
1621 struct cbq_class *cl = (struct cbq_class*)arg;
1622	unsigned char *b = skb_tail_pointer(skb);
1623	struct rtattr *rta;
1624
1625 if (cl->tparent)
1626 tcm->tcm_parent = cl->tparent->classid;
1627 else
1628 tcm->tcm_parent = TC_H_ROOT;
1629 tcm->tcm_handle = cl->classid;
1630 tcm->tcm_info = cl->q->handle;
1631
1632 rta = (struct rtattr*)b;
1633 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1634 if (cbq_dump_attr(skb, cl) < 0)
1635 goto rtattr_failure;
1636	rta->rta_len = skb_tail_pointer(skb) - b;
1637	return skb->len;
1638
1639rtattr_failure:
1640 skb_trim(skb, b - skb->data);
1641 return -1;
1642}
1643
1644static int
1645cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1646 struct gnet_dump *d)
1647{
1648 struct cbq_sched_data *q = qdisc_priv(sch);
1649 struct cbq_class *cl = (struct cbq_class*)arg;
1650
1651 cl->qstats.qlen = cl->q->q.qlen;
1652 cl->xstats.avgidle = cl->avgidle;
1653 cl->xstats.undertime = 0;
1654
1655 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1656 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1657
1658 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1659#ifdef CONFIG_NET_ESTIMATOR
1660 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1661#endif
1662 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1663 return -1;
1664
1665 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1666}
1667
1668static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1669 struct Qdisc **old)
1670{
1671 struct cbq_class *cl = (struct cbq_class*)arg;
1672
1673 if (cl) {
1674 if (new == NULL) {
1675			if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1676 cl->classid)) == NULL)
1677				return -ENOBUFS;
1678 } else {
1679#ifdef CONFIG_NET_CLS_POLICE
1680 if (cl->police == TC_POLICE_RECLASSIFY)
1681 new->reshape_fail = cbq_reshape_fail;
1682#endif
1683 }
1684 sch_tree_lock(sch);
1685		*old = xchg(&cl->q, new);
1686		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1687		qdisc_reset(*old);
1688 sch_tree_unlock(sch);
1689
1690 return 0;
1691 }
1692 return -ENOENT;
1693}
1694
1695static struct Qdisc *
1696cbq_leaf(struct Qdisc *sch, unsigned long arg)
1697{
1698 struct cbq_class *cl = (struct cbq_class*)arg;
1699
1700 return cl ? cl->q : NULL;
1701}
1702
1703static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1704{
1705 struct cbq_class *cl = (struct cbq_class *)arg;
1706
1707 if (cl->q->q.qlen == 0)
1708 cbq_deactivate_class(cl);
1709}
1710
1711static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1712{
1713 struct cbq_sched_data *q = qdisc_priv(sch);
1714 struct cbq_class *cl = cbq_class_lookup(q, classid);
1715
1716 if (cl) {
1717 cl->refcnt++;
1718 return (unsigned long)cl;
1719 }
1720 return 0;
1721}
1722
1723static void cbq_destroy_filters(struct cbq_class *cl)
1724{
1725 struct tcf_proto *tp;
1726
1727 while ((tp = cl->filter_list) != NULL) {
1728 cl->filter_list = tp->next;
1729 tcf_destroy(tp);
1730 }
1731}
1732
1733static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1734{
1735 struct cbq_sched_data *q = qdisc_priv(sch);
1736
1737 BUG_TRAP(!cl->filters);
1738
1739 cbq_destroy_filters(cl);
1740 qdisc_destroy(cl->q);
1741 qdisc_put_rtab(cl->R_tab);
1742#ifdef CONFIG_NET_ESTIMATOR
1743 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1744#endif
1745 if (cl != &q->link)
1746 kfree(cl);
1747}
1748
1749static void
1750cbq_destroy(struct Qdisc* sch)
1751{
1752 struct cbq_sched_data *q = qdisc_priv(sch);
1753 struct cbq_class *cl;
1754 unsigned h;
1755
1756#ifdef CONFIG_NET_CLS_POLICE
1757 q->rx_class = NULL;
1758#endif
1759 /*
1760 * Filters must be destroyed first because we don't destroy the
1761	 * classes from root to leaves, which means that filters can still
1762 * be bound to classes which have been destroyed already. --TGR '04
1763 */
1764 for (h = 0; h < 16; h++)
1765 for (cl = q->classes[h]; cl; cl = cl->next)
1766 cbq_destroy_filters(cl);
1767
1768 for (h = 0; h < 16; h++) {
1769 struct cbq_class *next;
1770
1771 for (cl = q->classes[h]; cl; cl = next) {
1772 next = cl->next;
1773 cbq_destroy_class(sch, cl);
1774 }
1775 }
1776}
1777
1778static void cbq_put(struct Qdisc *sch, unsigned long arg)
1779{
1780 struct cbq_class *cl = (struct cbq_class*)arg;
1781
1782 if (--cl->refcnt == 0) {
1783#ifdef CONFIG_NET_CLS_POLICE
1784 struct cbq_sched_data *q = qdisc_priv(sch);
1785
1786 spin_lock_bh(&sch->dev->queue_lock);
1787 if (q->rx_class == cl)
1788 q->rx_class = NULL;
1789 spin_unlock_bh(&sch->dev->queue_lock);
1790#endif
1791
1792 cbq_destroy_class(sch, cl);
1793 }
1794}
1795
1796static int
1797cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1798 unsigned long *arg)
1799{
1800 int err;
1801 struct cbq_sched_data *q = qdisc_priv(sch);
1802 struct cbq_class *cl = (struct cbq_class*)*arg;
1803 struct rtattr *opt = tca[TCA_OPTIONS-1];
1804 struct rtattr *tb[TCA_CBQ_MAX];
1805 struct cbq_class *parent;
1806 struct qdisc_rate_table *rtab = NULL;
1807
1808 if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt))
1809 return -EINVAL;
1810
1811 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1812 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1813 return -EINVAL;
1814
1815 if (tb[TCA_CBQ_FOPT-1] &&
1816 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1817 return -EINVAL;
1818
1819 if (tb[TCA_CBQ_RATE-1] &&
1820 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1821 return -EINVAL;
1822
1823 if (tb[TCA_CBQ_LSSOPT-1] &&
1824 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1825 return -EINVAL;
1826
1827 if (tb[TCA_CBQ_WRROPT-1] &&
1828 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1829 return -EINVAL;
1830
1831#ifdef CONFIG_NET_CLS_POLICE
1832 if (tb[TCA_CBQ_POLICE-1] &&
1833 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1834 return -EINVAL;
1835#endif
1836
1837 if (cl) {
1838 /* Check parent */
1839 if (parentid) {
1840 if (cl->tparent && cl->tparent->classid != parentid)
1841 return -EINVAL;
1842 if (!cl->tparent && parentid != TC_H_ROOT)
1843 return -EINVAL;
1844 }
1845
1846 if (tb[TCA_CBQ_RATE-1]) {
1847 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1848 if (rtab == NULL)
1849 return -EINVAL;
1850 }
1851
1852 /* Change class parameters */
1853 sch_tree_lock(sch);
1854
1855 if (cl->next_alive != NULL)
1856 cbq_deactivate_class(cl);
1857
1858 if (rtab) {
1859 rtab = xchg(&cl->R_tab, rtab);
1860 qdisc_put_rtab(rtab);
1861 }
1862
1863 if (tb[TCA_CBQ_LSSOPT-1])
1864 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1865
1866 if (tb[TCA_CBQ_WRROPT-1]) {
1867 cbq_rmprio(q, cl);
1868 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1869 }
1870
1871 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1872 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1873
1874#ifdef CONFIG_NET_CLS_POLICE
1875 if (tb[TCA_CBQ_POLICE-1])
1876 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1877#endif
1878
1879 if (tb[TCA_CBQ_FOPT-1])
1880 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1881
1882 if (cl->q->q.qlen)
1883 cbq_activate_class(cl);
1884
1885 sch_tree_unlock(sch);
1886
1887#ifdef CONFIG_NET_ESTIMATOR
1888 if (tca[TCA_RATE-1])
1889 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1890 cl->stats_lock, tca[TCA_RATE-1]);
1891#endif
1892 return 0;
1893 }
1894
1895 if (parentid == TC_H_ROOT)
1896 return -EINVAL;
1897
1898 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1899 tb[TCA_CBQ_LSSOPT-1] == NULL)
1900 return -EINVAL;
1901
1902 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1903 if (rtab == NULL)
1904 return -EINVAL;
1905
1906 if (classid) {
1907 err = -EINVAL;
1908 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1909 goto failure;
1910 } else {
1911 int i;
1912 classid = TC_H_MAKE(sch->handle,0x8000);
1913
1914 for (i=0; i<0x8000; i++) {
1915 if (++q->hgenerator >= 0x8000)
1916 q->hgenerator = 1;
1917 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1918 break;
1919 }
1920 err = -ENOSR;
1921 if (i >= 0x8000)
1922 goto failure;
1923 classid = classid|q->hgenerator;
1924 }
1925
1926 parent = &q->link;
1927 if (parentid) {
1928 parent = cbq_class_lookup(q, parentid);
1929 err = -EINVAL;
1930 if (parent == NULL)
1931 goto failure;
1932 }
1933
1934 err = -ENOBUFS;
1935	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1936	if (cl == NULL)
1937 goto failure;
1938	cl->R_tab = rtab;
1939 rtab = NULL;
1940 cl->refcnt = 1;
1941	if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid)))
1942		cl->q = &noop_qdisc;
1943 cl->classid = classid;
1944 cl->tparent = parent;
1945 cl->qdisc = sch;
1946 cl->allot = parent->allot;
1947 cl->quantum = cl->allot;
1948 cl->weight = cl->R_tab->rate.rate;
1949 cl->stats_lock = &sch->dev->queue_lock;
1950
1951 sch_tree_lock(sch);
1952 cbq_link_class(cl);
1953 cl->borrow = cl->tparent;
1954 if (cl->tparent != &q->link)
1955 cl->share = cl->tparent;
1956 cbq_adjust_levels(parent);
1957 cl->minidle = -0x7FFFFFFF;
1958 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1959 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1960 if (cl->ewma_log==0)
1961 cl->ewma_log = q->link.ewma_log;
1962 if (cl->maxidle==0)
1963 cl->maxidle = q->link.maxidle;
1964 if (cl->avpkt==0)
1965 cl->avpkt = q->link.avpkt;
1966 cl->overlimit = cbq_ovl_classic;
1967 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1968 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1969#ifdef CONFIG_NET_CLS_POLICE
1970 if (tb[TCA_CBQ_POLICE-1])
1971 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1972#endif
1973 if (tb[TCA_CBQ_FOPT-1])
1974 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1975 sch_tree_unlock(sch);
1976
1977#ifdef CONFIG_NET_ESTIMATOR
1978 if (tca[TCA_RATE-1])
1979 gen_new_estimator(&cl->bstats, &cl->rate_est,
1980 cl->stats_lock, tca[TCA_RATE-1]);
1981#endif
1982
1983 *arg = (unsigned long)cl;
1984 return 0;
1985
1986failure:
1987 qdisc_put_rtab(rtab);
1988 return err;
1989}
1990
1991static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1992{
1993 struct cbq_sched_data *q = qdisc_priv(sch);
1994 struct cbq_class *cl = (struct cbq_class*)arg;
1995	unsigned int qlen;
1996
1997 if (cl->filters || cl->children || cl == &q->link)
1998 return -EBUSY;
1999
2000 sch_tree_lock(sch);
2001
2002	qlen = cl->q->q.qlen;
2003 qdisc_reset(cl->q);
2004 qdisc_tree_decrease_qlen(cl->q, qlen);
2005
2006	if (cl->next_alive)
2007 cbq_deactivate_class(cl);
2008
2009 if (q->tx_borrowed == cl)
2010 q->tx_borrowed = q->tx_class;
2011 if (q->tx_class == cl) {
2012 q->tx_class = NULL;
2013 q->tx_borrowed = NULL;
2014 }
2015#ifdef CONFIG_NET_CLS_POLICE
2016 if (q->rx_class == cl)
2017 q->rx_class = NULL;
2018#endif
2019
2020 cbq_unlink_class(cl);
2021 cbq_adjust_levels(cl->tparent);
2022 cl->defmap = 0;
2023 cbq_sync_defmap(cl);
2024
2025 cbq_rmprio(q, cl);
2026 sch_tree_unlock(sch);
2027
2028 if (--cl->refcnt == 0)
2029 cbq_destroy_class(sch, cl);
2030
2031 return 0;
2032}
2033
2034static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2035{
2036 struct cbq_sched_data *q = qdisc_priv(sch);
2037 struct cbq_class *cl = (struct cbq_class *)arg;
2038
2039 if (cl == NULL)
2040 cl = &q->link;
2041
2042 return &cl->filter_list;
2043}
2044
2045static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2046 u32 classid)
2047{
2048 struct cbq_sched_data *q = qdisc_priv(sch);
2049 struct cbq_class *p = (struct cbq_class*)parent;
2050 struct cbq_class *cl = cbq_class_lookup(q, classid);
2051
2052 if (cl) {
2053 if (p && p->level <= cl->level)
2054 return 0;
2055 cl->filters++;
2056 return (unsigned long)cl;
2057 }
2058 return 0;
2059}
2060
2061static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2062{
2063 struct cbq_class *cl = (struct cbq_class*)arg;
2064
2065 cl->filters--;
2066}
2067
2068static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2069{
2070 struct cbq_sched_data *q = qdisc_priv(sch);
2071 unsigned h;
2072
2073 if (arg->stop)
2074 return;
2075
2076 for (h = 0; h < 16; h++) {
2077 struct cbq_class *cl;
2078
2079 for (cl = q->classes[h]; cl; cl = cl->next) {
2080 if (arg->count < arg->skip) {
2081 arg->count++;
2082 continue;
2083 }
2084 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2085 arg->stop = 1;
2086 return;
2087 }
2088 arg->count++;
2089 }
2090 }
2091}
2092
2093static struct Qdisc_class_ops cbq_class_ops = {
2094 .graft = cbq_graft,
2095 .leaf = cbq_leaf,
2096	.qlen_notify	=	cbq_qlen_notify,
2097	.get		=	cbq_get,
2098 .put = cbq_put,
2099 .change = cbq_change_class,
2100 .delete = cbq_delete,
2101 .walk = cbq_walk,
2102 .tcf_chain = cbq_find_tcf,
2103 .bind_tcf = cbq_bind_filter,
2104 .unbind_tcf = cbq_unbind_filter,
2105 .dump = cbq_dump_class,
2106 .dump_stats = cbq_dump_class_stats,
2107};
2108
2109static struct Qdisc_ops cbq_qdisc_ops = {
2110 .next = NULL,
2111 .cl_ops = &cbq_class_ops,
2112 .id = "cbq",
2113 .priv_size = sizeof(struct cbq_sched_data),
2114 .enqueue = cbq_enqueue,
2115 .dequeue = cbq_dequeue,
2116 .requeue = cbq_requeue,
2117 .drop = cbq_drop,
2118 .init = cbq_init,
2119 .reset = cbq_reset,
2120 .destroy = cbq_destroy,
2121 .change = NULL,
2122 .dump = cbq_dump,
2123 .dump_stats = cbq_dump_stats,
2124 .owner = THIS_MODULE,
2125};
2126
2127static int __init cbq_module_init(void)
2128{
2129 return register_qdisc(&cbq_qdisc_ops);
2130}
2131static void __exit cbq_module_exit(void)
2132{
2133 unregister_qdisc(&cbq_qdisc_ops);
2134}
2135module_init(cbq_module_init)
2136module_exit(cbq_module_exit)
2137MODULE_LICENSE("GPL");