blob: 57ac6c5cd27399c211ac0712bd92c242ceb88505 [file] [log] [blame]
/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */
12
Linus Torvalds1da177e2005-04-16 15:20:36 -070013#include <linux/module.h>
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070019#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/in.h>
24#include <linux/errno.h>
25#include <linux/interrupt.h>
26#include <linux/if_ether.h>
27#include <linux/inet.h>
28#include <linux/netdevice.h>
29#include <linux/etherdevice.h>
30#include <linux/notifier.h>
31#include <net/ip.h>
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -070032#include <net/netlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070033#include <net/route.h>
34#include <linux/skbuff.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37
38
39/* Class-Based Queueing (CBQ) algorithm.
40 =======================================
41
42 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +090043 Management Models for Packet Networks",
Linus Torvalds1da177e2005-04-16 15:20:36 -070044 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
45
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +090046 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
Linus Torvalds1da177e2005-04-16 15:20:36 -070047
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +090048 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
Linus Torvalds1da177e2005-04-16 15:20:36 -070049 Parameters", 1996
50
51 [4] Sally Floyd and Michael Speer, "Experimental Results
52 for Class-Based Queueing", 1998, not published.
53
54 -----------------------------------------------------------------------
55
56 Algorithm skeleton was taken from NS simulator cbq.cc.
57 If someone wants to check this code against the LBL version,
58 he should take into account that ONLY the skeleton was borrowed,
59 the implementation is different. Particularly:
60
61 --- The WRR algorithm is different. Our version looks more
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +090062 reasonable (I hope) and works when quanta are allowed to be
63 less than MTU, which is always the case when real time classes
64 have small rates. Note, that the statement of [3] is
65 incomplete, delay may actually be estimated even if class
66 per-round allotment is less than MTU. Namely, if per-round
67 allotment is W*r_i, and r_1+...+r_k = r < 1
Linus Torvalds1da177e2005-04-16 15:20:36 -070068
69 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
70
71 In the worst case we have IntServ estimate with D = W*r+k*MTU
72 and C = MTU*r. The proof (if correct at all) is trivial.
73
74
75 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
76 interpret some places, which look like wrong translations
77 from NS. Anyone is advised to find these differences
78 and explain to me, why I am wrong 8).
79
80 --- Linux has no EOI event, so that we cannot estimate true class
81 idle time. Workaround is to consider the next dequeue event
82 as sign that previous packet is finished. This is wrong because of
83 internal device queueing, but on a permanently loaded link it is true.
84 Moreover, combined with clock integrator, this scheme looks
85 very close to an ideal solution. */
86
87struct cbq_sched_data;
88
89
struct cbq_class
{
	struct cbq_class	*next;		/* hash table link */
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	u32			classid;
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */
	unsigned char		ovl_strategy;	/* TC_CBQ_OVL_*; presumably selects the
						   overlimit() handler below — set by
						   management code not in this view */
#ifdef CONFIG_NET_CLS_POLICE
	unsigned char		police;
#endif

	u32			defmap;		/* bitmap over TC_PRIO_*: priorities this
						   class is the default for on its split
						   node (see cbq_sync_defmap) */

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below. */
	long			offtime;	/* extra sleep added when the class goes
						   overlimit (see cbq_ovl_classic) */
	long			minidle;	/* lower clamp applied to avgidle */
	u32			avpkt;		/* average packet size — not referenced in
						   this chunk; TODO confirm use */
	struct qdisc_rate_table	*R_tab;		/* rate table: len -> xmit time, see L2T() */

	/* Overlimit strategy parameters */
	void			(*overlimit)(struct cbq_class *cl);	/* one of cbq_ovl_* */
	psched_tdiff_t		penalty;	/* added to q->now when class is penalized
						   (DELAY/LOWPRIO strategies) */

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;	/* nonzero while sleeping under an
						   overlimit action */
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;	/* virtual time when the class becomes
						   underlimit again */
	long			avgidle;	/* scaled EWMA of idle time (see cbq_update) */
	long			deficit;	/* Saved deficit for WRR */
	psched_time_t		penalized;	/* absolute time when cbq_undelay restores
						   the class to its normal band */
	struct gnet_stats_basic bstats;
	struct gnet_stats_queue qstats;
	struct gnet_stats_rate_est rate_est;
	spinlock_t		*stats_lock;
	struct tc_cbq_xstats	xstats;

	struct tcf_proto	*filter_list;

	int			refcnt;		/* reference count */
	int			filters;	/* number of attached filters — TODO
						   confirm (management code not in view) */

	struct cbq_class	*defaults[TC_PRIO_MAX+1];	/* split-node default class
							   per TC_PRIO_* value */
};
161
struct cbq_sched_data
{
	struct cbq_class	*classes[16];		/* Hash table of all classes */
	int			nclasses[TC_CBQ_MAXPRIO+1];	/* classes per priority band */
	unsigned		quanta[TC_CBQ_MAXPRIO+1];	/* per-band normalization divisor
							   (see cbq_normalize_quanta) */

	struct cbq_class	link;			/* built-in root ("link") class */

	unsigned		activemask;		/* one bit per band with backlog */
	struct cbq_class	*active[TC_CBQ_MAXPRIO+1];	/* List of all classes
							   with backlog */

#ifdef CONFIG_NET_CLS_POLICE
	struct cbq_class	*rx_class;		/* class of packet being enqueued
							   (for cbq_reshape_fail) */
#endif
	struct cbq_class	*tx_class;		/* class that sent the last packet;
							   charged in cbq_update() */
	struct cbq_class	*tx_borrowed;		/* class it borrowed from */
	int			tx_len;			/* length of last sent packet */
	psched_time_t		now;			/* Cached timestamp */
	psched_time_t		now_rt;			/* Cached real time */
	unsigned		pmask;			/* one bit per band holding
							   penalized classes */

	struct hrtimer		delay_timer;		/* restores penalized classes,
							   fires cbq_undelay() */
	struct qdisc_watchdog	watchdog;		/* Watchdog timer,
							   started when CBQ has
							   backlog, but cannot
							   transmit just now */
	psched_tdiff_t		wd_expires;		/* relative watchdog expiry collected
							   during one dequeue pass */
	int			toplevel;		/* toplevel cutoff for borrowing */
	u32			hgenerator;		/* classid generation hint — TODO
							   confirm, generator not in view */
};
193
194
/* L2T ("length to time"): look up the transmit time of a len-byte packet
 * in the class's rate table, indexed by cell (len >> cell_log). */
#define L2T(cl,len)	((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
196
197
198static __inline__ unsigned cbq_hash(u32 h)
199{
200 h ^= h>>8;
201 h ^= h>>4;
202 return h&0xF;
203}
204
205static __inline__ struct cbq_class *
206cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
207{
208 struct cbq_class *cl;
209
210 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
211 if (cl->classid == classid)
212 return cl;
213 return NULL;
214}
215
216#ifdef CONFIG_NET_CLS_POLICE
217
218static struct cbq_class *
219cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
220{
221 struct cbq_class *cl, *new;
222
223 for (cl = this->tparent; cl; cl = cl->tparent)
224 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
225 return new;
226
227 return NULL;
228}
229
230#endif
231
232/* Classify packet. The procedure is pretty complicated, but
233 it allows us to combine link sharing and priority scheduling
234 transparently.
235
236 Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
237 so that it resolves to split nodes. Then packets are classified
238 by logical priority, or a more specific classifier may be attached
239 to the split node.
240 */
241
/* Classify skb to a leaf class.  On NULL return, *qerr tells the caller
 * whether the packet was dropped by an action (NET_XMIT_SUCCESS means an
 * action consumed it, NET_XMIT_BYPASS means count it as a drop). */
static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio^sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	*qerr = NET_XMIT_BYPASS;
	for (;;) {
		int result = 0;
		defmap = head->defaults;

		/*
		 * Step 2+n. Apply classifier.
		 */
		if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
			goto fallback;

		if ((cl = (void*)res.class) == NULL) {
			/* Filter returned a bare classid: resolve it directly,
			   or fall back to the split node's defaults table. */
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			if (cl == NULL || cl->level >= head->level)
				goto fallback;
		}

#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
			*qerr = NET_XMIT_SUCCESS;
			/* fall through: action consumed the packet */
		case TC_ACT_SHOT:
			return NULL;
		}
#elif defined(CONFIG_NET_CLS_POLICE)
		switch (result) {
		case TC_POLICE_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		case TC_POLICE_SHOT:
			return NULL;
		default:
			break;
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If classifier selected a link sharing class,
		 *	   apply agency specific classifier.
		 *	   Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}
322
323/*
324 A packet has just been enqueued on the empty class.
325 cbq_activate_class adds it to the tail of active class list
326 of its priority band.
327 */
328
329static __inline__ void cbq_activate_class(struct cbq_class *cl)
330{
331 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
332 int prio = cl->cpriority;
333 struct cbq_class *cl_tail;
334
335 cl_tail = q->active[prio];
336 q->active[prio] = cl;
337
338 if (cl_tail != NULL) {
339 cl->next_alive = cl_tail->next_alive;
340 cl_tail->next_alive = cl;
341 } else {
342 cl->next_alive = cl;
343 q->activemask |= (1<<prio);
344 }
345}
346
347/*
348 Unlink class from active chain.
349 Note that this same procedure is done directly in cbq_dequeue*
350 during round-robin procedure.
351 */
352
/*
 * Unlink 'this' from the circular active list of its band.
 * Note that this same procedure is done directly in cbq_dequeue*
 * during the round-robin procedure.
 */
static void cbq_deactivate_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	int prio = this->cpriority;
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];

	do {
		cl = cl_prev->next_alive;
		if (cl == this) {
			/* Unlink from the ring. */
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;

			if (cl == q->active[prio]) {
				/* Removed class was the tail: step tail back. */
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					/* It was the only member: band empty. */
					q->active[prio] = NULL;
					q->activemask &= ~(1<<prio);
					return;
				}
			}
			return;
		}
	} while ((cl_prev = cl) != q->active[prio]);
}
378
/* Borrowing heuristic: when a non-throttled class receives a packet,
 * lower the toplevel cutoff to the first class on its borrow chain that
 * is currently underlimit, allowing it to borrow bandwidth. */
static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
	int toplevel = q->toplevel;

	if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
		psched_time_t now;
		psched_tdiff_t incr;

		/* Advance the cached virtual clock by the real time elapsed
		   since it was last synced in cbq_dequeue(). */
		PSCHED_GET_TIME(now);
		incr = PSCHED_TDIFF(now, q->now_rt);
		PSCHED_TADD2(q->now, incr, now);

		do {
			if (PSCHED_TLESS(cl->undertime, now)) {
				q->toplevel = cl->level;
				return;
			}
		} while ((cl=cl->borrow) != NULL && toplevel > cl->level);
	}
}
400
/* Qdisc entry point: classify skb, enqueue it on the leaf class's inner
 * qdisc, update stats and (re)activate the class if it gained backlog. */
static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	int len = skb->len;
	int ret;
	struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_POLICE
	/* Remember the class for cbq_reshape_fail() on the inner qdisc. */
	q->rx_class = cl;
#endif
	if (cl == NULL) {
		/* BYPASS means "not consumed by an action": count as a drop. */
		if (ret == NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}

#ifdef CONFIG_NET_CLS_POLICE
	cl->q->__parent = sch;
#endif
	if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
		sch->q.qlen++;
		sch->bstats.packets++;
		sch->bstats.bytes+=len;
		cbq_mark_toplevel(q, cl);
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return ret;
	}

	sch->qstats.drops++;
	cbq_mark_toplevel(q, cl);
	cl->qstats.drops++;
	return ret;
}
437
/* Put back a packet that was dequeued but not transmitted.  It returns
 * to the class cached in tx_class; without that class (already gone)
 * the packet must be dropped. */
static int
cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int ret;

	if ((cl = q->tx_class) == NULL) {
		kfree_skb(skb);
		sch->qstats.drops++;
		return NET_XMIT_CN;
	}
	q->tx_class = NULL;

	cbq_mark_toplevel(q, cl);

#ifdef CONFIG_NET_CLS_POLICE
	q->rx_class = cl;
	cl->q->__parent = sch;
#endif
	if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return 0;
	}
	sch->qstats.drops++;
	cl->qstats.drops++;
	return ret;
}
469
470/* Overlimit actions */
471
472/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
473
/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime.
 * Computes how long the class must sleep (time until undertime, plus
 * offtime corrected by the negative avgidle debt) and folds the result
 * into q->wd_expires so cbq_dequeue() can arm the watchdog. */
static void cbq_ovl_classic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);

	if (!cl->delayed) {
		delay += cl->offtime;

		/*
		   Class goes to sleep, so that it will have no
		   chance to work avgidle. Let's forgive it 8)

		   BTW cbq-2.0 has a crap in this
		   place, apparently they forgot to shift it by cl->ewma_log.
		 */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		if (delay <= 0)
			delay = 1;
		PSCHED_TADD2(q->now, delay, cl->undertime);

		cl->xstats.overactions++;
		cl->delayed = 1;
	}
	/* Keep the earliest pending wakeup for this dequeue pass. */
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;

	/* Dirty work! We must schedule wakeups based on
	   real available rate, rather than leaf rate,
	   which may be tiny (even zero).
	 */
	if (q->toplevel == TC_CBQ_MAXLEVEL) {
		struct cbq_class *b;
		psched_tdiff_t base_delay = q->wd_expires;

		for (b = cl->borrow; b; b = b->borrow) {
			delay = PSCHED_TDIFF(b->undertime, q->now);
			if (delay < base_delay) {
				if (delay <= 0)
					delay = 1;
				base_delay = delay;
			}
		}

		q->wd_expires = base_delay;
	}
}
523
524/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
525 they go overlimit
526 */
527
528static void cbq_ovl_rclassic(struct cbq_class *cl)
529{
530 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
531 struct cbq_class *this = cl;
532
533 do {
534 if (cl->level > q->toplevel) {
535 cl = NULL;
536 break;
537 }
538 } while ((cl = cl->borrow) != NULL);
539
540 if (cl == NULL)
541 cl = this;
542 cbq_ovl_classic(cl);
543}
544
545/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
546
/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit.
 * Moves the class to the MAXPRIO penalty band and arms delay_timer to
 * fire cbq_undelay() at the absolute time 'sched'. */
static void cbq_ovl_delay(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);

	if (!cl->delayed) {
		psched_time_t sched = q->now;
		ktime_t expires;

		delay += cl->offtime;
		/* Same avgidle-debt correction as cbq_ovl_classic(). */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		PSCHED_TADD2(q->now, delay, cl->undertime);

		if (delay > 0) {
			sched += delay + cl->penalty;
			cl->penalized = sched;
			cl->cpriority = TC_CBQ_MAXPRIO;
			q->pmask |= (1<<TC_CBQ_MAXPRIO);

			expires = ktime_set(0, 0);
			expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
			/* If the timer is pending later than our deadline,
			   pull it earlier; then (re)start it. */
			if (hrtimer_try_to_cancel(&q->delay_timer) &&
			    ktime_to_ns(ktime_sub(q->delay_timer.expires,
						  expires)) > 0)
				q->delay_timer.expires = expires;
			hrtimer_restart(&q->delay_timer);
			cl->delayed = 1;
			cl->xstats.overactions++;
			return;
		}
		delay = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;
}
585
586/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
587
588static void cbq_ovl_lowprio(struct cbq_class *cl)
589{
590 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
591
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700592 cl->penalized = q->now + cl->penalty;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700593
594 if (cl->cpriority != cl->priority2) {
595 cl->cpriority = cl->priority2;
596 q->pmask |= (1<<cl->cpriority);
597 cl->xstats.overactions++;
598 }
599 cbq_ovl_classic(cl);
600}
601
602/* TC_CBQ_OVL_DROP: penalize class by dropping */
603
604static void cbq_ovl_drop(struct cbq_class *cl)
605{
606 if (cl->q->ops->drop)
607 if (cl->q->ops->drop(cl->q))
608 cl->qdisc->q.qlen--;
609 cl->xstats.overactions++;
610 cbq_ovl_classic(cl);
611}
612
/* Restore penalized classes in band 'prio' whose penalty has expired,
 * re-activating them at their normal priority.  Returns 0 when nothing
 * remains penalized, otherwise the delay until the earliest remaining
 * expiry (relative to 'now'). */
static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
				       psched_time_t now)
{
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];
	psched_time_t sched = now;

	if (cl_prev == NULL)
		return 0;

	do {
		cl = cl_prev->next_alive;
		if (now - cl->penalized > 0) {
			/* Penalty over: unlink from the penalty ring and
			   put the class back on its normal band. */
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;
			cl->cpriority = cl->priority;
			cl->delayed = 0;
			cbq_activate_class(cl);

			if (cl == q->active[prio]) {
				/* Tail removed: repair, and detect empty ring. */
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					return 0;
				}
			}

			cl = cl_prev->next_alive;
		} else if (sched - cl->penalized > 0)
			/* Track the earliest still-pending expiry. */
			sched = cl->penalized;
	} while ((cl_prev = cl) != q->active[prio]);

	return sched - now;
}
647
/* delay_timer handler: sweep every band flagged in pmask, restoring
 * classes whose penalty expired; re-arm the timer for the earliest
 * remaining expiry and kick the device to resume transmission. */
static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
						delay_timer);
	struct Qdisc *sch = q->watchdog.qdisc;
	psched_time_t now;
	psched_tdiff_t delay = 0;
	unsigned pmask;

	PSCHED_GET_TIME(now);

	pmask = q->pmask;
	q->pmask = 0;

	while (pmask) {
		int prio = ffz(~pmask);
		psched_tdiff_t tmp;

		pmask &= ~(1<<prio);

		tmp = cbq_undelay_prio(q, prio, now);
		if (tmp > 0) {
			/* Band still has penalized classes: remember it and
			   keep the minimum pending delay. */
			q->pmask |= 1<<prio;
			if (tmp < delay || delay == 0)
				delay = tmp;
		}
	}

	if (delay) {
		ktime_t time;

		time = ktime_set(0, 0);
		time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
	}

	sch->flags &= ~TCQ_F_THROTTLED;
	netif_schedule(sch->dev);
	return HRTIMER_NORESTART;
}
688
689
690#ifdef CONFIG_NET_CLS_POLICE
691
/* Called by an inner qdisc when policing made its enqueue fail: try to
 * reclassify the packet to a fallback class and enqueue it there.
 * Returns 0 if the packet was taken care of (even if dropped with stats
 * counted), -1 if no fallback class exists. */
static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
	int len = skb->len;
	struct Qdisc *sch = child->__parent;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = q->rx_class;

	q->rx_class = NULL;

	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {

		cbq_mark_toplevel(q, cl);

		q->rx_class = cl;
		cl->q->__parent = sch;

		if (cl->q->enqueue(skb, cl->q) == 0) {
			sch->q.qlen++;
			sch->bstats.packets++;
			sch->bstats.bytes+=len;
			if (!cl->next_alive)
				cbq_activate_class(cl);
			return 0;
		}
		sch->qstats.drops++;
		return 0;
	}

	sch->qstats.drops++;
	return -1;
}
723#endif
724
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900725/*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700726 It is mission critical procedure.
727
728 We "regenerate" toplevel cutoff, if transmitting class
729 has backlog and it is not regulated. It is not part of
730 original CBQ description, but looks more reasonable.
731 Probably, it is wrong. This question needs further investigation.
732*/
733
/* Raise ("regenerate") the toplevel cutoff after a transmission: if the
 * sender still has backlog and some class on the borrow chain is past-
 * perfect (never throttled), lower the cutoff to that class's level. */
static __inline__ void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
		    struct cbq_class *borrowed)
{
	if (cl && q->toplevel >= borrowed->level) {
		if (cl->q->q.qlen > 1) {
			do {
				if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
					q->toplevel = borrowed->level;
					return;
				}
			} while ((borrowed=borrowed->borrow) != NULL);
		}
#if 0
	/* It is not necessary now. Uncommenting it
	   will save CPU cycles, but decrease fairness.
	 */
		q->toplevel = TC_CBQ_MAXLEVEL;
#endif
	}
}
755
/* Charge the just-transmitted packet (q->tx_len bytes, from tx_class)
 * to that class and every link-sharing ancestor on its ->share chain:
 * update byte/packet stats, recompute the avgidle EWMA and derive the
 * next undertime per the CBQ estimator. */
static void
cbq_update(struct cbq_sched_data *q)
{
	struct cbq_class *this = q->tx_class;
	struct cbq_class *cl = this;
	int len = q->tx_len;

	q->tx_class = NULL;

	for ( ; cl; cl = cl->share) {
		long avgidle = cl->avgidle;
		long idle;

		cl->bstats.packets++;
		cl->bstats.bytes += len;

		/*
		   (now - last) is total time between packet right edges.
		   (last_pktlen/rate) is "virtual" busy time, so that

			 idle = (now - last) - last_pktlen/rate
		 */

		idle = PSCHED_TDIFF(q->now, cl->last);
		if ((unsigned long)idle > 128*1024*1024) {
			/* Huge gap: treat the class as fully idle. */
			avgidle = cl->maxidle;
		} else {
			idle -= L2T(cl, len);

		/* true_avgidle := (1-W)*true_avgidle + W*idle,
		   where W=2^{-ewma_log}. But cl->avgidle is scaled:
		   cl->avgidle == true_avgidle/W,
		   hence:
		 */
			avgidle += idle - (avgidle>>cl->ewma_log);
		}

		if (avgidle <= 0) {
			/* Overlimit or at-limit */

			if (avgidle < cl->minidle)
				avgidle = cl->minidle;

			cl->avgidle = avgidle;

			/* Calculate expected time, when this class
			   will be allowed to send.
			   It will occur, when:
			   (1-W)*true_avgidle + W*delay = 0, i.e.
			   idle = (1/W - 1)*(-true_avgidle)
			   or
			   idle = (1 - W)*(-cl->avgidle);
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
			   That is not all.
			   To maintain the rate allocated to the class,
			   we add to undertime virtual clock,
			   necessary to complete transmitted packet.
			   (len/phys_bandwidth has been already passed
			   to the moment of cbq_update)
			 */

			idle -= L2T(&q->link, len);
			idle += L2T(cl, len);

			PSCHED_TADD2(q->now, idle, cl->undertime);
		} else {
			/* Underlimit */

			PSCHED_SET_PASTPERFECT(cl->undertime);
			if (avgidle > cl->maxidle)
				cl->avgidle = cl->maxidle;
			else
				cl->avgidle = avgidle;
		}
		cl->last = q->now;
	}

	cbq_update_toplevel(q, this, q->tx_borrowed);
}
838
/* Return cl itself if it may transmit now, else the nearest borrowable
 * ancestor that may; NULL (after invoking cl's overlimit action) when
 * nothing on the borrow chain can send. */
static __inline__ struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this_cl = cl;

	/* The root class is never limited. */
	if (cl->tparent == NULL)
		return cl;

	if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
	    !PSCHED_TLESS(q->now, cl->undertime)) {
		cl->delayed = 0;
		return cl;
	}

	do {
		/* It is very suspicious place. Now overlimit
		   action is generated for not bounded classes
		   only if link is completely congested.
		   Though it is in agree with ancestor-only paradigm,
		   it looks very stupid. Particularly,
		   it means that this chunk of code will either
		   never be called or result in strong amplification
		   of burstiness. Dangerous, silly, and, however,
		   no another solution exists.
		 */
		if ((cl = cl->borrow) == NULL) {
			this_cl->qstats.overlimits++;
			this_cl->overlimit(this_cl);
			return NULL;
		}
		if (cl->level > q->toplevel)
			return NULL;
	} while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
		 PSCHED_TLESS(q->now, cl->undertime));

	cl->delayed = 0;
	return cl;
}
878
/* One weighted-round-robin pass over the circular active list of a
 * single priority band.  Returns the dequeued skb, or NULL when the
 * band yielded nothing (empty, penalized, or everything overlimit). */
static __inline__ struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl_tail, *cl_prev, *cl;
	struct sk_buff *skb;
	int deficit;

	cl_tail = cl_prev = q->active[prio];
	cl = cl_prev->next_alive;

	do {
		deficit = 0;

		/* Start round */
		do {
			struct cbq_class *borrow = cl;

			if (cl->q->q.qlen &&
			    (borrow = cbq_under_limit(cl)) == NULL)
				goto skip_class;

			if (cl->deficit <= 0) {
				/* Class exhausted its allotment per
				   this round. Switch to the next one.
				 */
				deficit = 1;
				cl->deficit += cl->quantum;
				goto next_class;
			}

			skb = cl->q->dequeue(cl->q);

			/* Class did not give us any skb :-(
			   It could occur even if cl->q->q.qlen != 0
			   f.e. if cl->q == "tbf"
			 */
			if (skb == NULL)
				goto skip_class;

			cl->deficit -= skb->len;
			/* Remember sender and lender for cbq_update(). */
			q->tx_class = cl;
			q->tx_borrowed = borrow;
			if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
				borrow->xstats.borrows++;
				cl->xstats.borrows++;
#else
				borrow->xstats.borrows += skb->len;
				cl->xstats.borrows += skb->len;
#endif
			}
			q->tx_len = skb->len;

			if (cl->deficit <= 0) {
				/* Allotment used up: rotate the ring so the
				   next class starts the following round. */
				q->active[prio] = cl;
				cl = cl->next_alive;
				cl->deficit += cl->quantum;
			}
			return skb;

skip_class:
			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* Class is empty or penalized.
				   Unlink it from active chain.
				 */
				cl_prev->next_alive = cl->next_alive;
				cl->next_alive = NULL;

				/* Did cl_tail point to it? */
				if (cl == cl_tail) {
					/* Repair it! */
					cl_tail = cl_prev;

					/* Was it the last class in this band? */
					if (cl == cl_tail) {
						/* Kill the band! */
						q->active[prio] = NULL;
						q->activemask &= ~(1<<prio);
						if (cl->q->q.qlen)
							cbq_activate_class(cl);
						return NULL;
					}

					q->active[prio] = cl_tail;
				}
				if (cl->q->q.qlen)
					cbq_activate_class(cl);

				cl = cl_prev;
			}

next_class:
			cl_prev = cl;
			cl = cl->next_alive;
		} while (cl_prev != cl_tail);
	} while (deficit);

	q->active[prio] = cl_prev;

	return NULL;
}
981
982static __inline__ struct sk_buff *
983cbq_dequeue_1(struct Qdisc *sch)
984{
985 struct cbq_sched_data *q = qdisc_priv(sch);
986 struct sk_buff *skb;
987 unsigned activemask;
988
989 activemask = q->activemask&0xFF;
990 while (activemask) {
991 int prio = ffz(~activemask);
992 activemask &= ~(1<<prio);
993 skb = cbq_dequeue_prio(sch, prio);
994 if (skb)
995 return skb;
996 }
997 return NULL;
998}
999
/* Qdisc dequeue entry point: sync the virtual clock, charge the packet
 * sent since the last call (time integrator), then run the WRR scan,
 * relaxing the toplevel cutoff once before giving up and arming the
 * watchdog. */
static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct cbq_sched_data *q = qdisc_priv(sch);
	psched_time_t now;
	psched_tdiff_t incr;

	PSCHED_GET_TIME(now);
	incr = PSCHED_TDIFF(now, q->now_rt);

	if (q->tx_class) {
		psched_tdiff_t incr2;
		/* Time integrator. We calculate EOS time
		   by adding expected packet transmission time.
		   If real time is greater, we warp artificial clock,
		   so that:

		   cbq_time = max(real_time, work);
		 */
		incr2 = L2T(&q->link, q->tx_len);
		PSCHED_TADD(q->now, incr2);
		cbq_update(q);
		if ((incr -= incr2) < 0)
			incr = 0;
	}
	PSCHED_TADD(q->now, incr);
	q->now_rt = now;

	for (;;) {
		q->wd_expires = 0;

		skb = cbq_dequeue_1(sch);
		if (skb) {
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		/* All the classes are overlimit.

		   It is possible, if:
		   1. Scheduler is empty.
		   2. Toplevel cutoff inhibited borrowing.
		   3. Root class is overlimit.

		   Reset 2d and 3d conditions and retry.

		   Note, that NS and cbq-2.0 are buggy, peeking
		   an arbitrary class is appropriate for ancestor-only
		   sharing, but not for toplevel algorithm.

		   Our version is better, but slower, because it requires
		   two passes, but it is unavoidable with top-level sharing.
		*/

		if (q->toplevel == TC_CBQ_MAXLEVEL &&
		    PSCHED_IS_PASTPERFECT(q->link.undertime))
			break;

		q->toplevel = TC_CBQ_MAXLEVEL;
		PSCHED_SET_PASTPERFECT(q->link.undertime);
	}

	/* No packets in scheduler or nobody wants to give them to us :-(
	   Sigh... start watchdog timer in the last case. */

	if (sch->q.qlen) {
		sch->qstats.overlimits++;
		if (q->wd_expires)
			qdisc_watchdog_schedule(&q->watchdog,
						now + q->wd_expires);
	}
	return NULL;
}
1076
1077/* CBQ class maintanance routines */
1078
1079static void cbq_adjust_levels(struct cbq_class *this)
1080{
1081 if (this == NULL)
1082 return;
1083
1084 do {
1085 int level = 0;
1086 struct cbq_class *cl;
1087
1088 if ((cl = this->children) != NULL) {
1089 do {
1090 if (cl->level > level)
1091 level = cl->level;
1092 } while ((cl = cl->sibling) != this->children);
1093 }
1094 this->level = level+1;
1095 } while ((this = this->tparent) != NULL);
1096}
1097
/* Recompute quantum for every class in the given priority band from
 * weight*allot scaled by the band totals, clamping insane values to
 * mtu/2 + 1.  The multiplication can overflow (noted below) — kept
 * as-is to preserve behavior. */
static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *cl;
	unsigned h;

	if (q->quanta[prio] == 0)
		return;

	for (h=0; h<16; h++) {
		for (cl = q->classes[h]; cl; cl = cl->next) {
			/* BUGGGG... Beware! This expression suffer of
			   arithmetic overflows!
			 */
			if (cl->priority == prio) {
				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
					q->quanta[prio];
			}
			/* NOTE: this sanity clamp runs for every class in the
			   hash chain, not only those of 'prio'. */
			if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
				printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
				cl->quantum = cl->qdisc->dev->mtu/2 + 1;
			}
		}
	}
}
1122
/* Rebuild the default-class table (split->defaults[]) of @cl's split
 * node.  First drop every slot that still points at @cl but is no
 * longer claimed by cl->defmap, then refill empty slots by scanning
 * all classes attached to the same split, preferring the claimant
 * with the lowest level.
 */
static void cbq_sync_defmap(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = cl->split;
	unsigned h;
	int i;

	if (split == NULL)
		return;

	/* Pass 1: invalidate stale entries pointing at @cl. */
	for (i=0; i<=TC_PRIO_MAX; i++) {
		if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
			split->defaults[i] = NULL;
	}

	/* Pass 2: refill empty slots from all classes on this split. */
	for (i=0; i<=TC_PRIO_MAX; i++) {
		int level = split->level;

		if (split->defaults[i])
			continue;

		for (h=0; h<16; h++) {
			struct cbq_class *c;

			for (c = q->classes[h]; c; c = c->next) {
				/* deepest (lowest-level) claimant wins */
				if (c->split == split && c->level < level &&
				    c->defmap&(1<<i)) {
					split->defaults[i] = c;
					level = c->level;
				}
			}
		}
	}
}
1157
/* Change which priorities @cl claims as defaults on split @splitid.
 * @splitid == 0 means "keep the current split".  @mask selects the
 * affected defmap bits; @def supplies their new values.  The split
 * node must be an ancestor of @cl, otherwise the call is a no-op.
 */
static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
	struct cbq_class *split = NULL;

	if (splitid == 0) {
		if ((split = cl->split) == NULL)
			return;
		splitid = split->classid;
	}

	/* Locate the requested split among @cl's ancestors. */
	if (split == NULL || split->classid != splitid) {
		for (split = cl->tparent; split; split = split->tparent)
			if (split->classid == splitid)
				break;
	}

	if (split == NULL)
		return;

	if (cl->split != split) {
		/* Moving to a new split: detach from the old one first. */
		cl->defmap = 0;
		cbq_sync_defmap(cl);
		cl->split = split;
		cl->defmap = def&mask;
	} else
		cl->defmap = (cl->defmap&~mask)|(def&mask);

	cbq_sync_defmap(cl);
}
1187
/* Remove @this from the class hash table and from its parent's
 * circular sibling ring.  Inverse of cbq_link_class().
 */
static void cbq_unlink_class(struct cbq_class *this)
{
	struct cbq_class *cl, **clp;
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);

	/* Unlink from the hash chain. */
	for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
		if (cl == this) {
			*clp = cl->next;
			cl->next = NULL;
			break;
		}
	}

	if (this->tparent) {
		/* Unlink from the circular sibling ring. */
		clp=&this->sibling;
		cl = *clp;
		do {
			if (cl == this) {
				*clp = cl->sibling;
				break;
			}
			clp = &cl->sibling;
		} while ((cl = *clp) != this->sibling);

		/* Fix up the parent's children pointer if it pointed here;
		 * a class that was its own only sibling leaves the parent
		 * childless. */
		if (this->tparent->children == this) {
			this->tparent->children = this->sibling;
			if (this->sibling == this)
				this->tparent->children = NULL;
		}
	} else {
		/* Root class: must be alone on its ring. */
		BUG_TRAP(this->sibling == this);
	}
}
1221
1222static void cbq_link_class(struct cbq_class *this)
1223{
1224 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1225 unsigned h = cbq_hash(this->classid);
1226 struct cbq_class *parent = this->tparent;
1227
1228 this->sibling = this;
1229 this->next = q->classes[h];
1230 q->classes[h] = this;
1231
1232 if (parent == NULL)
1233 return;
1234
1235 if (parent->children == NULL) {
1236 parent->children = this;
1237 } else {
1238 this->sibling = parent->children->sibling;
1239 parent->children->sibling = this;
1240 }
1241}
1242
/* Qdisc ->drop() hook: drop one packet from the lowest-priority active
 * class whose leaf qdisc supports dropping.  Returns the length of the
 * dropped packet, or 0 if nothing could be dropped.
 */
static unsigned int cbq_drop(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl, *cl_head;
	int prio;
	unsigned int len;

	/* Walk priorities from least important to most important. */
	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
		if ((cl_head = q->active[prio]) == NULL)
			continue;

		cl = cl_head;
		do {
			if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
				sch->q.qlen--;
				/* Class ran dry: take it off the active ring. */
				if (!cl->q->q.qlen)
					cbq_deactivate_class(cl);
				return len;
			}
		} while ((cl = cl->next_alive) != cl_head);
	}
	return 0;
}
1266
/* Qdisc ->reset() hook: flush every leaf queue, cancel pending timers
 * and return per-class scheduling state to its just-initialized values.
 */
static void
cbq_reset(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int prio;
	unsigned h;

	q->activemask = 0;
	q->pmask = 0;
	q->tx_class = NULL;
	q->tx_borrowed = NULL;
	qdisc_watchdog_cancel(&q->watchdog);
	hrtimer_cancel(&q->delay_timer);
	q->toplevel = TC_CBQ_MAXLEVEL;
	PSCHED_GET_TIME(q->now);
	q->now_rt = q->now;

	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
		q->active[prio] = NULL;

	for (h = 0; h < 16; h++) {
		for (cl = q->classes[h]; cl; cl = cl->next) {
			qdisc_reset(cl->q);

			/* Restore per-class scheduler state. */
			cl->next_alive = NULL;
			PSCHED_SET_PASTPERFECT(cl->undertime);
			cl->avgidle = cl->maxidle;
			cl->deficit = cl->quantum;
			cl->cpriority = cl->priority;
		}
	}
	sch->q.qlen = 0;
}
1301
1302
1303static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1304{
1305 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1306 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1307 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1308 }
1309 if (lss->change&TCF_CBQ_LSS_EWMA)
1310 cl->ewma_log = lss->ewma_log;
1311 if (lss->change&TCF_CBQ_LSS_AVPKT)
1312 cl->avpkt = lss->avpkt;
1313 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1314 cl->minidle = -(long)lss->minidle;
1315 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1316 cl->maxidle = lss->maxidle;
1317 cl->avgidle = lss->maxidle;
1318 }
1319 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1320 cl->offtime = lss->offtime;
1321 return 0;
1322}
1323
1324static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1325{
1326 q->nclasses[cl->priority]--;
1327 q->quanta[cl->priority] -= cl->weight;
1328 cbq_normalize_quanta(q, cl->priority);
1329}
1330
1331static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1332{
1333 q->nclasses[cl->priority]++;
1334 q->quanta[cl->priority] += cl->weight;
1335 cbq_normalize_quanta(q, cl->priority);
1336}
1337
/* Apply weighted-round-robin parameters to @cl.  Zero-valued fields in
 * @wrr mean "leave unchanged".  Re-registers the class in the priority
 * accounting; when changing an existing class the caller must have
 * done cbq_rmprio() beforehand.  Always returns 0.
 */
static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	if (wrr->allot)
		cl->allot = wrr->allot;
	if (wrr->weight)
		cl->weight = wrr->weight;
	if (wrr->priority) {
		/* Userspace priorities are 1-based; internal ones 0-based. */
		cl->priority = wrr->priority-1;
		cl->cpriority = cl->priority;
		/* Keep the overlimit fallback priority strictly below. */
		if (cl->priority >= cl->priority2)
			cl->priority2 = TC_CBQ_MAXPRIO-1;
	}

	cbq_addprio(q, cl);
	return 0;
}
1356
/* Select the overlimit (rate-exceeded) handler for @cl according to
 * the requested strategy.  Returns 0 on success, -EINVAL for an
 * unknown strategy or a bad lowprio fallback priority.
 */
static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
{
	switch (ovl->strategy) {
	case TC_CBQ_OVL_CLASSIC:
		cl->overlimit = cbq_ovl_classic;
		break;
	case TC_CBQ_OVL_DELAY:
		cl->overlimit = cbq_ovl_delay;
		break;
	case TC_CBQ_OVL_LOWPRIO:
		/* Fallback priority must be in range and numerically
		 * greater (i.e. less important) than the class's own. */
		if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
		    ovl->priority2-1 <= cl->priority)
			return -EINVAL;
		cl->priority2 = ovl->priority2-1;
		cl->overlimit = cbq_ovl_lowprio;
		break;
	case TC_CBQ_OVL_DROP:
		cl->overlimit = cbq_ovl_drop;
		break;
	case TC_CBQ_OVL_RCLASSIC:
		cl->overlimit = cbq_ovl_rclassic;
		break;
	default:
		return -EINVAL;
	}
	cl->penalty = ovl->penalty;
	return 0;
}
1385
#ifdef CONFIG_NET_CLS_POLICE
/* Set the policing mode of @cl; for RECLASSIFY, install the reshape
 * callback on the class's leaf qdisc so over-limit packets can be
 * reclassified instead of dropped.  Always returns 0.
 */
static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
{
	cl->police = p->police;

	/* Skip the built-in noop qdisc (handle 0). */
	if (cl->q->handle) {
		if (p->police == TC_POLICE_RECLASSIFY)
			cl->q->reshape_fail = cbq_reshape_fail;
		else
			cl->q->reshape_fail = NULL;
	}
	return 0;
}
#endif
1400
1401static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1402{
1403 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1404 return 0;
1405}
1406
/* Qdisc ->init() hook: parse netlink attributes and set up the root
 * ("link") class, watchdog and delay timers, and defaults.
 * TCA_CBQ_RATE and TCA_CBQ_RTAB are mandatory.  Returns 0 on success
 * or -EINVAL on bad/missing attributes.
 */
static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct rtattr *tb[TCA_CBQ_MAX];
	struct tc_ratespec *r;

	if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 ||
	    tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
		return -EINVAL;

	if (tb[TCA_CBQ_LSSOPT-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
		return -EINVAL;

	r = RTA_DATA(tb[TCA_CBQ_RATE-1]);

	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
		return -EINVAL;

	/* The root class is embedded in the qdisc private area; fall
	 * back to noop_qdisc if the default pfifo cannot be created. */
	q->link.refcnt = 1;
	q->link.sibling = &q->link;
	q->link.classid = sch->handle;
	q->link.qdisc = sch;
	if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
					    sch->handle)))
		q->link.q = &noop_qdisc;

	q->link.priority = TC_CBQ_MAXPRIO-1;
	q->link.priority2 = TC_CBQ_MAXPRIO-1;
	q->link.cpriority = TC_CBQ_MAXPRIO-1;
	q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
	q->link.overlimit = cbq_ovl_classic;
	q->link.allot = psched_mtu(sch->dev);
	q->link.quantum = q->link.allot;
	q->link.weight = q->link.R_tab->rate.rate;

	q->link.ewma_log = TC_CBQ_DEF_EWMA;
	q->link.avpkt = q->link.allot/2;
	q->link.minidle = -0x7FFFFFFF;
	q->link.stats_lock = &sch->dev->queue_lock;

	qdisc_watchdog_init(&q->watchdog, sch);
	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	q->delay_timer.function = cbq_undelay;
	q->toplevel = TC_CBQ_MAXLEVEL;
	PSCHED_GET_TIME(q->now);
	q->now_rt = q->now;

	cbq_link_class(&q->link);

	if (tb[TCA_CBQ_LSSOPT-1])
		cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));

	cbq_addprio(q, &q->link);
	return 0;
}
1464
/* Dump the rate spec of @cl as TCA_CBQ_RATE.  Returns skb->len on
 * success, -1 (with the skb trimmed back) if the attribute did not fit.
 */
static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);

	RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1476
/* Dump link-sharing parameters of @cl as TCA_CBQ_LSSOPT.  Returns
 * skb->len on success, -1 (skb trimmed back) on overflow.
 */
static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_lssopt opt;

	opt.flags = 0;
	if (cl->borrow == NULL)
		opt.flags |= TCF_CBQ_LSS_BOUNDED;
	if (cl->share == NULL)
		opt.flags |= TCF_CBQ_LSS_ISOLATED;
	opt.ewma_log = cl->ewma_log;
	opt.level = cl->level;
	opt.avpkt = cl->avpkt;
	opt.maxidle = cl->maxidle;
	/* minidle is stored negated internally; undo it for userspace. */
	opt.minidle = (u32)(-cl->minidle);
	opt.offtime = cl->offtime;
	opt.change = ~0;
	RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1501
1502static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1503{
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001504 unsigned char *b = skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001505 struct tc_cbq_wrropt opt;
1506
1507 opt.flags = 0;
1508 opt.allot = cl->allot;
1509 opt.priority = cl->priority+1;
1510 opt.cpriority = cl->cpriority+1;
1511 opt.weight = cl->weight;
1512 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1513 return skb->len;
1514
1515rtattr_failure:
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -07001516 nlmsg_trim(skb, b);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001517 return -1;
1518}
1519
/* Dump the overlimit-strategy parameters of @cl as
 * TCA_CBQ_OVL_STRATEGY.  All struct fields (including pad) are set
 * explicitly so no stack garbage is copied to userspace.
 */
static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_ovl opt;

	opt.strategy = cl->ovl_strategy;
	opt.priority2 = cl->priority2+1;
	opt.pad = 0;
	opt.penalty = cl->penalty;
	RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1536
/* Dump split/defmap filter options of @cl as TCA_CBQ_FOPT.  The
 * attribute is emitted only when the class has a split node or a
 * non-empty default map.
 */
static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_fopt opt;

	if (cl->split || cl->defmap) {
		opt.split = cl->split ? cl->split->classid : 0;
		opt.defmap = cl->defmap;
		opt.defchange = ~0;
		RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
	}
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1554
#ifdef CONFIG_NET_CLS_POLICE
/* Dump the policing mode of @cl as TCA_CBQ_POLICE (only when set). */
static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_police opt;

	if (cl->police) {
		opt.police = cl->police;
		/* Zero the reserved fields so no stack garbage reaches
		 * userspace. */
		opt.__res1 = 0;
		opt.__res2 = 0;
		RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
	}
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}
#endif
1574
/* Emit every CBQ attribute of @cl into @skb, in the canonical order.
 * Returns 0 on success, -1 as soon as any attribute fails to fit.
 */
static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
	if (cbq_dump_lss(skb, cl) < 0)
		return -1;
	if (cbq_dump_rate(skb, cl) < 0)
		return -1;
	if (cbq_dump_wrr(skb, cl) < 0)
		return -1;
	if (cbq_dump_ovl(skb, cl) < 0)
		return -1;
#ifdef CONFIG_NET_CLS_POLICE
	if (cbq_dump_police(skb, cl) < 0)
		return -1;
#endif
	if (cbq_dump_fopt(skb, cl) < 0)
		return -1;
	return 0;
}
1588
/* Qdisc ->dump() hook: emit the root class's attributes nested inside
 * TCA_OPTIONS.
 */
static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct rtattr *rta;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
	if (cbq_dump_attr(skb, &q->link) < 0)
		goto rtattr_failure;
	/* Patch the real nested length now that the attributes are in. */
	rta->rta_len = skb_tail_pointer(skb) - b;
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1606
1607static int
1608cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1609{
1610 struct cbq_sched_data *q = qdisc_priv(sch);
1611
1612 q->link.xstats.avgidle = q->link.avgidle;
1613 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1614}
1615
/* Class ->dump() hook: fill in the tcmsg header for class @arg and
 * nest all of its CBQ attributes inside TCA_OPTIONS.
 */
static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
	       struct sk_buff *skb, struct tcmsg *tcm)
{
	struct cbq_class *cl = (struct cbq_class*)arg;
	unsigned char *b = skb_tail_pointer(skb);
	struct rtattr *rta;

	if (cl->tparent)
		tcm->tcm_parent = cl->tparent->classid;
	else
		tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle = cl->classid;
	tcm->tcm_info = cl->q->handle;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
	if (cbq_dump_attr(skb, cl) < 0)
		goto rtattr_failure;
	/* Patch the real nested length now that the attributes are in. */
	rta->rta_len = skb_tail_pointer(skb) - b;
	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1642
/* Class ->dump_stats() hook: copy basic/rate-estimator/queue stats
 * plus the CBQ-specific xstats for class @arg.
 */
static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
	struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)arg;

	cl->qstats.qlen = cl->q->q.qlen;
	cl->xstats.avgidle = cl->avgidle;
	cl->xstats.undertime = 0;

	/* Report how far in the future the class becomes eligible. */
	if (!PSCHED_IS_PASTPERFECT(cl->undertime))
		cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);

	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
#endif
	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}
1666
/* Class ->graft() hook: replace the leaf qdisc of class @arg with
 * @new (a fresh default pfifo when @new is NULL), returning the old
 * one in @old.  Returns 0 on success, -ENOBUFS if the default qdisc
 * cannot be allocated, -ENOENT if @arg is not a class.
 */
static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	if (cl) {
		if (new == NULL) {
			if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
						     cl->classid)) == NULL)
				return -ENOBUFS;
		} else {
#ifdef CONFIG_NET_CLS_POLICE
			/* Keep the reshape callback consistent with the
			 * class's policing mode. */
			if (cl->police == TC_POLICE_RECLASSIFY)
				new->reshape_fail = cbq_reshape_fail;
#endif
		}
		sch_tree_lock(sch);
		*old = xchg(&cl->q, new);
		/* Propagate the vanished packets up the tree before reset. */
		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
		qdisc_reset(*old);
		sch_tree_unlock(sch);

		return 0;
	}
	return -ENOENT;
}
1693
1694static struct Qdisc *
1695cbq_leaf(struct Qdisc *sch, unsigned long arg)
1696{
1697 struct cbq_class *cl = (struct cbq_class*)arg;
1698
1699 return cl ? cl->q : NULL;
1700}
1701
Jarek Poplawskia37ef2e2006-12-08 00:25:55 -08001702static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1703{
1704 struct cbq_class *cl = (struct cbq_class *)arg;
1705
1706 if (cl->q->q.qlen == 0)
1707 cbq_deactivate_class(cl);
1708}
1709
Linus Torvalds1da177e2005-04-16 15:20:36 -07001710static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1711{
1712 struct cbq_sched_data *q = qdisc_priv(sch);
1713 struct cbq_class *cl = cbq_class_lookup(q, classid);
1714
1715 if (cl) {
1716 cl->refcnt++;
1717 return (unsigned long)cl;
1718 }
1719 return 0;
1720}
1721
1722static void cbq_destroy_filters(struct cbq_class *cl)
1723{
1724 struct tcf_proto *tp;
1725
1726 while ((tp = cl->filter_list) != NULL) {
1727 cl->filter_list = tp->next;
1728 tcf_destroy(tp);
1729 }
1730}
1731
/* Free one class and everything it owns (filters, leaf qdisc, rate
 * table, estimator).  The root class is embedded in the qdisc private
 * area and therefore not kfree'd.
 */
static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	BUG_TRAP(!cl->filters);

	cbq_destroy_filters(cl);
	qdisc_destroy(cl->q);
	qdisc_put_rtab(cl->R_tab);
#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&cl->bstats, &cl->rate_est);
#endif
	if (cl != &q->link)
		kfree(cl);
}
1747
/* Qdisc ->destroy() hook: tear down all classes and their filters. */
static void
cbq_destroy(struct Qdisc* sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	unsigned h;

#ifdef CONFIG_NET_CLS_POLICE
	q->rx_class = NULL;
#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leafs which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
	for (h = 0; h < 16; h++)
		for (cl = q->classes[h]; cl; cl = cl->next)
			cbq_destroy_filters(cl);

	for (h = 0; h < 16; h++) {
		struct cbq_class *next;

		/* Save ->next before the class is freed. */
		for (cl = q->classes[h]; cl; cl = next) {
			next = cl->next;
			cbq_destroy_class(sch, cl);
		}
	}
}
1776
/* Class ->put() hook: drop a reference taken by cbq_get(); destroy
 * the class when the last reference goes away.
 */
static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	if (--cl->refcnt == 0) {
#ifdef CONFIG_NET_CLS_POLICE
		struct cbq_sched_data *q = qdisc_priv(sch);

		/* Clear the cached rx class under the device queue lock
		 * so the enqueue path cannot see a stale pointer. */
		spin_lock_bh(&sch->dev->queue_lock);
		if (q->rx_class == cl)
			q->rx_class = NULL;
		spin_unlock_bh(&sch->dev->queue_lock);
#endif

		cbq_destroy_class(sch, cl);
	}
}
1794
/* Class ->change() hook: modify an existing class (*arg != 0) or
 * create a new one.  All netlink attribute payload sizes are
 * validated up front; for a new class TCA_CBQ_WRROPT, TCA_CBQ_RATE
 * and TCA_CBQ_LSSOPT are mandatory.  On success the (possibly new)
 * class is stored in *arg.
 */
static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
		 unsigned long *arg)
{
	int err;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)*arg;
	struct rtattr *opt = tca[TCA_OPTIONS-1];
	struct rtattr *tb[TCA_CBQ_MAX];
	struct cbq_class *parent;
	struct qdisc_rate_table *rtab = NULL;

	if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt))
		return -EINVAL;

	/* Per-attribute payload size checks. */
	if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
		return -EINVAL;

	if (tb[TCA_CBQ_FOPT-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
		return -EINVAL;

	if (tb[TCA_CBQ_RATE-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
		return -EINVAL;

	if (tb[TCA_CBQ_LSSOPT-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
		return -EINVAL;

	if (tb[TCA_CBQ_WRROPT-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
		return -EINVAL;

#ifdef CONFIG_NET_CLS_POLICE
	if (tb[TCA_CBQ_POLICE-1] &&
	    RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
		return -EINVAL;
#endif

	if (cl) {
		/* Modification of an existing class. */
		/* Check parent */
		if (parentid) {
			if (cl->tparent && cl->tparent->classid != parentid)
				return -EINVAL;
			if (!cl->tparent && parentid != TC_H_ROOT)
				return -EINVAL;
		}

		if (tb[TCA_CBQ_RATE-1]) {
			rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
			if (rtab == NULL)
				return -EINVAL;
		}

		/* Change class parameters */
		sch_tree_lock(sch);

		if (cl->next_alive != NULL)
			cbq_deactivate_class(cl);

		if (rtab) {
			/* Swap in the new rate table, release the old one. */
			rtab = xchg(&cl->R_tab, rtab);
			qdisc_put_rtab(rtab);
		}

		if (tb[TCA_CBQ_LSSOPT-1])
			cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));

		if (tb[TCA_CBQ_WRROPT-1]) {
			/* cbq_set_wrr() re-adds the priority accounting. */
			cbq_rmprio(q, cl);
			cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
		}

		if (tb[TCA_CBQ_OVL_STRATEGY-1])
			cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));

#ifdef CONFIG_NET_CLS_POLICE
		if (tb[TCA_CBQ_POLICE-1])
			cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
#endif

		if (tb[TCA_CBQ_FOPT-1])
			cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));

		if (cl->q->q.qlen)
			cbq_activate_class(cl);

		sch_tree_unlock(sch);

#ifdef CONFIG_NET_ESTIMATOR
		if (tca[TCA_RATE-1])
			gen_replace_estimator(&cl->bstats, &cl->rate_est,
				cl->stats_lock, tca[TCA_RATE-1]);
#endif
		return 0;
	}

	/* Creation of a new class from here on. */
	if (parentid == TC_H_ROOT)
		return -EINVAL;

	if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
	    tb[TCA_CBQ_LSSOPT-1] == NULL)
		return -EINVAL;

	rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
	if (rtab == NULL)
		return -EINVAL;

	if (classid) {
		err = -EINVAL;
		if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
			goto failure;
	} else {
		int i;
		/* No classid supplied: pick a free minor in 0x8001..0xFFFF. */
		classid = TC_H_MAKE(sch->handle,0x8000);

		for (i=0; i<0x8000; i++) {
			if (++q->hgenerator >= 0x8000)
				q->hgenerator = 1;
			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
				break;
		}
		err = -ENOSR;
		if (i >= 0x8000)
			goto failure;
		classid = classid|q->hgenerator;
	}

	parent = &q->link;
	if (parentid) {
		parent = cbq_class_lookup(q, parentid);
		err = -EINVAL;
		if (parent == NULL)
			goto failure;
	}

	err = -ENOBUFS;
	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
	if (cl == NULL)
		goto failure;
	cl->R_tab = rtab;
	rtab = NULL;	/* ownership transferred to cl */
	cl->refcnt = 1;
	if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid)))
		cl->q = &noop_qdisc;
	cl->classid = classid;
	cl->tparent = parent;
	cl->qdisc = sch;
	cl->allot = parent->allot;
	cl->quantum = cl->allot;
	cl->weight = cl->R_tab->rate.rate;
	cl->stats_lock = &sch->dev->queue_lock;

	sch_tree_lock(sch);
	cbq_link_class(cl);
	cl->borrow = cl->tparent;
	if (cl->tparent != &q->link)
		cl->share = cl->tparent;
	cbq_adjust_levels(parent);
	cl->minidle = -0x7FFFFFFF;
	cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
	cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
	/* Inherit unset parameters from the root class. */
	if (cl->ewma_log==0)
		cl->ewma_log = q->link.ewma_log;
	if (cl->maxidle==0)
		cl->maxidle = q->link.maxidle;
	if (cl->avpkt==0)
		cl->avpkt = q->link.avpkt;
	cl->overlimit = cbq_ovl_classic;
	if (tb[TCA_CBQ_OVL_STRATEGY-1])
		cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
#ifdef CONFIG_NET_CLS_POLICE
	if (tb[TCA_CBQ_POLICE-1])
		cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
#endif
	if (tb[TCA_CBQ_FOPT-1])
		cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
	sch_tree_unlock(sch);

#ifdef CONFIG_NET_ESTIMATOR
	if (tca[TCA_RATE-1])
		gen_new_estimator(&cl->bstats, &cl->rate_est,
			cl->stats_lock, tca[TCA_RATE-1]);
#endif

	*arg = (unsigned long)cl;
	return 0;

failure:
	qdisc_put_rtab(rtab);
	return err;
}
1989
/* Class ->delete() hook: remove class @arg from the hierarchy.
 * Fails with -EBUSY while filters or child classes still reference
 * it, or for the root class.
 */
static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class*)arg;
	unsigned int qlen;

	if (cl->filters || cl->children || cl == &q->link)
		return -EBUSY;

	sch_tree_lock(sch);

	/* Flush the leaf and propagate the qlen change up the tree. */
	qlen = cl->q->q.qlen;
	qdisc_reset(cl->q);
	qdisc_tree_decrease_qlen(cl->q, qlen);

	if (cl->next_alive)
		cbq_deactivate_class(cl);

	/* Drop scheduler references to the dying class. */
	if (q->tx_borrowed == cl)
		q->tx_borrowed = q->tx_class;
	if (q->tx_class == cl) {
		q->tx_class = NULL;
		q->tx_borrowed = NULL;
	}
#ifdef CONFIG_NET_CLS_POLICE
	if (q->rx_class == cl)
		q->rx_class = NULL;
#endif

	cbq_unlink_class(cl);
	cbq_adjust_levels(cl->tparent);
	cl->defmap = 0;
	cbq_sync_defmap(cl);

	cbq_rmprio(q, cl);
	sch_tree_unlock(sch);

	if (--cl->refcnt == 0)
		cbq_destroy_class(sch, cl);

	return 0;
}
2032
2033static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2034{
2035 struct cbq_sched_data *q = qdisc_priv(sch);
2036 struct cbq_class *cl = (struct cbq_class *)arg;
2037
2038 if (cl == NULL)
2039 cl = &q->link;
2040
2041 return &cl->filter_list;
2042}
2043
2044static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2045 u32 classid)
2046{
2047 struct cbq_sched_data *q = qdisc_priv(sch);
2048 struct cbq_class *p = (struct cbq_class*)parent;
2049 struct cbq_class *cl = cbq_class_lookup(q, classid);
2050
2051 if (cl) {
2052 if (p && p->level <= cl->level)
2053 return 0;
2054 cl->filters++;
2055 return (unsigned long)cl;
2056 }
2057 return 0;
2058}
2059
/* Class ->unbind_tcf() hook: drop the filter reference taken by
 * cbq_bind_filter().
 */
static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class*)arg;

	cl->filters--;
}
2066
/* Class ->walk() hook: invoke arg->fn for every class, honouring the
 * skip/count/stop protocol of struct qdisc_walker.
 */
static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	unsigned h;

	if (arg->stop)
		return;

	for (h = 0; h < 16; h++) {
		struct cbq_class *cl;

		for (cl = q->classes[h]; cl; cl = cl->next) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}
2091
/* Per-class operations exported through the qdisc class API. */
static struct Qdisc_class_ops cbq_class_ops = {
	.graft = cbq_graft,
	.leaf = cbq_leaf,
	.qlen_notify = cbq_qlen_notify,
	.get = cbq_get,
	.put = cbq_put,
	.change = cbq_change_class,
	.delete = cbq_delete,
	.walk = cbq_walk,
	.tcf_chain = cbq_find_tcf,
	.bind_tcf = cbq_bind_filter,
	.unbind_tcf = cbq_unbind_filter,
	.dump = cbq_dump_class,
	.dump_stats = cbq_dump_class_stats,
};
2107
/* Qdisc operations table: the entry points wired into the packet
 * scheduler core for the "cbq" discipline. */
static struct Qdisc_ops cbq_qdisc_ops = {
	.next = NULL,
	.cl_ops = &cbq_class_ops,
	.id = "cbq",
	.priv_size = sizeof(struct cbq_sched_data),
	.enqueue = cbq_enqueue,
	.dequeue = cbq_dequeue,
	.requeue = cbq_requeue,
	.drop = cbq_drop,
	.init = cbq_init,
	.reset = cbq_reset,
	.destroy = cbq_destroy,
	.change = NULL,
	.dump = cbq_dump,
	.dump_stats = cbq_dump_stats,
	.owner = THIS_MODULE,
};
2125
/* Register the CBQ qdisc with the packet scheduler core on load. */
static int __init cbq_module_init(void)
{
	return register_qdisc(&cbq_qdisc_ops);
}
/* Unregister the CBQ qdisc on module unload. */
static void __exit cbq_module_exit(void)
{
	unregister_qdisc(&cbq_qdisc_ops);
}
module_init(cbq_module_init)
module_exit(cbq_module_exit)
MODULE_LICENSE("GPL");