blob: 0b6489291140f0324c73e8a8ddb0aad716c03593 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
Linus Torvalds1da177e2005-04-16 15:20:36 -070018#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/sched.h>
22#include <linux/string.h>
23#include <linux/mm.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/in.h>
27#include <linux/errno.h>
28#include <linux/interrupt.h>
29#include <linux/netdevice.h>
30#include <linux/skbuff.h>
31#include <linux/rtnetlink.h>
32#include <linux/init.h>
33#include <linux/proc_fs.h>
34#include <linux/seq_file.h>
35#include <linux/kmod.h>
36#include <linux/list.h>
37#include <linux/bitops.h>
38
39#include <net/sock.h>
40#include <net/pkt_sched.h>
41
42#include <asm/processor.h>
43#include <asm/uaccess.h>
44#include <asm/system.h>
45
46static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
47 struct Qdisc *old, struct Qdisc *new);
48static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
49 struct Qdisc *q, unsigned long cl, int event);
50
51/*
52
53 Short review.
54 -------------
55
56 This file consists of two interrelated parts:
57
58 1. queueing disciplines manager frontend.
59 2. traffic classes manager frontend.
60
61 Generally, queueing discipline ("qdisc") is a black box,
62 which is able to enqueue packets and to dequeue them (when
63 device is ready to send something) in order and at times
64 determined by algorithm hidden in it.
65
66 qdisc's are divided to two categories:
67 - "queues", which have no internal structure visible from outside.
68 - "schedulers", which split all the packets to "traffic classes",
69 using "packet classifiers" (look at cls_api.c)
70
71 In turn, classes may have child qdiscs (as rule, queues)
72 attached to them etc. etc. etc.
73
74 The goal of the routines in this file is to translate
75 information supplied by user in the form of handles
76 to more intelligible for kernel form, to make some sanity
77 checks and part of work, which is common to all qdiscs
78 and to provide rtnetlink notifications.
79
80 All real intelligent work is done inside qdisc modules.
81
82
83
84 Every discipline has two major routines: enqueue and dequeue.
85
86 ---dequeue
87
88 dequeue usually returns a skb to send. It is allowed to return NULL,
89 but it does not mean that queue is empty, it just means that
90 discipline does not want to send anything this time.
91 Queue is really empty if q->q.qlen == 0.
92 For complicated disciplines with multiple queues q->q is not
93 real packet queue, but however q->q.qlen must be valid.
94
95 ---enqueue
96
97 enqueue returns 0, if packet was enqueued successfully.
98 If packet (this one or another one) was dropped, it returns
99 not zero error code.
100 NET_XMIT_DROP - this packet dropped
101 Expected action: do not backoff, but wait until queue will clear.
102 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
103 Expected action: backoff or ignore
104 NET_XMIT_POLICED - dropped by police.
105 Expected action: backoff or error to real-time apps.
106
107 Auxiliary routines:
108
109 ---requeue
110
111 requeues once dequeued packet. It is used for non-standard or
112 just buggy devices, which can defer output even if dev->tbusy=0.
113
114 ---reset
115
116 returns qdisc to initial state: purge all buffers, clear all
117 timers, counters (except for statistics) etc.
118
119 ---init
120
121 initializes newly created qdisc.
122
123 ---destroy
124
125 destroys resources allocated by init and during lifetime of qdisc.
126
127 ---change
128
129 changes qdisc parameters.
130 */
131
132/* Protects list of registered TC modules. It is pure SMP lock. */
133static DEFINE_RWLOCK(qdisc_mod_lock);
134
135
136/************************************************
137 * Queueing disciplines manipulation. *
138 ************************************************/
139
140
141/* The list of all installed queueing disciplines. */
142
143static struct Qdisc_ops *qdisc_base;
144
145/* Register/uregister queueing discipline */
146
147int register_qdisc(struct Qdisc_ops *qops)
148{
149 struct Qdisc_ops *q, **qp;
150 int rc = -EEXIST;
151
152 write_lock(&qdisc_mod_lock);
153 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
154 if (!strcmp(qops->id, q->id))
155 goto out;
156
157 if (qops->enqueue == NULL)
158 qops->enqueue = noop_qdisc_ops.enqueue;
159 if (qops->requeue == NULL)
160 qops->requeue = noop_qdisc_ops.requeue;
161 if (qops->dequeue == NULL)
162 qops->dequeue = noop_qdisc_ops.dequeue;
163
164 qops->next = NULL;
165 *qp = qops;
166 rc = 0;
167out:
168 write_unlock(&qdisc_mod_lock);
169 return rc;
170}
171
172int unregister_qdisc(struct Qdisc_ops *qops)
173{
174 struct Qdisc_ops *q, **qp;
175 int err = -ENOENT;
176
177 write_lock(&qdisc_mod_lock);
178 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179 if (q == qops)
180 break;
181 if (q) {
182 *qp = q->next;
183 q->next = NULL;
184 err = 0;
185 }
186 write_unlock(&qdisc_mod_lock);
187 return err;
188}
189
190/* We know handle. Find qdisc among all qdisc's attached to device
191 (root qdisc, all its children, children of children etc.)
192 */
193
194struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
195{
196 struct Qdisc *q;
197
Patrick McHardy85670cc2006-09-27 16:45:45 -0700198 read_lock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700199 list_for_each_entry(q, &dev->qdisc_list, list) {
200 if (q->handle == handle) {
Patrick McHardy85670cc2006-09-27 16:45:45 -0700201 read_unlock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700202 return q;
203 }
204 }
Patrick McHardy85670cc2006-09-27 16:45:45 -0700205 read_unlock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206 return NULL;
207}
208
209static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
210{
211 unsigned long cl;
212 struct Qdisc *leaf;
213 struct Qdisc_class_ops *cops = p->ops->cl_ops;
214
215 if (cops == NULL)
216 return NULL;
217 cl = cops->get(p, classid);
218
219 if (cl == 0)
220 return NULL;
221 leaf = cops->leaf(p, cl);
222 cops->put(p, cl);
223 return leaf;
224}
225
226/* Find queueing discipline by name */
227
228static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
229{
230 struct Qdisc_ops *q = NULL;
231
232 if (kind) {
233 read_lock(&qdisc_mod_lock);
234 for (q = qdisc_base; q; q = q->next) {
235 if (rtattr_strcmp(kind, q->id) == 0) {
236 if (!try_module_get(q->owner))
237 q = NULL;
238 break;
239 }
240 }
241 read_unlock(&qdisc_mod_lock);
242 }
243 return q;
244}
245
246static struct qdisc_rate_table *qdisc_rtab_list;
247
248struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
249{
250 struct qdisc_rate_table *rtab;
251
252 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
253 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
254 rtab->refcnt++;
255 return rtab;
256 }
257 }
258
259 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
260 return NULL;
261
262 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
263 if (rtab) {
264 rtab->rate = *r;
265 rtab->refcnt = 1;
266 memcpy(rtab->data, RTA_DATA(tab), 1024);
267 rtab->next = qdisc_rtab_list;
268 qdisc_rtab_list = rtab;
269 }
270 return rtab;
271}
272
273void qdisc_put_rtab(struct qdisc_rate_table *tab)
274{
275 struct qdisc_rate_table *rtab, **rtabp;
276
277 if (!tab || --tab->refcnt)
278 return;
279
280 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
281 if (rtab == tab) {
282 *rtabp = rtab->next;
283 kfree(rtab);
284 return;
285 }
286 }
287}
288
289
290/* Allocate an unique handle from space managed by kernel */
291
292static u32 qdisc_alloc_handle(struct net_device *dev)
293{
294 int i = 0x10000;
295 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
296
297 do {
298 autohandle += TC_H_MAKE(0x10000U, 0);
299 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
300 autohandle = TC_H_MAKE(0x80000000U, 0);
301 } while (qdisc_lookup(dev, autohandle) && --i > 0);
302
303 return i>0 ? autohandle : 0;
304}
305
306/* Attach toplevel qdisc to device dev */
307
308static struct Qdisc *
309dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
310{
311 struct Qdisc *oqdisc;
312
313 if (dev->flags & IFF_UP)
314 dev_deactivate(dev);
315
316 qdisc_lock_tree(dev);
317 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
318 oqdisc = dev->qdisc_ingress;
319 /* Prune old scheduler */
320 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
321 /* delete */
322 qdisc_reset(oqdisc);
323 dev->qdisc_ingress = NULL;
324 } else { /* new */
325 dev->qdisc_ingress = qdisc;
326 }
327
328 } else {
329
330 oqdisc = dev->qdisc_sleeping;
331
332 /* Prune old scheduler */
333 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
334 qdisc_reset(oqdisc);
335
336 /* ... and graft new one */
337 if (qdisc == NULL)
338 qdisc = &noop_qdisc;
339 dev->qdisc_sleeping = qdisc;
340 dev->qdisc = &noop_qdisc;
341 }
342
343 qdisc_unlock_tree(dev);
344
345 if (dev->flags & IFF_UP)
346 dev_activate(dev);
347
348 return oqdisc;
349}
350
351
352/* Graft qdisc "new" to class "classid" of qdisc "parent" or
353 to device "dev".
354
355 Old qdisc is not destroyed but returned in *old.
356 */
357
358static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
359 u32 classid,
360 struct Qdisc *new, struct Qdisc **old)
361{
362 int err = 0;
363 struct Qdisc *q = *old;
364
365
366 if (parent == NULL) {
367 if (q && q->flags&TCQ_F_INGRESS) {
368 *old = dev_graft_qdisc(dev, q);
369 } else {
370 *old = dev_graft_qdisc(dev, new);
371 }
372 } else {
373 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
374
375 err = -EINVAL;
376
377 if (cops) {
378 unsigned long cl = cops->get(parent, classid);
379 if (cl) {
380 err = cops->graft(parent, cl, new, old);
381 if (new)
382 new->parent = classid;
383 cops->put(parent, cl);
384 }
385 }
386 }
387 return err;
388}
389
390/*
391 Allocate and initialize new qdisc.
392
393 Parameters are passed via opt.
394 */
395
396static struct Qdisc *
397qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
398{
399 int err;
400 struct rtattr *kind = tca[TCA_KIND-1];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700401 struct Qdisc *sch;
402 struct Qdisc_ops *ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700403
404 ops = qdisc_lookup_ops(kind);
405#ifdef CONFIG_KMOD
406 if (ops == NULL && kind != NULL) {
407 char name[IFNAMSIZ];
408 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
409 /* We dropped the RTNL semaphore in order to
410 * perform the module load. So, even if we
411 * succeeded in loading the module we have to
412 * tell the caller to replay the request. We
413 * indicate this using -EAGAIN.
414 * We replay the request because the device may
415 * go away in the mean time.
416 */
417 rtnl_unlock();
418 request_module("sch_%s", name);
419 rtnl_lock();
420 ops = qdisc_lookup_ops(kind);
421 if (ops != NULL) {
422 /* We will try again qdisc_lookup_ops,
423 * so don't keep a reference.
424 */
425 module_put(ops->owner);
426 err = -EAGAIN;
427 goto err_out;
428 }
429 }
430 }
431#endif
432
Jamal Hadi Salimb9e2cc02006-08-03 16:36:51 -0700433 err = -ENOENT;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700434 if (ops == NULL)
435 goto err_out;
436
Thomas Graf3d54b822005-07-05 14:15:09 -0700437 sch = qdisc_alloc(dev, ops);
438 if (IS_ERR(sch)) {
439 err = PTR_ERR(sch);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700440 goto err_out2;
Thomas Graf3d54b822005-07-05 14:15:09 -0700441 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700442
Thomas Graf3d54b822005-07-05 14:15:09 -0700443 if (handle == TC_H_INGRESS) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700444 sch->flags |= TCQ_F_INGRESS;
Thomas Graf3d54b822005-07-05 14:15:09 -0700445 handle = TC_H_MAKE(TC_H_INGRESS, 0);
446 } else if (handle == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700447 handle = qdisc_alloc_handle(dev);
448 err = -ENOMEM;
449 if (handle == 0)
450 goto err_out3;
451 }
452
Thomas Graf3d54b822005-07-05 14:15:09 -0700453 sch->handle = handle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700454
455 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
Thomas Graf023e09a2005-07-05 14:15:53 -0700456#ifdef CONFIG_NET_ESTIMATOR
457 if (tca[TCA_RATE-1]) {
458 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
459 sch->stats_lock,
460 tca[TCA_RATE-1]);
461 if (err) {
462 /*
463 * Any broken qdiscs that would require
464 * a ops->reset() here? The qdisc was never
465 * in action so it shouldn't be necessary.
466 */
467 if (ops->destroy)
468 ops->destroy(sch);
469 goto err_out3;
470 }
471 }
472#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700473 qdisc_lock_tree(dev);
474 list_add_tail(&sch->list, &dev->qdisc_list);
475 qdisc_unlock_tree(dev);
476
Linus Torvalds1da177e2005-04-16 15:20:36 -0700477 return sch;
478 }
479err_out3:
480 dev_put(dev);
Thomas Graf3d54b822005-07-05 14:15:09 -0700481 kfree((char *) sch - sch->padded);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700482err_out2:
483 module_put(ops->owner);
484err_out:
485 *errp = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486 return NULL;
487}
488
489static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
490{
491 if (tca[TCA_OPTIONS-1]) {
492 int err;
493
494 if (sch->ops->change == NULL)
495 return -EINVAL;
496 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
497 if (err)
498 return err;
499 }
500#ifdef CONFIG_NET_ESTIMATOR
501 if (tca[TCA_RATE-1])
502 gen_replace_estimator(&sch->bstats, &sch->rate_est,
503 sch->stats_lock, tca[TCA_RATE-1]);
504#endif
505 return 0;
506}
507
508struct check_loop_arg
509{
510 struct qdisc_walker w;
511 struct Qdisc *p;
512 int depth;
513};
514
515static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
516
517static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
518{
519 struct check_loop_arg arg;
520
521 if (q->ops->cl_ops == NULL)
522 return 0;
523
524 arg.w.stop = arg.w.skip = arg.w.count = 0;
525 arg.w.fn = check_loop_fn;
526 arg.depth = depth;
527 arg.p = p;
528 q->ops->cl_ops->walk(q, &arg.w);
529 return arg.w.stop ? -ELOOP : 0;
530}
531
532static int
533check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
534{
535 struct Qdisc *leaf;
536 struct Qdisc_class_ops *cops = q->ops->cl_ops;
537 struct check_loop_arg *arg = (struct check_loop_arg *)w;
538
539 leaf = cops->leaf(q, cl);
540 if (leaf) {
541 if (leaf == arg->p || arg->depth > 7)
542 return -ELOOP;
543 return check_loop(leaf, arg->p, arg->depth + 1);
544 }
545 return 0;
546}
547
548/*
549 * Delete/get qdisc.
550 */
551
552static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
553{
554 struct tcmsg *tcm = NLMSG_DATA(n);
555 struct rtattr **tca = arg;
556 struct net_device *dev;
557 u32 clid = tcm->tcm_parent;
558 struct Qdisc *q = NULL;
559 struct Qdisc *p = NULL;
560 int err;
561
562 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
563 return -ENODEV;
564
565 if (clid) {
566 if (clid != TC_H_ROOT) {
567 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
568 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
569 return -ENOENT;
570 q = qdisc_leaf(p, clid);
571 } else { /* ingress */
572 q = dev->qdisc_ingress;
573 }
574 } else {
575 q = dev->qdisc_sleeping;
576 }
577 if (!q)
578 return -ENOENT;
579
580 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
581 return -EINVAL;
582 } else {
583 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
584 return -ENOENT;
585 }
586
587 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
588 return -EINVAL;
589
590 if (n->nlmsg_type == RTM_DELQDISC) {
591 if (!clid)
592 return -EINVAL;
593 if (q->handle == 0)
594 return -ENOENT;
595 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
596 return err;
597 if (q) {
598 qdisc_notify(skb, n, clid, q, NULL);
599 spin_lock_bh(&dev->queue_lock);
600 qdisc_destroy(q);
601 spin_unlock_bh(&dev->queue_lock);
602 }
603 } else {
604 qdisc_notify(skb, n, clid, NULL, q);
605 }
606 return 0;
607}
608
609/*
610 Create/change qdisc.
611 */
612
613static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
614{
615 struct tcmsg *tcm;
616 struct rtattr **tca;
617 struct net_device *dev;
618 u32 clid;
619 struct Qdisc *q, *p;
620 int err;
621
622replay:
623 /* Reinit, just in case something touches this. */
624 tcm = NLMSG_DATA(n);
625 tca = arg;
626 clid = tcm->tcm_parent;
627 q = p = NULL;
628
629 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
630 return -ENODEV;
631
632 if (clid) {
633 if (clid != TC_H_ROOT) {
634 if (clid != TC_H_INGRESS) {
635 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
636 return -ENOENT;
637 q = qdisc_leaf(p, clid);
638 } else { /*ingress */
639 q = dev->qdisc_ingress;
640 }
641 } else {
642 q = dev->qdisc_sleeping;
643 }
644
645 /* It may be default qdisc, ignore it */
646 if (q && q->handle == 0)
647 q = NULL;
648
649 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
650 if (tcm->tcm_handle) {
651 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
652 return -EEXIST;
653 if (TC_H_MIN(tcm->tcm_handle))
654 return -EINVAL;
655 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
656 goto create_n_graft;
657 if (n->nlmsg_flags&NLM_F_EXCL)
658 return -EEXIST;
659 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
660 return -EINVAL;
661 if (q == p ||
662 (p && check_loop(q, p, 0)))
663 return -ELOOP;
664 atomic_inc(&q->refcnt);
665 goto graft;
666 } else {
667 if (q == NULL)
668 goto create_n_graft;
669
670 /* This magic test requires explanation.
671 *
672 * We know, that some child q is already
673 * attached to this parent and have choice:
674 * either to change it or to create/graft new one.
675 *
676 * 1. We are allowed to create/graft only
677 * if CREATE and REPLACE flags are set.
678 *
679 * 2. If EXCL is set, requestor wanted to say,
680 * that qdisc tcm_handle is not expected
681 * to exist, so that we choose create/graft too.
682 *
683 * 3. The last case is when no flags are set.
684 * Alas, it is sort of hole in API, we
685 * cannot decide what to do unambiguously.
686 * For now we select create/graft, if
687 * user gave KIND, which does not match existing.
688 */
689 if ((n->nlmsg_flags&NLM_F_CREATE) &&
690 (n->nlmsg_flags&NLM_F_REPLACE) &&
691 ((n->nlmsg_flags&NLM_F_EXCL) ||
692 (tca[TCA_KIND-1] &&
693 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
694 goto create_n_graft;
695 }
696 }
697 } else {
698 if (!tcm->tcm_handle)
699 return -EINVAL;
700 q = qdisc_lookup(dev, tcm->tcm_handle);
701 }
702
703 /* Change qdisc parameters */
704 if (q == NULL)
705 return -ENOENT;
706 if (n->nlmsg_flags&NLM_F_EXCL)
707 return -EEXIST;
708 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
709 return -EINVAL;
710 err = qdisc_change(q, tca);
711 if (err == 0)
712 qdisc_notify(skb, n, clid, NULL, q);
713 return err;
714
715create_n_graft:
716 if (!(n->nlmsg_flags&NLM_F_CREATE))
717 return -ENOENT;
718 if (clid == TC_H_INGRESS)
719 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
720 else
721 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
722 if (q == NULL) {
723 if (err == -EAGAIN)
724 goto replay;
725 return err;
726 }
727
728graft:
729 if (1) {
730 struct Qdisc *old_q = NULL;
731 err = qdisc_graft(dev, p, clid, q, &old_q);
732 if (err) {
733 if (q) {
734 spin_lock_bh(&dev->queue_lock);
735 qdisc_destroy(q);
736 spin_unlock_bh(&dev->queue_lock);
737 }
738 return err;
739 }
740 qdisc_notify(skb, n, clid, old_q, q);
741 if (old_q) {
742 spin_lock_bh(&dev->queue_lock);
743 qdisc_destroy(old_q);
744 spin_unlock_bh(&dev->queue_lock);
745 }
746 }
747 return 0;
748}
749
750static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700751 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700752{
753 struct tcmsg *tcm;
754 struct nlmsghdr *nlh;
755 unsigned char *b = skb->tail;
756 struct gnet_dump d;
757
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700758 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700759 tcm = NLMSG_DATA(nlh);
760 tcm->tcm_family = AF_UNSPEC;
Patrick McHardy9ef1d4c2005-06-28 12:55:30 -0700761 tcm->tcm__pad1 = 0;
762 tcm->tcm__pad2 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700763 tcm->tcm_ifindex = q->dev->ifindex;
764 tcm->tcm_parent = clid;
765 tcm->tcm_handle = q->handle;
766 tcm->tcm_info = atomic_read(&q->refcnt);
767 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
768 if (q->ops->dump && q->ops->dump(q, skb) < 0)
769 goto rtattr_failure;
770 q->qstats.qlen = q->q.qlen;
771
772 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
773 TCA_XSTATS, q->stats_lock, &d) < 0)
774 goto rtattr_failure;
775
776 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
777 goto rtattr_failure;
778
779 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
780#ifdef CONFIG_NET_ESTIMATOR
781 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
782#endif
783 gnet_stats_copy_queue(&d, &q->qstats) < 0)
784 goto rtattr_failure;
785
786 if (gnet_stats_finish_copy(&d) < 0)
787 goto rtattr_failure;
788
789 nlh->nlmsg_len = skb->tail - b;
790 return skb->len;
791
792nlmsg_failure:
793rtattr_failure:
794 skb_trim(skb, b - skb->data);
795 return -1;
796}
797
798static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
799 u32 clid, struct Qdisc *old, struct Qdisc *new)
800{
801 struct sk_buff *skb;
802 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
803
804 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
805 if (!skb)
806 return -ENOBUFS;
807
808 if (old && old->handle) {
809 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
810 goto err_out;
811 }
812 if (new) {
813 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
814 goto err_out;
815 }
816
817 if (skb->len)
Patrick McHardyac6d4392005-08-14 19:29:52 -0700818 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700819
820err_out:
821 kfree_skb(skb);
822 return -EINVAL;
823}
824
825static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
826{
827 int idx, q_idx;
828 int s_idx, s_q_idx;
829 struct net_device *dev;
830 struct Qdisc *q;
831
832 s_idx = cb->args[0];
833 s_q_idx = q_idx = cb->args[1];
834 read_lock(&dev_base_lock);
835 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
836 if (idx < s_idx)
837 continue;
838 if (idx > s_idx)
839 s_q_idx = 0;
Patrick McHardy85670cc2006-09-27 16:45:45 -0700840 read_lock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700841 q_idx = 0;
842 list_for_each_entry(q, &dev->qdisc_list, list) {
843 if (q_idx < s_q_idx) {
844 q_idx++;
845 continue;
846 }
847 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
848 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
Patrick McHardy85670cc2006-09-27 16:45:45 -0700849 read_unlock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700850 goto done;
851 }
852 q_idx++;
853 }
Patrick McHardy85670cc2006-09-27 16:45:45 -0700854 read_unlock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700855 }
856
857done:
858 read_unlock(&dev_base_lock);
859
860 cb->args[0] = idx;
861 cb->args[1] = q_idx;
862
863 return skb->len;
864}
865
866
867
868/************************************************
869 * Traffic classes manipulation. *
870 ************************************************/
871
872
873
874static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
875{
876 struct tcmsg *tcm = NLMSG_DATA(n);
877 struct rtattr **tca = arg;
878 struct net_device *dev;
879 struct Qdisc *q = NULL;
880 struct Qdisc_class_ops *cops;
881 unsigned long cl = 0;
882 unsigned long new_cl;
883 u32 pid = tcm->tcm_parent;
884 u32 clid = tcm->tcm_handle;
885 u32 qid = TC_H_MAJ(clid);
886 int err;
887
888 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
889 return -ENODEV;
890
891 /*
892 parent == TC_H_UNSPEC - unspecified parent.
893 parent == TC_H_ROOT - class is root, which has no parent.
894 parent == X:0 - parent is root class.
895 parent == X:Y - parent is a node in hierarchy.
896 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
897
898 handle == 0:0 - generate handle from kernel pool.
899 handle == 0:Y - class is X:Y, where X:0 is qdisc.
900 handle == X:Y - clear.
901 handle == X:0 - root class.
902 */
903
904 /* Step 1. Determine qdisc handle X:0 */
905
906 if (pid != TC_H_ROOT) {
907 u32 qid1 = TC_H_MAJ(pid);
908
909 if (qid && qid1) {
910 /* If both majors are known, they must be identical. */
911 if (qid != qid1)
912 return -EINVAL;
913 } else if (qid1) {
914 qid = qid1;
915 } else if (qid == 0)
916 qid = dev->qdisc_sleeping->handle;
917
918 /* Now qid is genuine qdisc handle consistent
919 both with parent and child.
920
921 TC_H_MAJ(pid) still may be unspecified, complete it now.
922 */
923 if (pid)
924 pid = TC_H_MAKE(qid, pid);
925 } else {
926 if (qid == 0)
927 qid = dev->qdisc_sleeping->handle;
928 }
929
930 /* OK. Locate qdisc */
931 if ((q = qdisc_lookup(dev, qid)) == NULL)
932 return -ENOENT;
933
934 /* An check that it supports classes */
935 cops = q->ops->cl_ops;
936 if (cops == NULL)
937 return -EINVAL;
938
939 /* Now try to get class */
940 if (clid == 0) {
941 if (pid == TC_H_ROOT)
942 clid = qid;
943 } else
944 clid = TC_H_MAKE(qid, clid);
945
946 if (clid)
947 cl = cops->get(q, clid);
948
949 if (cl == 0) {
950 err = -ENOENT;
951 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
952 goto out;
953 } else {
954 switch (n->nlmsg_type) {
955 case RTM_NEWTCLASS:
956 err = -EEXIST;
957 if (n->nlmsg_flags&NLM_F_EXCL)
958 goto out;
959 break;
960 case RTM_DELTCLASS:
961 err = cops->delete(q, cl);
962 if (err == 0)
963 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
964 goto out;
965 case RTM_GETTCLASS:
966 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
967 goto out;
968 default:
969 err = -EINVAL;
970 goto out;
971 }
972 }
973
974 new_cl = cl;
975 err = cops->change(q, clid, pid, tca, &new_cl);
976 if (err == 0)
977 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
978
979out:
980 if (cl)
981 cops->put(q, cl);
982
983 return err;
984}
985
986
987static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
988 unsigned long cl,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700989 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700990{
991 struct tcmsg *tcm;
992 struct nlmsghdr *nlh;
993 unsigned char *b = skb->tail;
994 struct gnet_dump d;
995 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
996
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700997 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700998 tcm = NLMSG_DATA(nlh);
999 tcm->tcm_family = AF_UNSPEC;
1000 tcm->tcm_ifindex = q->dev->ifindex;
1001 tcm->tcm_parent = q->handle;
1002 tcm->tcm_handle = q->handle;
1003 tcm->tcm_info = 0;
1004 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1005 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1006 goto rtattr_failure;
1007
1008 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1009 TCA_XSTATS, q->stats_lock, &d) < 0)
1010 goto rtattr_failure;
1011
1012 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1013 goto rtattr_failure;
1014
1015 if (gnet_stats_finish_copy(&d) < 0)
1016 goto rtattr_failure;
1017
1018 nlh->nlmsg_len = skb->tail - b;
1019 return skb->len;
1020
1021nlmsg_failure:
1022rtattr_failure:
1023 skb_trim(skb, b - skb->data);
1024 return -1;
1025}
1026
1027static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1028 struct Qdisc *q, unsigned long cl, int event)
1029{
1030 struct sk_buff *skb;
1031 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1032
1033 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1034 if (!skb)
1035 return -ENOBUFS;
1036
1037 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1038 kfree_skb(skb);
1039 return -EINVAL;
1040 }
1041
Patrick McHardyac6d4392005-08-14 19:29:52 -07001042 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001043}
1044
1045struct qdisc_dump_args
1046{
1047 struct qdisc_walker w;
1048 struct sk_buff *skb;
1049 struct netlink_callback *cb;
1050};
1051
1052static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1053{
1054 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1055
1056 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1057 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1058}
1059
1060static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1061{
1062 int t;
1063 int s_t;
1064 struct net_device *dev;
1065 struct Qdisc *q;
1066 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1067 struct qdisc_dump_args arg;
1068
1069 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1070 return 0;
1071 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1072 return 0;
1073
1074 s_t = cb->args[0];
1075 t = 0;
1076
Patrick McHardy85670cc2006-09-27 16:45:45 -07001077 read_lock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001078 list_for_each_entry(q, &dev->qdisc_list, list) {
1079 if (t < s_t || !q->ops->cl_ops ||
1080 (tcm->tcm_parent &&
1081 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1082 t++;
1083 continue;
1084 }
1085 if (t > s_t)
1086 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1087 arg.w.fn = qdisc_class_dump;
1088 arg.skb = skb;
1089 arg.cb = cb;
1090 arg.w.stop = 0;
1091 arg.w.skip = cb->args[1];
1092 arg.w.count = 0;
1093 q->ops->cl_ops->walk(q, &arg.w);
1094 cb->args[1] = arg.w.count;
1095 if (arg.w.stop)
1096 break;
1097 t++;
1098 }
Patrick McHardy85670cc2006-09-27 16:45:45 -07001099 read_unlock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001100
1101 cb->args[0] = t;
1102
1103 dev_put(dev);
1104 return skb->len;
1105}
1106
1107/* Main classifier routine: scans classifier chain attached
1108 to this qdisc, (optionally) tests for protocol and asks
1109 specific classifiers.
1110 */
1111int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1112 struct tcf_result *res)
1113{
1114 int err = 0;
1115 u32 protocol = skb->protocol;
1116#ifdef CONFIG_NET_CLS_ACT
1117 struct tcf_proto *otp = tp;
1118reclassify:
1119#endif
1120 protocol = skb->protocol;
1121
1122 for ( ; tp; tp = tp->next) {
1123 if ((tp->protocol == protocol ||
1124 tp->protocol == __constant_htons(ETH_P_ALL)) &&
1125 (err = tp->classify(skb, tp, res)) >= 0) {
1126#ifdef CONFIG_NET_CLS_ACT
1127 if ( TC_ACT_RECLASSIFY == err) {
1128 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1129 tp = otp;
1130
1131 if (MAX_REC_LOOP < verd++) {
1132 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1133 tp->prio&0xffff, ntohs(tp->protocol));
1134 return TC_ACT_SHOT;
1135 }
1136 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1137 goto reclassify;
1138 } else {
1139 if (skb->tc_verd)
1140 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1141 return err;
1142 }
1143#else
1144
1145 return err;
1146#endif
1147 }
1148
1149 }
1150 return -1;
1151}
1152
1153static int psched_us_per_tick = 1;
1154static int psched_tick_per_us = 1;
1155
1156#ifdef CONFIG_PROC_FS
1157static int psched_show(struct seq_file *seq, void *v)
1158{
1159 seq_printf(seq, "%08x %08x %08x %08x\n",
1160 psched_tick_per_us, psched_us_per_tick,
1161 1000000, HZ);
1162
1163 return 0;
1164}
1165
1166static int psched_open(struct inode *inode, struct file *file)
1167{
1168 return single_open(file, psched_show, PDE(inode)->data);
1169}
1170
1171static struct file_operations psched_fops = {
1172 .owner = THIS_MODULE,
1173 .open = psched_open,
1174 .read = seq_read,
1175 .llseek = seq_lseek,
1176 .release = single_release,
1177};
1178#endif
1179
1180#ifdef CONFIG_NET_SCH_CLK_CPU
1181psched_tdiff_t psched_clock_per_hz;
1182int psched_clock_scale;
1183EXPORT_SYMBOL(psched_clock_per_hz);
1184EXPORT_SYMBOL(psched_clock_scale);
1185
1186psched_time_t psched_time_base;
1187cycles_t psched_time_mark;
1188EXPORT_SYMBOL(psched_time_mark);
1189EXPORT_SYMBOL(psched_time_base);
1190
1191/*
1192 * Periodically adjust psched_time_base to avoid overflow
1193 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
1194 */
1195static void psched_tick(unsigned long);
Ingo Molnar8d06afa2005-09-09 13:10:40 -07001196static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001197
1198static void psched_tick(unsigned long dummy)
1199{
1200 if (sizeof(cycles_t) == sizeof(u32)) {
1201 psched_time_t dummy_stamp;
1202 PSCHED_GET_TIME(dummy_stamp);
1203 psched_timer.expires = jiffies + 1*HZ;
1204 add_timer(&psched_timer);
1205 }
1206}
1207
1208int __init psched_calibrate_clock(void)
1209{
1210 psched_time_t stamp, stamp1;
1211 struct timeval tv, tv1;
1212 psched_tdiff_t delay;
1213 long rdelay;
1214 unsigned long stop;
1215
1216 psched_tick(0);
1217 stop = jiffies + HZ/10;
1218 PSCHED_GET_TIME(stamp);
1219 do_gettimeofday(&tv);
1220 while (time_before(jiffies, stop)) {
1221 barrier();
1222 cpu_relax();
1223 }
1224 PSCHED_GET_TIME(stamp1);
1225 do_gettimeofday(&tv1);
1226
1227 delay = PSCHED_TDIFF(stamp1, stamp);
1228 rdelay = tv1.tv_usec - tv.tv_usec;
1229 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1230 if (rdelay > delay)
1231 return -1;
1232 delay /= rdelay;
1233 psched_tick_per_us = delay;
1234 while ((delay>>=1) != 0)
1235 psched_clock_scale++;
1236 psched_us_per_tick = 1<<psched_clock_scale;
1237 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1238 return 0;
1239}
1240#endif
1241
1242static int __init pktsched_init(void)
1243{
1244 struct rtnetlink_link *link_p;
1245
1246#ifdef CONFIG_NET_SCH_CLK_CPU
1247 if (psched_calibrate_clock() < 0)
1248 return -1;
1249#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1250 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1251 psched_us_per_tick = 1000000;
1252#endif
1253
1254 link_p = rtnetlink_links[PF_UNSPEC];
1255
1256 /* Setup rtnetlink links. It is made here to avoid
1257 exporting large number of public symbols.
1258 */
1259
1260 if (link_p) {
1261 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1262 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1263 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1264 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1265 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1266 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1267 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1268 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1269 }
1270
1271 register_qdisc(&pfifo_qdisc_ops);
1272 register_qdisc(&bfifo_qdisc_ops);
1273 proc_net_fops_create("psched", 0, &psched_fops);
1274
1275 return 0;
1276}
1277
1278subsys_initcall(pktsched_init);
1279
Stephen Hemmingerd5d75cd2005-05-03 16:24:57 -07001280EXPORT_SYMBOL(qdisc_lookup);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001281EXPORT_SYMBOL(qdisc_get_rtab);
1282EXPORT_SYMBOL(qdisc_put_rtab);
1283EXPORT_SYMBOL(register_qdisc);
1284EXPORT_SYMBOL(unregister_qdisc);
1285EXPORT_SYMBOL(tc_classify);