/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/bitops.h>

#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/system.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                         struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box, which is
   able to enqueue packets and to dequeue them (when the device is
   ready to send something) in the order and at the times determined
   by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate the information
   supplied by the user in the form of handles into a form more
   intelligible to the kernel, to perform some sanity checks and the
   part of the work that is common to all qdiscs, and to provide
   rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP     - this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN       - this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED  - dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   (A toy qdisc illustrating this contract follows this comment.)

   Auxiliary routines:

   ---requeue

   requeues a packet that was dequeued once. It is used for non-standard
   or just buggy devices, which can defer output even when dev->tbusy == 0.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime
   of the qdisc.

   ---change

   changes qdisc parameters.
 */
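
/* Illustrative only: a minimal sketch of the enqueue/dequeue contract
 * described above, shaped like this kernel's Qdisc_ops. The name
 * "toyfifo" and the fixed limit are made up for the example; real
 * disciplines live in the sch_*.c files. Compiled out deliberately.
 */
#if 0
static int toyfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        if (sch->q.qlen < 128) {                /* arbitrary example limit */
                __skb_queue_tail(&sch->q, skb);
                sch->qstats.backlog += skb->len;
                return 0;                       /* enqueued successfully */
        }
        kfree_skb(skb);
        sch->qstats.drops++;
        return NET_XMIT_DROP;                   /* this packet was dropped */
}

static struct sk_buff *toyfifo_dequeue(struct Qdisc *sch)
{
        struct sk_buff *skb = __skb_dequeue(&sch->q);

        if (skb)
                sch->qstats.backlog -= skb->len;
        return skb;     /* NULL need not mean empty; check q->q.qlen */
}

static struct Qdisc_ops toyfifo_qdisc_ops = {
        .id             = "toyfifo",
        .enqueue        = toyfifo_enqueue,
        .dequeue        = toyfifo_dequeue,
        .owner          = THIS_MODULE,
};
#endif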

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing disciplines */

int register_qdisc(struct Qdisc_ops *qops)
{
        struct Qdisc_ops *q, **qp;
        int rc = -EEXIST;

        write_lock(&qdisc_mod_lock);
        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
                if (!strcmp(qops->id, q->id))
                        goto out;

        if (qops->enqueue == NULL)
                qops->enqueue = noop_qdisc_ops.enqueue;
        if (qops->requeue == NULL)
                qops->requeue = noop_qdisc_ops.requeue;
        if (qops->dequeue == NULL)
                qops->dequeue = noop_qdisc_ops.dequeue;

        qops->next = NULL;
        *qp = qops;
        rc = 0;
out:
        write_unlock(&qdisc_mod_lock);
        return rc;
}

int unregister_qdisc(struct Qdisc_ops *qops)
{
        struct Qdisc_ops *q, **qp;
        int err = -ENOENT;

        write_lock(&qdisc_mod_lock);
        for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
                if (q == qops)
                        break;
        if (q) {
                *qp = q->next;
                q->next = NULL;
                err = 0;
        }
        write_unlock(&qdisc_mod_lock);
        return err;
}
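
/* Illustrative only: how a scheduler module would typically call the
 * two functions above from its init/exit hooks, in the pattern used by
 * the sch_*.c modules. "toyfifo_qdisc_ops" is the hypothetical ops
 * table sketched earlier. Compiled out deliberately.
 */
#if 0
static int __init toyfifo_module_init(void)
{
        return register_qdisc(&toyfifo_qdisc_ops);
}

static void __exit toyfifo_module_exit(void)
{
        unregister_qdisc(&toyfifo_qdisc_ops);
}

module_init(toyfifo_module_init);
module_exit(toyfifo_module_exit);
MODULE_LICENSE("GPL");
#endif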

/* We know the handle. Find the qdisc among all the qdiscs attached to
   the device (the root qdisc, all its children, children of children etc.)
 */

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
        struct Qdisc *q;

        read_lock_bh(&qdisc_tree_lock);
        list_for_each_entry(q, &dev->qdisc_list, list) {
                if (q->handle == handle) {
                        read_unlock_bh(&qdisc_tree_lock);
                        return q;
                }
        }
        read_unlock_bh(&qdisc_tree_lock);
        return NULL;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
        unsigned long cl;
        struct Qdisc *leaf;
        struct Qdisc_class_ops *cops = p->ops->cl_ops;

        if (cops == NULL)
                return NULL;
        cl = cops->get(p, classid);

        if (cl == 0)
                return NULL;
        leaf = cops->leaf(p, cl);
        cops->put(p, cl);
        return leaf;
}

/* Find a queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
{
        struct Qdisc_ops *q = NULL;

        if (kind) {
                read_lock(&qdisc_mod_lock);
                for (q = qdisc_base; q; q = q->next) {
                        if (rtattr_strcmp(kind, q->id) == 0) {
                                if (!try_module_get(q->owner))
                                        q = NULL;
                                break;
                        }
                }
                read_unlock(&qdisc_mod_lock);
        }
        return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
{
        struct qdisc_rate_table *rtab;

        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
                if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
                        rtab->refcnt++;
                        return rtab;
                }
        }

        if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
                return NULL;

        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
        if (rtab) {
                rtab->rate = *r;
                rtab->refcnt = 1;
                memcpy(rtab->data, RTA_DATA(tab), 1024);
                rtab->next = qdisc_rtab_list;
                qdisc_rtab_list = rtab;
        }
        return rtab;
}

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
        struct qdisc_rate_table *rtab, **rtabp;

        if (!tab || --tab->refcnt)
                return;

        for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
                if (rtab == tab) {
                        *rtabp = rtab->next;
                        kfree(rtab);
                        return;
                }
        }
}
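
/* Illustrative only: the usual pairing of the two rate-table helpers
 * above, as a rate-limiting qdisc's change() routine might use them.
 * "toy_change_rate" and its argument plumbing are made up for the
 * example; see sch_tbf.c for a real user. Compiled out deliberately.
 */
#if 0
static int toy_change_rate(struct tc_ratespec *rate, struct rtattr *rtab_rta,
                           struct qdisc_rate_table **cur)
{
        struct qdisc_rate_table *rtab;

        rtab = qdisc_get_rtab(rate, rtab_rta);  /* takes a reference */
        if (rtab == NULL)
                return -EINVAL;
        qdisc_put_rtab(*cur);                   /* drop the old table, if any */
        *cur = rtab;
        return 0;
}
#endif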

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
        int i = 0x10000;
        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

        do {
                autohandle += TC_H_MAKE(0x10000U, 0);
                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
                        autohandle = TC_H_MAKE(0x80000000U, 0);
        } while (qdisc_lookup(dev, autohandle) && --i > 0);

        return i > 0 ? autohandle : 0;
}
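
/* Illustrative only: the 32-bit handle layout that qdisc_alloc_handle()
 * walks through. The major number (upper 16 bits) names a qdisc; the
 * minor number (lower 16 bits) names a class within it. A sketch of the
 * macros' arithmetic, assuming the definitions in <linux/pkt_sched.h>;
 * the concrete numbers are made up. Compiled out deliberately.
 */
#if 0
static void handle_layout_demo(void)
{
        u32 h = TC_H_MAKE(0x80010000U, 0x2);    /* "8001:2" in tc syntax */

        BUG_ON(TC_H_MAJ(h) != 0x80010000U);     /* qdisc part, upper 16 bits */
        BUG_ON(TC_H_MIN(h) != 0x00000002U);     /* class part, lower 16 bits */
}
#endif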

/* Attach toplevel qdisc to device dev */

static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
        struct Qdisc *oqdisc;

        if (dev->flags & IFF_UP)
                dev_deactivate(dev);

        qdisc_lock_tree(dev);
        if (qdisc && qdisc->flags & TCQ_F_INGRESS) {
                oqdisc = dev->qdisc_ingress;
                /* Prune old scheduler */
                if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
                        /* delete */
                        qdisc_reset(oqdisc);
                        dev->qdisc_ingress = NULL;
                } else {  /* new */
                        dev->qdisc_ingress = qdisc;
                }

        } else {

                oqdisc = dev->qdisc_sleeping;

                /* Prune old scheduler */
                if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
                        qdisc_reset(oqdisc);

                /* ... and graft new one */
                if (qdisc == NULL)
                        qdisc = &noop_qdisc;
                dev->qdisc_sleeping = qdisc;
                dev->qdisc = &noop_qdisc;
        }

        qdisc_unlock_tree(dev);

        if (dev->flags & IFF_UP)
                dev_activate(dev);

        return oqdisc;
}


/* Graft qdisc "new" to class "classid" of qdisc "parent" or
   to device "dev".

   The old qdisc is not destroyed but returned in *old.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                       u32 classid,
                       struct Qdisc *new, struct Qdisc **old)
{
        int err = 0;
        struct Qdisc *q = *old;


        if (parent == NULL) {
                if (q && q->flags & TCQ_F_INGRESS) {
                        *old = dev_graft_qdisc(dev, q);
                } else {
                        *old = dev_graft_qdisc(dev, new);
                }
        } else {
                struct Qdisc_class_ops *cops = parent->ops->cl_ops;

                err = -EINVAL;

                if (cops) {
                        unsigned long cl = cops->get(parent, classid);
                        if (cl) {
                                err = cops->graft(parent, cl, new, old);
                                if (new)
                                        new->parent = classid;
                                cops->put(parent, cl);
                        }
                }
        }
        return err;
}

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
{
        int err;
        struct rtattr *kind = tca[TCA_KIND-1];
        struct Qdisc *sch;
        struct Qdisc_ops *ops;

        ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
        if (ops == NULL && kind != NULL) {
                char name[IFNAMSIZ];
                if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
                        /* We dropped the RTNL semaphore in order to
                         * perform the module load.  So, even if we
                         * succeeded in loading the module we have to
                         * tell the caller to replay the request.  We
                         * indicate this using -EAGAIN.
                         * We replay the request because the device may
                         * go away in the mean time.
                         */
                        rtnl_unlock();
                        request_module("sch_%s", name);
                        rtnl_lock();
                        ops = qdisc_lookup_ops(kind);
                        if (ops != NULL) {
                                /* The replay will call qdisc_lookup_ops
                                 * again, so don't keep a reference.
                                 */
                                module_put(ops->owner);
                                err = -EAGAIN;
                                goto err_out;
                        }
                }
        }
#endif

        err = -EINVAL;
        if (ops == NULL)
                goto err_out;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch)) {
                err = PTR_ERR(sch);
                goto err_out2;
        }

        if (handle == TC_H_INGRESS) {
                sch->flags |= TCQ_F_INGRESS;
                handle = TC_H_MAKE(TC_H_INGRESS, 0);
        } else if (handle == 0) {
                handle = qdisc_alloc_handle(dev);
                err = -ENOMEM;
                if (handle == 0)
                        goto err_out3;
        }

        sch->handle = handle;

        if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
                qdisc_lock_tree(dev);
                list_add_tail(&sch->list, &dev->qdisc_list);
                qdisc_unlock_tree(dev);

#ifdef CONFIG_NET_ESTIMATOR
                if (tca[TCA_RATE-1])
                        gen_new_estimator(&sch->bstats, &sch->rate_est,
                                          sch->stats_lock, tca[TCA_RATE-1]);
#endif
                return sch;
        }
err_out3:
        dev_put(dev);
        kfree((char *) sch - sch->padded);
err_out2:
        module_put(ops->owner);
err_out:
        *errp = err;
        return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
{
        if (tca[TCA_OPTIONS-1]) {
                int err;

                if (sch->ops->change == NULL)
                        return -EINVAL;
                err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
                if (err)
                        return err;
        }
#ifdef CONFIG_NET_ESTIMATOR
        if (tca[TCA_RATE-1])
                gen_replace_estimator(&sch->bstats, &sch->rate_est,
                                      sch->stats_lock, tca[TCA_RATE-1]);
#endif
        return 0;
}

struct check_loop_arg
{
        struct qdisc_walker w;
        struct Qdisc *p;
        int depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
        struct check_loop_arg arg;

        if (q->ops->cl_ops == NULL)
                return 0;

        arg.w.stop = arg.w.skip = arg.w.count = 0;
        arg.w.fn = check_loop_fn;
        arg.depth = depth;
        arg.p = p;
        q->ops->cl_ops->walk(q, &arg.w);
        return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
        struct Qdisc *leaf;
        struct Qdisc_class_ops *cops = q->ops->cl_ops;
        struct check_loop_arg *arg = (struct check_loop_arg *)w;

        leaf = cops->leaf(q, cl);
        if (leaf) {
                if (leaf == arg->p || arg->depth > 7)
                        return -ELOOP;
                return check_loop(leaf, arg->p, arg->depth + 1);
        }
        return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct tcmsg *tcm = NLMSG_DATA(n);
        struct rtattr **tca = arg;
        struct net_device *dev;
        u32 clid = tcm->tcm_parent;
        struct Qdisc *q = NULL;
        struct Qdisc *p = NULL;
        int err;

        if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                return -ENODEV;

        if (clid) {
                if (clid != TC_H_ROOT) {
                        if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else { /* ingress */
                                q = dev->qdisc_ingress;
                        }
                } else {
                        q = dev->qdisc_sleeping;
                }
                if (!q)
                        return -ENOENT;

                if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
                        return -EINVAL;
        } else {
                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
                        return -ENOENT;
        }

        if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                return -EINVAL;

        if (n->nlmsg_type == RTM_DELQDISC) {
                if (!clid)
                        return -EINVAL;
                if (q->handle == 0)
                        return -ENOENT;
                if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
                        return err;
                if (q) {
                        qdisc_notify(skb, n, clid, q, NULL);
                        spin_lock_bh(&dev->queue_lock);
                        qdisc_destroy(q);
                        spin_unlock_bh(&dev->queue_lock);
                }
        } else {
                qdisc_notify(skb, n, clid, NULL, q);
        }
        return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct tcmsg *tcm;
        struct rtattr **tca;
        struct net_device *dev;
        u32 clid;
        struct Qdisc *q, *p;
        int err;

replay:
        /* Reinit, just in case something touches this. */
        tcm = NLMSG_DATA(n);
        tca = arg;
        clid = tcm->tcm_parent;
        q = p = NULL;

        if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                return -ENODEV;

        if (clid) {
                if (clid != TC_H_ROOT) {
                        if (clid != TC_H_INGRESS) {
                                if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
                                        return -ENOENT;
                                q = qdisc_leaf(p, clid);
                        } else { /* ingress */
                                q = dev->qdisc_ingress;
                        }
                } else {
                        q = dev->qdisc_sleeping;
                }

                /* It may be the default qdisc; ignore it. */
                if (q && q->handle == 0)
                        q = NULL;

                if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
                        if (tcm->tcm_handle) {
                                if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
                                        return -EEXIST;
                                if (TC_H_MIN(tcm->tcm_handle))
                                        return -EINVAL;
                                if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
                                        goto create_n_graft;
                                if (n->nlmsg_flags & NLM_F_EXCL)
                                        return -EEXIST;
                                if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                                        return -EINVAL;
                                if (q == p ||
                                    (p && check_loop(q, p, 0)))
                                        return -ELOOP;
                                atomic_inc(&q->refcnt);
                                goto graft;
                        } else {
                                if (q == NULL)
                                        goto create_n_graft;

                                /* This magic test requires explanation.
                                 *
                                 * We know that some child q is already
                                 * attached to this parent and we have a
                                 * choice: either to change it or to
                                 * create/graft a new one.
                                 *
                                 * 1. We are allowed to create/graft only
                                 * if both the CREATE and REPLACE flags
                                 * are set.
                                 *
                                 * 2. If EXCL is set, the requestor wanted
                                 * to say that the qdisc tcm_handle is not
                                 * expected to exist, so we choose
                                 * create/graft too.
                                 *
                                 * 3. The last case is when no flags are
                                 * set. Alas, it is a sort of hole in the
                                 * API; we cannot decide what to do
                                 * unambiguously. For now we select
                                 * create/graft if the user gave a KIND
                                 * which does not match the existing one.
                                 */
                                if ((n->nlmsg_flags & NLM_F_CREATE) &&
                                    (n->nlmsg_flags & NLM_F_REPLACE) &&
                                    ((n->nlmsg_flags & NLM_F_EXCL) ||
                                     (tca[TCA_KIND-1] &&
                                      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
                                        goto create_n_graft;
                        }
                }
        } else {
                if (!tcm->tcm_handle)
                        return -EINVAL;
                q = qdisc_lookup(dev, tcm->tcm_handle);
        }

        /* Change qdisc parameters */
        if (q == NULL)
                return -ENOENT;
        if (n->nlmsg_flags & NLM_F_EXCL)
                return -EEXIST;
        if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
                return -EINVAL;
        err = qdisc_change(q, tca);
        if (err == 0)
                qdisc_notify(skb, n, clid, NULL, q);
        return err;

create_n_graft:
        if (!(n->nlmsg_flags & NLM_F_CREATE))
                return -ENOENT;
        if (clid == TC_H_INGRESS)
                q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
        else
                q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
        if (q == NULL) {
                if (err == -EAGAIN)
                        goto replay;
                return err;
        }

graft:
        if (1) {
                struct Qdisc *old_q = NULL;
                err = qdisc_graft(dev, p, clid, q, &old_q);
                if (err) {
                        if (q) {
                                spin_lock_bh(&dev->queue_lock);
                                qdisc_destroy(q);
                                spin_unlock_bh(&dev->queue_lock);
                        }
                        return err;
                }
                qdisc_notify(skb, n, clid, old_q, q);
                if (old_q) {
                        spin_lock_bh(&dev->queue_lock);
                        qdisc_destroy(old_q);
                        spin_unlock_bh(&dev->queue_lock);
                }
        }
        return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
                         u32 pid, u32 seq, u16 flags, int event)
{
        struct tcmsg *tcm;
        struct nlmsghdr *nlh;
        unsigned char *b = skb->tail;
        struct gnet_dump d;

        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
        tcm = NLMSG_DATA(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm__pad1 = 0;
        tcm->tcm__pad2 = 0;
        tcm->tcm_ifindex = q->dev->ifindex;
        tcm->tcm_parent = clid;
        tcm->tcm_handle = q->handle;
        tcm->tcm_info = atomic_read(&q->refcnt);
        RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
        if (q->ops->dump && q->ops->dump(q, skb) < 0)
                goto rtattr_failure;
        q->qstats.qlen = q->q.qlen;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
                        TCA_XSTATS, q->stats_lock, &d) < 0)
                goto rtattr_failure;

        if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
                goto rtattr_failure;

        if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
#ifdef CONFIG_NET_ESTIMATOR
            gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
#endif
            gnet_stats_copy_queue(&d, &q->qstats) < 0)
                goto rtattr_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto rtattr_failure;

        nlh->nlmsg_len = skb->tail - b;
        return skb->len;

nlmsg_failure:
rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                        u32 clid, struct Qdisc *old, struct Qdisc *new)
{
        struct sk_buff *skb;
        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;

        if (old && old->handle) {
                if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
                        goto err_out;
        }
        if (new) {
                if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
                        goto err_out;
        }

        if (skb->len)
                return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags & NLM_F_ECHO);

err_out:
        kfree_skb(skb);
        return -EINVAL;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
        int idx, q_idx;
        int s_idx, s_q_idx;
        struct net_device *dev;
        struct Qdisc *q;

        s_idx = cb->args[0];
        s_q_idx = q_idx = cb->args[1];
        read_lock(&dev_base_lock);
        for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
                if (idx < s_idx)
                        continue;
                if (idx > s_idx)
                        s_q_idx = 0;
                read_lock_bh(&qdisc_tree_lock);
                q_idx = 0;
                list_for_each_entry(q, &dev->qdisc_list, list) {
                        if (q_idx < s_q_idx) {
                                q_idx++;
                                continue;
                        }
                        if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
                                          cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
                                read_unlock_bh(&qdisc_tree_lock);
                                goto done;
                        }
                        q_idx++;
                }
                read_unlock_bh(&qdisc_tree_lock);
        }

done:
        read_unlock(&dev_base_lock);

        cb->args[0] = idx;
        cb->args[1] = q_idx;

        return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
        struct tcmsg *tcm = NLMSG_DATA(n);
        struct rtattr **tca = arg;
        struct net_device *dev;
        struct Qdisc *q = NULL;
        struct Qdisc_class_ops *cops;
        unsigned long cl = 0;
        unsigned long new_cl;
        u32 pid = tcm->tcm_parent;
        u32 clid = tcm->tcm_handle;
        u32 qid = TC_H_MAJ(clid);
        int err;

        if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                return -ENODEV;

        /*
           parent == TC_H_UNSPEC - unspecified parent.
           parent == TC_H_ROOT   - class is root, which has no parent.
           parent == X:0         - parent is the root class.
           parent == X:Y         - parent is a node in the hierarchy.
           parent == 0:Y         - parent is X:Y, where X:0 is the qdisc.

           handle == 0:0         - generate a handle from the kernel pool.
           handle == 0:Y         - class is X:Y, where X:0 is the qdisc.
           handle == X:Y         - clear.
           handle == X:0         - root class.

           (A worked example of Step 1 follows this function.)
         */

        /* Step 1. Determine qdisc handle X:0 */

        if (pid != TC_H_ROOT) {
                u32 qid1 = TC_H_MAJ(pid);

                if (qid && qid1) {
                        /* If both majors are known, they must be identical. */
                        if (qid != qid1)
                                return -EINVAL;
                } else if (qid1) {
                        qid = qid1;
                } else if (qid == 0)
                        qid = dev->qdisc_sleeping->handle;

                /* Now qid is a genuine qdisc handle, consistent with
                   both parent and child.

                   TC_H_MAJ(pid) may still be unspecified, so complete it now.
                 */
                if (pid)
                        pid = TC_H_MAKE(qid, pid);
        } else {
                if (qid == 0)
                        qid = dev->qdisc_sleeping->handle;
        }

        /* OK. Locate the qdisc */
        if ((q = qdisc_lookup(dev, qid)) == NULL)
                return -ENOENT;

        /* And check that it supports classes */
        cops = q->ops->cl_ops;
        if (cops == NULL)
                return -EINVAL;

        /* Now try to get the class */
        if (clid == 0) {
                if (pid == TC_H_ROOT)
                        clid = qid;
        } else
                clid = TC_H_MAKE(qid, clid);

        if (clid)
                cl = cops->get(q, clid);

        if (cl == 0) {
                err = -ENOENT;
                if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags & NLM_F_CREATE))
                        goto out;
        } else {
                switch (n->nlmsg_type) {
                case RTM_NEWTCLASS:
                        err = -EEXIST;
                        if (n->nlmsg_flags & NLM_F_EXCL)
                                goto out;
                        break;
                case RTM_DELTCLASS:
                        err = cops->delete(q, cl);
                        if (err == 0)
                                tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
                        goto out;
                case RTM_GETTCLASS:
                        err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
                        goto out;
                default:
                        err = -EINVAL;
                        goto out;
                }
        }

        new_cl = cl;
        err = cops->change(q, clid, pid, tca, &new_cl);
        if (err == 0)
                tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
        if (cl)
                cops->put(q, cl);

        return err;
}
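
/* Illustrative only: a worked instance of the "Step 1" handle
 * completion above. Suppose the root qdisc is 1:0 and userspace sends
 * tcm_parent == 0 (TC_H_UNSPEC) with tcm_handle == 0:10, i.e. "create
 * class :10 under the default parent". The numbers are made up for the
 * example. Compiled out deliberately.
 */
#if 0
static void tclass_handle_demo(void)
{
        u32 pid = 0;                    /* tcm_parent: unspecified */
        u32 clid = 0x0000000a;          /* tcm_handle: 0:10 */
        u32 qid = TC_H_MAJ(clid);       /* 0 - no major given */

        /* pid != TC_H_ROOT and both majors unknown: fall back to the
         * root qdisc's handle, assumed here to be 1:0.
         */
        if (qid == 0)
                qid = 0x00010000;       /* dev->qdisc_sleeping->handle */

        clid = TC_H_MAKE(qid, clid);    /* the class becomes 1:10 */
        BUG_ON(clid != 0x0001000a);
}
#endif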


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
                          unsigned long cl,
                          u32 pid, u32 seq, u16 flags, int event)
{
        struct tcmsg *tcm;
        struct nlmsghdr *nlh;
        unsigned char *b = skb->tail;
        struct gnet_dump d;
        struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
        tcm = NLMSG_DATA(nlh);
        tcm->tcm_family = AF_UNSPEC;
        tcm->tcm_ifindex = q->dev->ifindex;
        tcm->tcm_parent = q->handle;
        tcm->tcm_handle = q->handle;
        tcm->tcm_info = 0;
        RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
        if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
                goto rtattr_failure;

        if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
                        TCA_XSTATS, q->stats_lock, &d) < 0)
                goto rtattr_failure;

        if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
                goto rtattr_failure;

        if (gnet_stats_finish_copy(&d) < 0)
                goto rtattr_failure;

        nlh->nlmsg_len = skb->tail - b;
        return skb->len;

nlmsg_failure:
rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
                         struct Qdisc *q, unsigned long cl, int event)
{
        struct sk_buff *skb;
        u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;

        if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
                kfree_skb(skb);
                return -EINVAL;
        }

        return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args
{
        struct qdisc_walker w;
        struct sk_buff *skb;
        struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
        struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

        return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
                              a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
        int t;
        int s_t;
        struct net_device *dev;
        struct Qdisc *q;
        struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
        struct qdisc_dump_args arg;

        if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
                return 0;
        if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
                return 0;

        s_t = cb->args[0];
        t = 0;

        read_lock_bh(&qdisc_tree_lock);
        list_for_each_entry(q, &dev->qdisc_list, list) {
                if (t < s_t || !q->ops->cl_ops ||
                    (tcm->tcm_parent &&
                     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
                        t++;
                        continue;
                }
                if (t > s_t)
                        memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
                arg.w.fn = qdisc_class_dump;
                arg.skb = skb;
                arg.cb = cb;
                arg.w.stop = 0;
                arg.w.skip = cb->args[1];
                arg.w.count = 0;
                q->ops->cl_ops->walk(q, &arg.w);
                cb->args[1] = arg.w.count;
                if (arg.w.stop)
                        break;
                t++;
        }
        read_unlock_bh(&qdisc_tree_lock);

        cb->args[0] = t;

        dev_put(dev);
        return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for the protocol, and asks
   the specific classifiers.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
                struct tcf_result *res)
{
        int err = 0;
        u32 protocol = skb->protocol;
#ifdef CONFIG_NET_CLS_ACT
        struct tcf_proto *otp = tp;
reclassify:
#endif
        protocol = skb->protocol;

        for ( ; tp; tp = tp->next) {
                if ((tp->protocol == protocol ||
                     tp->protocol == __constant_htons(ETH_P_ALL)) &&
                    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
                        if (TC_ACT_RECLASSIFY == err) {
                                __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
                                tp = otp;

                                if (MAX_REC_LOOP < verd++) {
                                        printk("rule prio %d protocol %02x reclassify is buggy, packet dropped\n",
                                               tp->prio & 0xffff, ntohs(tp->protocol));
                                        return TC_ACT_SHOT;
                                }
                                skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
                                goto reclassify;
                        } else {
                                if (skb->tc_verd)
                                        skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
                                return err;
                        }
#else

                        return err;
#endif
                }

        }
        return -1;
}
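
/* Illustrative only: how a classful qdisc's enqueue path typically
 * consumes tc_classify(), in the spirit of sch_prio. The filter_list
 * argument and the band lookup are made up for the example, and the
 * sketch ignores TC_ACT_* results for brevity. Compiled out
 * deliberately.
 */
#if 0
static unsigned int toy_classify_band(struct sk_buff *skb,
                                      struct tcf_proto *filter_list)
{
        struct tcf_result res;

        if (tc_classify(skb, filter_list, &res) >= 0)
                return TC_H_MIN(res.classid);   /* a filter chose a class */
        return 0;                               /* no match: default band */
}
#endif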

static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
        seq_printf(seq, "%08x %08x %08x %08x\n",
                   psched_tick_per_us, psched_us_per_tick,
                   1000000, HZ);

        return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
        return single_open(file, psched_show, PDE(inode)->data);
}

static struct file_operations psched_fops = {
        .owner = THIS_MODULE,
        .open = psched_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
};
#endif

#ifdef CONFIG_NET_SCH_CLK_CPU
psched_tdiff_t psched_clock_per_hz;
int psched_clock_scale;
EXPORT_SYMBOL(psched_clock_per_hz);
EXPORT_SYMBOL(psched_clock_scale);

psched_time_t psched_time_base;
cycles_t psched_time_mark;
EXPORT_SYMBOL(psched_time_mark);
EXPORT_SYMBOL(psched_time_base);

/*
 * Periodically adjust psched_time_base to avoid overflow
 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
 */
static void psched_tick(unsigned long);
static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);

static void psched_tick(unsigned long dummy)
{
        if (sizeof(cycles_t) == sizeof(u32)) {
                psched_time_t dummy_stamp;
                PSCHED_GET_TIME(dummy_stamp);
                psched_timer.expires = jiffies + 1*HZ;
                add_timer(&psched_timer);
        }
}

int __init psched_calibrate_clock(void)
{
        psched_time_t stamp, stamp1;
        struct timeval tv, tv1;
        psched_tdiff_t delay;
        long rdelay;
        unsigned long stop;

        psched_tick(0);
        stop = jiffies + HZ/10;
        PSCHED_GET_TIME(stamp);
        do_gettimeofday(&tv);
        while (time_before(jiffies, stop)) {
                barrier();
                cpu_relax();
        }
        PSCHED_GET_TIME(stamp1);
        do_gettimeofday(&tv1);

        delay = PSCHED_TDIFF(stamp1, stamp);
        rdelay = tv1.tv_usec - tv.tv_usec;
        rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
        if (rdelay > delay)
                return -1;
        delay /= rdelay;
        psched_tick_per_us = delay;
        while ((delay >>= 1) != 0)
                psched_clock_scale++;
        psched_us_per_tick = 1 << psched_clock_scale;
        psched_clock_per_hz = (psched_tick_per_us * (1000000/HZ)) >> psched_clock_scale;
        return 0;
}
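
/* A worked example of the calibration arithmetic above, with made-up
 * numbers: on a hypothetical 2.5GHz CPU, the ~100ms busy-wait observes
 * delay = 250,000,000 cycles while rdelay = 100,000 us, so
 * delay /= rdelay yields 2500 cycles per us (psched_tick_per_us).
 * Halving 2500 down to zero takes 11 shifts, so psched_clock_scale
 * becomes 11 and psched_us_per_tick = 1 << 11 = 2048; the power-of-two
 * approximation lets later time conversions use shifts instead of
 * divisions.
 */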
#endif

static int __init pktsched_init(void)
{
        struct rtnetlink_link *link_p;

#ifdef CONFIG_NET_SCH_CLK_CPU
        if (psched_calibrate_clock() < 0)
                return -1;
#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
        psched_tick_per_us = HZ<<PSCHED_JSCALE;
        psched_us_per_tick = 1000000;
#endif

        link_p = rtnetlink_links[PF_UNSPEC];

        /* Set up the rtnetlink links. It is done here to avoid
           exporting a large number of public symbols.
         */

        if (link_p) {
                link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
                link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
                link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
                link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
                link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
                link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
                link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
                link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
        }

        register_qdisc(&pfifo_qdisc_ops);
        register_qdisc(&bfifo_qdisc_ops);
        proc_net_fops_create("psched", 0, &psched_fops);

        return 0;
}

subsys_initcall(pktsched_init);

EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);