/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
17
18#include <linux/config.h>
19#include <linux/module.h>
20#include <linux/types.h>
21#include <linux/kernel.h>
22#include <linux/sched.h>
23#include <linux/string.h>
24#include <linux/mm.h>
25#include <linux/socket.h>
26#include <linux/sockios.h>
27#include <linux/in.h>
28#include <linux/errno.h>
29#include <linux/interrupt.h>
30#include <linux/netdevice.h>
31#include <linux/skbuff.h>
32#include <linux/rtnetlink.h>
33#include <linux/init.h>
34#include <linux/proc_fs.h>
35#include <linux/seq_file.h>
36#include <linux/kmod.h>
37#include <linux/list.h>
38#include <linux/bitops.h>
39
40#include <net/sock.h>
41#include <net/pkt_sched.h>
42
43#include <asm/processor.h>
44#include <asm/uaccess.h>
45#include <asm/system.h>
46
47static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
48 struct Qdisc *old, struct Qdisc *new);
49static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
50 struct Qdisc *q, unsigned long cl, int event);
51
/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in the order and at the
   times determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes"
     using "packet classifiers" (look at cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some sanity
   checks and do the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean that the queue is empty; it just means that
   the discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that was already dequeued. It is used for
   non-standard or just buggy devices, which can defer output even
   if dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears
   all timers and counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime
   of the qdisc.

   ---change

   changes qdisc parameters.
 */
132
/*
 * Protects the list of registered qdisc operations (qdisc_base).
 * It is a pure SMP lock.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* Head of the singly linked list of all installed queueing disciplines. */
static struct Qdisc_ops *qdisc_base;
145
146/* Register/uregister queueing discipline */
147
148int register_qdisc(struct Qdisc_ops *qops)
149{
150 struct Qdisc_ops *q, **qp;
151 int rc = -EEXIST;
152
153 write_lock(&qdisc_mod_lock);
154 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
155 if (!strcmp(qops->id, q->id))
156 goto out;
157
158 if (qops->enqueue == NULL)
159 qops->enqueue = noop_qdisc_ops.enqueue;
160 if (qops->requeue == NULL)
161 qops->requeue = noop_qdisc_ops.requeue;
162 if (qops->dequeue == NULL)
163 qops->dequeue = noop_qdisc_ops.dequeue;
164
165 qops->next = NULL;
166 *qp = qops;
167 rc = 0;
168out:
169 write_unlock(&qdisc_mod_lock);
170 return rc;
171}
172
173int unregister_qdisc(struct Qdisc_ops *qops)
174{
175 struct Qdisc_ops *q, **qp;
176 int err = -ENOENT;
177
178 write_lock(&qdisc_mod_lock);
179 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
180 if (q == qops)
181 break;
182 if (q) {
183 *qp = q->next;
184 q->next = NULL;
185 err = 0;
186 }
187 write_unlock(&qdisc_mod_lock);
188 return err;
189}
190
191/* We know handle. Find qdisc among all qdisc's attached to device
192 (root qdisc, all its children, children of children etc.)
193 */
194
195struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
196{
197 struct Qdisc *q;
198
199 read_lock_bh(&qdisc_tree_lock);
200 list_for_each_entry(q, &dev->qdisc_list, list) {
201 if (q->handle == handle) {
202 read_unlock_bh(&qdisc_tree_lock);
203 return q;
204 }
205 }
206 read_unlock_bh(&qdisc_tree_lock);
207 return NULL;
208}
209
210static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
211{
212 unsigned long cl;
213 struct Qdisc *leaf;
214 struct Qdisc_class_ops *cops = p->ops->cl_ops;
215
216 if (cops == NULL)
217 return NULL;
218 cl = cops->get(p, classid);
219
220 if (cl == 0)
221 return NULL;
222 leaf = cops->leaf(p, cl);
223 cops->put(p, cl);
224 return leaf;
225}
226
227/* Find queueing discipline by name */
228
229static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
230{
231 struct Qdisc_ops *q = NULL;
232
233 if (kind) {
234 read_lock(&qdisc_mod_lock);
235 for (q = qdisc_base; q; q = q->next) {
236 if (rtattr_strcmp(kind, q->id) == 0) {
237 if (!try_module_get(q->owner))
238 q = NULL;
239 break;
240 }
241 }
242 read_unlock(&qdisc_mod_lock);
243 }
244 return q;
245}
246
247static struct qdisc_rate_table *qdisc_rtab_list;
248
249struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
250{
251 struct qdisc_rate_table *rtab;
252
253 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
254 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
255 rtab->refcnt++;
256 return rtab;
257 }
258 }
259
260 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
261 return NULL;
262
263 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
264 if (rtab) {
265 rtab->rate = *r;
266 rtab->refcnt = 1;
267 memcpy(rtab->data, RTA_DATA(tab), 1024);
268 rtab->next = qdisc_rtab_list;
269 qdisc_rtab_list = rtab;
270 }
271 return rtab;
272}
273
274void qdisc_put_rtab(struct qdisc_rate_table *tab)
275{
276 struct qdisc_rate_table *rtab, **rtabp;
277
278 if (!tab || --tab->refcnt)
279 return;
280
281 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
282 if (rtab == tab) {
283 *rtabp = rtab->next;
284 kfree(rtab);
285 return;
286 }
287 }
288}
289
290
291/* Allocate an unique handle from space managed by kernel */
292
293static u32 qdisc_alloc_handle(struct net_device *dev)
294{
295 int i = 0x10000;
296 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
297
298 do {
299 autohandle += TC_H_MAKE(0x10000U, 0);
300 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
301 autohandle = TC_H_MAKE(0x80000000U, 0);
302 } while (qdisc_lookup(dev, autohandle) && --i > 0);
303
304 return i>0 ? autohandle : 0;
305}
306
307/* Attach toplevel qdisc to device dev */
308
309static struct Qdisc *
310dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
311{
312 struct Qdisc *oqdisc;
313
314 if (dev->flags & IFF_UP)
315 dev_deactivate(dev);
316
317 qdisc_lock_tree(dev);
318 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
319 oqdisc = dev->qdisc_ingress;
320 /* Prune old scheduler */
321 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
322 /* delete */
323 qdisc_reset(oqdisc);
324 dev->qdisc_ingress = NULL;
325 } else { /* new */
326 dev->qdisc_ingress = qdisc;
327 }
328
329 } else {
330
331 oqdisc = dev->qdisc_sleeping;
332
333 /* Prune old scheduler */
334 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
335 qdisc_reset(oqdisc);
336
337 /* ... and graft new one */
338 if (qdisc == NULL)
339 qdisc = &noop_qdisc;
340 dev->qdisc_sleeping = qdisc;
341 dev->qdisc = &noop_qdisc;
342 }
343
344 qdisc_unlock_tree(dev);
345
346 if (dev->flags & IFF_UP)
347 dev_activate(dev);
348
349 return oqdisc;
350}
351
352
353/* Graft qdisc "new" to class "classid" of qdisc "parent" or
354 to device "dev".
355
356 Old qdisc is not destroyed but returned in *old.
357 */
358
359static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
360 u32 classid,
361 struct Qdisc *new, struct Qdisc **old)
362{
363 int err = 0;
364 struct Qdisc *q = *old;
365
366
367 if (parent == NULL) {
368 if (q && q->flags&TCQ_F_INGRESS) {
369 *old = dev_graft_qdisc(dev, q);
370 } else {
371 *old = dev_graft_qdisc(dev, new);
372 }
373 } else {
374 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
375
376 err = -EINVAL;
377
378 if (cops) {
379 unsigned long cl = cops->get(parent, classid);
380 if (cl) {
381 err = cops->graft(parent, cl, new, old);
382 if (new)
383 new->parent = classid;
384 cops->put(parent, cl);
385 }
386 }
387 }
388 return err;
389}
390
391/*
392 Allocate and initialize new qdisc.
393
394 Parameters are passed via opt.
395 */
396
397static struct Qdisc *
398qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399{
400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700402 struct Qdisc *sch;
403 struct Qdisc_ops *ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700404
405 ops = qdisc_lookup_ops(kind);
406#ifdef CONFIG_KMOD
407 if (ops == NULL && kind != NULL) {
408 char name[IFNAMSIZ];
409 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
410 /* We dropped the RTNL semaphore in order to
411 * perform the module load. So, even if we
412 * succeeded in loading the module we have to
413 * tell the caller to replay the request. We
414 * indicate this using -EAGAIN.
415 * We replay the request because the device may
416 * go away in the mean time.
417 */
418 rtnl_unlock();
419 request_module("sch_%s", name);
420 rtnl_lock();
421 ops = qdisc_lookup_ops(kind);
422 if (ops != NULL) {
423 /* We will try again qdisc_lookup_ops,
424 * so don't keep a reference.
425 */
426 module_put(ops->owner);
427 err = -EAGAIN;
428 goto err_out;
429 }
430 }
431 }
432#endif
433
434 err = -EINVAL;
435 if (ops == NULL)
436 goto err_out;
437
Thomas Graf3d54b822005-07-05 14:15:09 -0700438 sch = qdisc_alloc(dev, ops);
439 if (IS_ERR(sch)) {
440 err = PTR_ERR(sch);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700441 goto err_out2;
Thomas Graf3d54b822005-07-05 14:15:09 -0700442 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700443
Thomas Graf3d54b822005-07-05 14:15:09 -0700444 if (handle == TC_H_INGRESS) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700445 sch->flags |= TCQ_F_INGRESS;
Thomas Graf3d54b822005-07-05 14:15:09 -0700446 handle = TC_H_MAKE(TC_H_INGRESS, 0);
447 } else if (handle == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700448 handle = qdisc_alloc_handle(dev);
449 err = -ENOMEM;
450 if (handle == 0)
451 goto err_out3;
452 }
453
Thomas Graf3d54b822005-07-05 14:15:09 -0700454 sch->handle = handle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700455
456 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
Thomas Graf023e09a2005-07-05 14:15:53 -0700457#ifdef CONFIG_NET_ESTIMATOR
458 if (tca[TCA_RATE-1]) {
459 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
460 sch->stats_lock,
461 tca[TCA_RATE-1]);
462 if (err) {
463 /*
464 * Any broken qdiscs that would require
465 * a ops->reset() here? The qdisc was never
466 * in action so it shouldn't be necessary.
467 */
468 if (ops->destroy)
469 ops->destroy(sch);
470 goto err_out3;
471 }
472 }
473#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700474 qdisc_lock_tree(dev);
475 list_add_tail(&sch->list, &dev->qdisc_list);
476 qdisc_unlock_tree(dev);
477
Linus Torvalds1da177e2005-04-16 15:20:36 -0700478 return sch;
479 }
480err_out3:
481 dev_put(dev);
Thomas Graf3d54b822005-07-05 14:15:09 -0700482 kfree((char *) sch - sch->padded);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700483err_out2:
484 module_put(ops->owner);
485err_out:
486 *errp = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700487 return NULL;
488}
489
490static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
491{
492 if (tca[TCA_OPTIONS-1]) {
493 int err;
494
495 if (sch->ops->change == NULL)
496 return -EINVAL;
497 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
498 if (err)
499 return err;
500 }
501#ifdef CONFIG_NET_ESTIMATOR
502 if (tca[TCA_RATE-1])
503 gen_replace_estimator(&sch->bstats, &sch->rate_est,
504 sch->stats_lock, tca[TCA_RATE-1]);
505#endif
506 return 0;
507}
508
509struct check_loop_arg
510{
511 struct qdisc_walker w;
512 struct Qdisc *p;
513 int depth;
514};
515
516static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
517
518static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
519{
520 struct check_loop_arg arg;
521
522 if (q->ops->cl_ops == NULL)
523 return 0;
524
525 arg.w.stop = arg.w.skip = arg.w.count = 0;
526 arg.w.fn = check_loop_fn;
527 arg.depth = depth;
528 arg.p = p;
529 q->ops->cl_ops->walk(q, &arg.w);
530 return arg.w.stop ? -ELOOP : 0;
531}
532
533static int
534check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
535{
536 struct Qdisc *leaf;
537 struct Qdisc_class_ops *cops = q->ops->cl_ops;
538 struct check_loop_arg *arg = (struct check_loop_arg *)w;
539
540 leaf = cops->leaf(q, cl);
541 if (leaf) {
542 if (leaf == arg->p || arg->depth > 7)
543 return -ELOOP;
544 return check_loop(leaf, arg->p, arg->depth + 1);
545 }
546 return 0;
547}
548
549/*
550 * Delete/get qdisc.
551 */
552
553static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
554{
555 struct tcmsg *tcm = NLMSG_DATA(n);
556 struct rtattr **tca = arg;
557 struct net_device *dev;
558 u32 clid = tcm->tcm_parent;
559 struct Qdisc *q = NULL;
560 struct Qdisc *p = NULL;
561 int err;
562
563 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
564 return -ENODEV;
565
566 if (clid) {
567 if (clid != TC_H_ROOT) {
568 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
569 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
570 return -ENOENT;
571 q = qdisc_leaf(p, clid);
572 } else { /* ingress */
573 q = dev->qdisc_ingress;
574 }
575 } else {
576 q = dev->qdisc_sleeping;
577 }
578 if (!q)
579 return -ENOENT;
580
581 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
582 return -EINVAL;
583 } else {
584 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
585 return -ENOENT;
586 }
587
588 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
589 return -EINVAL;
590
591 if (n->nlmsg_type == RTM_DELQDISC) {
592 if (!clid)
593 return -EINVAL;
594 if (q->handle == 0)
595 return -ENOENT;
596 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
597 return err;
598 if (q) {
599 qdisc_notify(skb, n, clid, q, NULL);
600 spin_lock_bh(&dev->queue_lock);
601 qdisc_destroy(q);
602 spin_unlock_bh(&dev->queue_lock);
603 }
604 } else {
605 qdisc_notify(skb, n, clid, NULL, q);
606 }
607 return 0;
608}
609
610/*
611 Create/change qdisc.
612 */
613
614static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
615{
616 struct tcmsg *tcm;
617 struct rtattr **tca;
618 struct net_device *dev;
619 u32 clid;
620 struct Qdisc *q, *p;
621 int err;
622
623replay:
624 /* Reinit, just in case something touches this. */
625 tcm = NLMSG_DATA(n);
626 tca = arg;
627 clid = tcm->tcm_parent;
628 q = p = NULL;
629
630 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
631 return -ENODEV;
632
633 if (clid) {
634 if (clid != TC_H_ROOT) {
635 if (clid != TC_H_INGRESS) {
636 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
637 return -ENOENT;
638 q = qdisc_leaf(p, clid);
639 } else { /*ingress */
640 q = dev->qdisc_ingress;
641 }
642 } else {
643 q = dev->qdisc_sleeping;
644 }
645
646 /* It may be default qdisc, ignore it */
647 if (q && q->handle == 0)
648 q = NULL;
649
650 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
651 if (tcm->tcm_handle) {
652 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
653 return -EEXIST;
654 if (TC_H_MIN(tcm->tcm_handle))
655 return -EINVAL;
656 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
657 goto create_n_graft;
658 if (n->nlmsg_flags&NLM_F_EXCL)
659 return -EEXIST;
660 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
661 return -EINVAL;
662 if (q == p ||
663 (p && check_loop(q, p, 0)))
664 return -ELOOP;
665 atomic_inc(&q->refcnt);
666 goto graft;
667 } else {
668 if (q == NULL)
669 goto create_n_graft;
670
671 /* This magic test requires explanation.
672 *
673 * We know, that some child q is already
674 * attached to this parent and have choice:
675 * either to change it or to create/graft new one.
676 *
677 * 1. We are allowed to create/graft only
678 * if CREATE and REPLACE flags are set.
679 *
680 * 2. If EXCL is set, requestor wanted to say,
681 * that qdisc tcm_handle is not expected
682 * to exist, so that we choose create/graft too.
683 *
684 * 3. The last case is when no flags are set.
685 * Alas, it is sort of hole in API, we
686 * cannot decide what to do unambiguously.
687 * For now we select create/graft, if
688 * user gave KIND, which does not match existing.
689 */
690 if ((n->nlmsg_flags&NLM_F_CREATE) &&
691 (n->nlmsg_flags&NLM_F_REPLACE) &&
692 ((n->nlmsg_flags&NLM_F_EXCL) ||
693 (tca[TCA_KIND-1] &&
694 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
695 goto create_n_graft;
696 }
697 }
698 } else {
699 if (!tcm->tcm_handle)
700 return -EINVAL;
701 q = qdisc_lookup(dev, tcm->tcm_handle);
702 }
703
704 /* Change qdisc parameters */
705 if (q == NULL)
706 return -ENOENT;
707 if (n->nlmsg_flags&NLM_F_EXCL)
708 return -EEXIST;
709 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
710 return -EINVAL;
711 err = qdisc_change(q, tca);
712 if (err == 0)
713 qdisc_notify(skb, n, clid, NULL, q);
714 return err;
715
716create_n_graft:
717 if (!(n->nlmsg_flags&NLM_F_CREATE))
718 return -ENOENT;
719 if (clid == TC_H_INGRESS)
720 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
721 else
722 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
723 if (q == NULL) {
724 if (err == -EAGAIN)
725 goto replay;
726 return err;
727 }
728
729graft:
730 if (1) {
731 struct Qdisc *old_q = NULL;
732 err = qdisc_graft(dev, p, clid, q, &old_q);
733 if (err) {
734 if (q) {
735 spin_lock_bh(&dev->queue_lock);
736 qdisc_destroy(q);
737 spin_unlock_bh(&dev->queue_lock);
738 }
739 return err;
740 }
741 qdisc_notify(skb, n, clid, old_q, q);
742 if (old_q) {
743 spin_lock_bh(&dev->queue_lock);
744 qdisc_destroy(old_q);
745 spin_unlock_bh(&dev->queue_lock);
746 }
747 }
748 return 0;
749}
750
751static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700752 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700753{
754 struct tcmsg *tcm;
755 struct nlmsghdr *nlh;
756 unsigned char *b = skb->tail;
757 struct gnet_dump d;
758
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700759 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700760 tcm = NLMSG_DATA(nlh);
761 tcm->tcm_family = AF_UNSPEC;
Patrick McHardy9ef1d4c2005-06-28 12:55:30 -0700762 tcm->tcm__pad1 = 0;
763 tcm->tcm__pad2 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700764 tcm->tcm_ifindex = q->dev->ifindex;
765 tcm->tcm_parent = clid;
766 tcm->tcm_handle = q->handle;
767 tcm->tcm_info = atomic_read(&q->refcnt);
768 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
769 if (q->ops->dump && q->ops->dump(q, skb) < 0)
770 goto rtattr_failure;
771 q->qstats.qlen = q->q.qlen;
772
773 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
774 TCA_XSTATS, q->stats_lock, &d) < 0)
775 goto rtattr_failure;
776
777 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
778 goto rtattr_failure;
779
780 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
781#ifdef CONFIG_NET_ESTIMATOR
782 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
783#endif
784 gnet_stats_copy_queue(&d, &q->qstats) < 0)
785 goto rtattr_failure;
786
787 if (gnet_stats_finish_copy(&d) < 0)
788 goto rtattr_failure;
789
790 nlh->nlmsg_len = skb->tail - b;
791 return skb->len;
792
793nlmsg_failure:
794rtattr_failure:
795 skb_trim(skb, b - skb->data);
796 return -1;
797}
798
799static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
800 u32 clid, struct Qdisc *old, struct Qdisc *new)
801{
802 struct sk_buff *skb;
803 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
804
805 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
806 if (!skb)
807 return -ENOBUFS;
808
809 if (old && old->handle) {
810 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
811 goto err_out;
812 }
813 if (new) {
814 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
815 goto err_out;
816 }
817
818 if (skb->len)
Patrick McHardyac6d4392005-08-14 19:29:52 -0700819 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700820
821err_out:
822 kfree_skb(skb);
823 return -EINVAL;
824}
825
826static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
827{
828 int idx, q_idx;
829 int s_idx, s_q_idx;
830 struct net_device *dev;
831 struct Qdisc *q;
832
833 s_idx = cb->args[0];
834 s_q_idx = q_idx = cb->args[1];
835 read_lock(&dev_base_lock);
836 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
837 if (idx < s_idx)
838 continue;
839 if (idx > s_idx)
840 s_q_idx = 0;
841 read_lock_bh(&qdisc_tree_lock);
842 q_idx = 0;
843 list_for_each_entry(q, &dev->qdisc_list, list) {
844 if (q_idx < s_q_idx) {
845 q_idx++;
846 continue;
847 }
848 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
849 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
850 read_unlock_bh(&qdisc_tree_lock);
851 goto done;
852 }
853 q_idx++;
854 }
855 read_unlock_bh(&qdisc_tree_lock);
856 }
857
858done:
859 read_unlock(&dev_base_lock);
860
861 cb->args[0] = idx;
862 cb->args[1] = q_idx;
863
864 return skb->len;
865}
866
867
868
869/************************************************
870 * Traffic classes manipulation. *
871 ************************************************/
872
873
874
875static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
876{
877 struct tcmsg *tcm = NLMSG_DATA(n);
878 struct rtattr **tca = arg;
879 struct net_device *dev;
880 struct Qdisc *q = NULL;
881 struct Qdisc_class_ops *cops;
882 unsigned long cl = 0;
883 unsigned long new_cl;
884 u32 pid = tcm->tcm_parent;
885 u32 clid = tcm->tcm_handle;
886 u32 qid = TC_H_MAJ(clid);
887 int err;
888
889 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
890 return -ENODEV;
891
892 /*
893 parent == TC_H_UNSPEC - unspecified parent.
894 parent == TC_H_ROOT - class is root, which has no parent.
895 parent == X:0 - parent is root class.
896 parent == X:Y - parent is a node in hierarchy.
897 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
898
899 handle == 0:0 - generate handle from kernel pool.
900 handle == 0:Y - class is X:Y, where X:0 is qdisc.
901 handle == X:Y - clear.
902 handle == X:0 - root class.
903 */
904
905 /* Step 1. Determine qdisc handle X:0 */
906
907 if (pid != TC_H_ROOT) {
908 u32 qid1 = TC_H_MAJ(pid);
909
910 if (qid && qid1) {
911 /* If both majors are known, they must be identical. */
912 if (qid != qid1)
913 return -EINVAL;
914 } else if (qid1) {
915 qid = qid1;
916 } else if (qid == 0)
917 qid = dev->qdisc_sleeping->handle;
918
919 /* Now qid is genuine qdisc handle consistent
920 both with parent and child.
921
922 TC_H_MAJ(pid) still may be unspecified, complete it now.
923 */
924 if (pid)
925 pid = TC_H_MAKE(qid, pid);
926 } else {
927 if (qid == 0)
928 qid = dev->qdisc_sleeping->handle;
929 }
930
931 /* OK. Locate qdisc */
932 if ((q = qdisc_lookup(dev, qid)) == NULL)
933 return -ENOENT;
934
935 /* An check that it supports classes */
936 cops = q->ops->cl_ops;
937 if (cops == NULL)
938 return -EINVAL;
939
940 /* Now try to get class */
941 if (clid == 0) {
942 if (pid == TC_H_ROOT)
943 clid = qid;
944 } else
945 clid = TC_H_MAKE(qid, clid);
946
947 if (clid)
948 cl = cops->get(q, clid);
949
950 if (cl == 0) {
951 err = -ENOENT;
952 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
953 goto out;
954 } else {
955 switch (n->nlmsg_type) {
956 case RTM_NEWTCLASS:
957 err = -EEXIST;
958 if (n->nlmsg_flags&NLM_F_EXCL)
959 goto out;
960 break;
961 case RTM_DELTCLASS:
962 err = cops->delete(q, cl);
963 if (err == 0)
964 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
965 goto out;
966 case RTM_GETTCLASS:
967 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
968 goto out;
969 default:
970 err = -EINVAL;
971 goto out;
972 }
973 }
974
975 new_cl = cl;
976 err = cops->change(q, clid, pid, tca, &new_cl);
977 if (err == 0)
978 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
979
980out:
981 if (cl)
982 cops->put(q, cl);
983
984 return err;
985}
986
987
988static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
989 unsigned long cl,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700990 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700991{
992 struct tcmsg *tcm;
993 struct nlmsghdr *nlh;
994 unsigned char *b = skb->tail;
995 struct gnet_dump d;
996 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
997
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700998 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700999 tcm = NLMSG_DATA(nlh);
1000 tcm->tcm_family = AF_UNSPEC;
1001 tcm->tcm_ifindex = q->dev->ifindex;
1002 tcm->tcm_parent = q->handle;
1003 tcm->tcm_handle = q->handle;
1004 tcm->tcm_info = 0;
1005 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1006 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1007 goto rtattr_failure;
1008
1009 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1010 TCA_XSTATS, q->stats_lock, &d) < 0)
1011 goto rtattr_failure;
1012
1013 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1014 goto rtattr_failure;
1015
1016 if (gnet_stats_finish_copy(&d) < 0)
1017 goto rtattr_failure;
1018
1019 nlh->nlmsg_len = skb->tail - b;
1020 return skb->len;
1021
1022nlmsg_failure:
1023rtattr_failure:
1024 skb_trim(skb, b - skb->data);
1025 return -1;
1026}
1027
1028static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1029 struct Qdisc *q, unsigned long cl, int event)
1030{
1031 struct sk_buff *skb;
1032 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1033
1034 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1035 if (!skb)
1036 return -ENOBUFS;
1037
1038 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1039 kfree_skb(skb);
1040 return -EINVAL;
1041 }
1042
Patrick McHardyac6d4392005-08-14 19:29:52 -07001043 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001044}
1045
1046struct qdisc_dump_args
1047{
1048 struct qdisc_walker w;
1049 struct sk_buff *skb;
1050 struct netlink_callback *cb;
1051};
1052
1053static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1054{
1055 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1056
1057 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1058 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1059}
1060
1061static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1062{
1063 int t;
1064 int s_t;
1065 struct net_device *dev;
1066 struct Qdisc *q;
1067 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1068 struct qdisc_dump_args arg;
1069
1070 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1071 return 0;
1072 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1073 return 0;
1074
1075 s_t = cb->args[0];
1076 t = 0;
1077
1078 read_lock_bh(&qdisc_tree_lock);
1079 list_for_each_entry(q, &dev->qdisc_list, list) {
1080 if (t < s_t || !q->ops->cl_ops ||
1081 (tcm->tcm_parent &&
1082 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1083 t++;
1084 continue;
1085 }
1086 if (t > s_t)
1087 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1088 arg.w.fn = qdisc_class_dump;
1089 arg.skb = skb;
1090 arg.cb = cb;
1091 arg.w.stop = 0;
1092 arg.w.skip = cb->args[1];
1093 arg.w.count = 0;
1094 q->ops->cl_ops->walk(q, &arg.w);
1095 cb->args[1] = arg.w.count;
1096 if (arg.w.stop)
1097 break;
1098 t++;
1099 }
1100 read_unlock_bh(&qdisc_tree_lock);
1101
1102 cb->args[0] = t;
1103
1104 dev_put(dev);
1105 return skb->len;
1106}
1107
1108/* Main classifier routine: scans classifier chain attached
1109 to this qdisc, (optionally) tests for protocol and asks
1110 specific classifiers.
1111 */
1112int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1113 struct tcf_result *res)
1114{
1115 int err = 0;
1116 u32 protocol = skb->protocol;
1117#ifdef CONFIG_NET_CLS_ACT
1118 struct tcf_proto *otp = tp;
1119reclassify:
1120#endif
1121 protocol = skb->protocol;
1122
1123 for ( ; tp; tp = tp->next) {
1124 if ((tp->protocol == protocol ||
1125 tp->protocol == __constant_htons(ETH_P_ALL)) &&
1126 (err = tp->classify(skb, tp, res)) >= 0) {
1127#ifdef CONFIG_NET_CLS_ACT
1128 if ( TC_ACT_RECLASSIFY == err) {
1129 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1130 tp = otp;
1131
1132 if (MAX_REC_LOOP < verd++) {
1133 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1134 tp->prio&0xffff, ntohs(tp->protocol));
1135 return TC_ACT_SHOT;
1136 }
1137 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1138 goto reclassify;
1139 } else {
1140 if (skb->tc_verd)
1141 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1142 return err;
1143 }
1144#else
1145
1146 return err;
1147#endif
1148 }
1149
1150 }
1151 return -1;
1152}
1153
/*
 * Clock conversion factors reported through /proc/net/psched; both
 * default to 1 and are overwritten during initialisation depending on
 * the configured clock source.
 */
static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;
1156
#ifdef CONFIG_PROC_FS
/*
 * seq_file show callback: prints psched_tick_per_us,
 * psched_us_per_tick, the constant 1000000 and HZ as four
 * zero-padded hexadecimal fields on a single line.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   psched_tick_per_us, psched_us_per_tick,
		   1000000, HZ);

	return 0;
}

/* open handler: bind psched_show() to the file as a single-record seq_file. */
static int psched_open(struct inode *inode, struct file *file)
{
	void *data = PDE(inode)->data;

	return single_open(file, psched_show, data);
}

/* File operations for the "psched" proc entry. */
static struct file_operations psched_fops = {
	.owner		= THIS_MODULE,
	.open		= psched_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif
1180
1181#ifdef CONFIG_NET_SCH_CLK_CPU
1182psched_tdiff_t psched_clock_per_hz;
1183int psched_clock_scale;
1184EXPORT_SYMBOL(psched_clock_per_hz);
1185EXPORT_SYMBOL(psched_clock_scale);
1186
1187psched_time_t psched_time_base;
1188cycles_t psched_time_mark;
1189EXPORT_SYMBOL(psched_time_mark);
1190EXPORT_SYMBOL(psched_time_base);
1191
1192/*
1193 * Periodically adjust psched_time_base to avoid overflow
1194 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
1195 */
1196static void psched_tick(unsigned long);
1197static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);
1198
1199static void psched_tick(unsigned long dummy)
1200{
1201 if (sizeof(cycles_t) == sizeof(u32)) {
1202 psched_time_t dummy_stamp;
1203 PSCHED_GET_TIME(dummy_stamp);
1204 psched_timer.expires = jiffies + 1*HZ;
1205 add_timer(&psched_timer);
1206 }
1207}
1208
1209int __init psched_calibrate_clock(void)
1210{
1211 psched_time_t stamp, stamp1;
1212 struct timeval tv, tv1;
1213 psched_tdiff_t delay;
1214 long rdelay;
1215 unsigned long stop;
1216
1217 psched_tick(0);
1218 stop = jiffies + HZ/10;
1219 PSCHED_GET_TIME(stamp);
1220 do_gettimeofday(&tv);
1221 while (time_before(jiffies, stop)) {
1222 barrier();
1223 cpu_relax();
1224 }
1225 PSCHED_GET_TIME(stamp1);
1226 do_gettimeofday(&tv1);
1227
1228 delay = PSCHED_TDIFF(stamp1, stamp);
1229 rdelay = tv1.tv_usec - tv.tv_usec;
1230 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1231 if (rdelay > delay)
1232 return -1;
1233 delay /= rdelay;
1234 psched_tick_per_us = delay;
1235 while ((delay>>=1) != 0)
1236 psched_clock_scale++;
1237 psched_us_per_tick = 1<<psched_clock_scale;
1238 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1239 return 0;
1240}
1241#endif
1242
1243static int __init pktsched_init(void)
1244{
1245 struct rtnetlink_link *link_p;
1246
1247#ifdef CONFIG_NET_SCH_CLK_CPU
1248 if (psched_calibrate_clock() < 0)
1249 return -1;
1250#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1251 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1252 psched_us_per_tick = 1000000;
1253#endif
1254
1255 link_p = rtnetlink_links[PF_UNSPEC];
1256
1257 /* Setup rtnetlink links. It is made here to avoid
1258 exporting large number of public symbols.
1259 */
1260
1261 if (link_p) {
1262 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1263 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1264 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1265 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1266 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1267 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1268 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1269 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1270 }
1271
1272 register_qdisc(&pfifo_qdisc_ops);
1273 register_qdisc(&bfifo_qdisc_ops);
1274 proc_net_fops_create("psched", 0, &psched_fops);
1275
1276 return 0;
1277}
1278
/* Run pktsched_init() at subsystem init time. */
subsys_initcall(pktsched_init);

/* Symbols made available to other kernel code. */
EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);