blob: 65825f4409d98c9872898bac77422c37b481b092 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
Linus Torvalds1da177e2005-04-16 15:20:36 -070018#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/sched.h>
22#include <linux/string.h>
23#include <linux/mm.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/in.h>
27#include <linux/errno.h>
28#include <linux/interrupt.h>
29#include <linux/netdevice.h>
30#include <linux/skbuff.h>
31#include <linux/rtnetlink.h>
32#include <linux/init.h>
33#include <linux/proc_fs.h>
34#include <linux/seq_file.h>
35#include <linux/kmod.h>
36#include <linux/list.h>
37#include <linux/bitops.h>
38
39#include <net/sock.h>
40#include <net/pkt_sched.h>
41
42#include <asm/processor.h>
43#include <asm/uaccess.h>
44#include <asm/system.h>
45
46static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
47 struct Qdisc *old, struct Qdisc *new);
48static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
49 struct Qdisc *q, unsigned long cl, int event);
50
51/*
52
53 Short review.
54 -------------
55
56 This file consists of two interrelated parts:
57
58 1. queueing disciplines manager frontend.
59 2. traffic classes manager frontend.
60
61 Generally, queueing discipline ("qdisc") is a black box,
62 which is able to enqueue packets and to dequeue them (when
63 device is ready to send something) in order and at times
64 determined by algorithm hidden in it.
65
66 qdisc's are divided to two categories:
67 - "queues", which have no internal structure visible from outside.
68 - "schedulers", which split all the packets to "traffic classes",
69 using "packet classifiers" (look at cls_api.c)
70
71 In turn, classes may have child qdiscs (as rule, queues)
72 attached to them etc. etc. etc.
73
74 The goal of the routines in this file is to translate
75 information supplied by user in the form of handles
76 to more intelligible for kernel form, to make some sanity
77 checks and part of work, which is common to all qdiscs
78 and to provide rtnetlink notifications.
79
80 All real intelligent work is done inside qdisc modules.
81
82
83
84 Every discipline has two major routines: enqueue and dequeue.
85
86 ---dequeue
87
88 dequeue usually returns a skb to send. It is allowed to return NULL,
89 but it does not mean that queue is empty, it just means that
90 discipline does not want to send anything this time.
91 Queue is really empty if q->q.qlen == 0.
92 For complicated disciplines with multiple queues q->q is not
93 real packet queue, but however q->q.qlen must be valid.
94
95 ---enqueue
96
97 enqueue returns 0, if packet was enqueued successfully.
98 If packet (this one or another one) was dropped, it returns
99 not zero error code.
100 NET_XMIT_DROP - this packet dropped
101 Expected action: do not backoff, but wait until queue will clear.
102 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
103 Expected action: backoff or ignore
104 NET_XMIT_POLICED - dropped by police.
105 Expected action: backoff or error to real-time apps.
106
107 Auxiliary routines:
108
109 ---requeue
110
111 requeues once dequeued packet. It is used for non-standard or
112 just buggy devices, which can defer output even if dev->tbusy=0.
113
114 ---reset
115
116 returns qdisc to initial state: purge all buffers, clear all
117 timers, counters (except for statistics) etc.
118
119 ---init
120
121 initializes newly created qdisc.
122
123 ---destroy
124
125 destroys resources allocated by init and during lifetime of qdisc.
126
127 ---change
128
129 changes qdisc parameters.
130 */
131
132/* Protects list of registered TC modules. It is pure SMP lock. */
133static DEFINE_RWLOCK(qdisc_mod_lock);
134
135
136/************************************************
137 * Queueing disciplines manipulation. *
138 ************************************************/
139
140
141/* The list of all installed queueing disciplines. */
142
143static struct Qdisc_ops *qdisc_base;
144
145/* Register/uregister queueing discipline */
146
147int register_qdisc(struct Qdisc_ops *qops)
148{
149 struct Qdisc_ops *q, **qp;
150 int rc = -EEXIST;
151
152 write_lock(&qdisc_mod_lock);
153 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
154 if (!strcmp(qops->id, q->id))
155 goto out;
156
157 if (qops->enqueue == NULL)
158 qops->enqueue = noop_qdisc_ops.enqueue;
159 if (qops->requeue == NULL)
160 qops->requeue = noop_qdisc_ops.requeue;
161 if (qops->dequeue == NULL)
162 qops->dequeue = noop_qdisc_ops.dequeue;
163
164 qops->next = NULL;
165 *qp = qops;
166 rc = 0;
167out:
168 write_unlock(&qdisc_mod_lock);
169 return rc;
170}
171
172int unregister_qdisc(struct Qdisc_ops *qops)
173{
174 struct Qdisc_ops *q, **qp;
175 int err = -ENOENT;
176
177 write_lock(&qdisc_mod_lock);
178 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
179 if (q == qops)
180 break;
181 if (q) {
182 *qp = q->next;
183 q->next = NULL;
184 err = 0;
185 }
186 write_unlock(&qdisc_mod_lock);
187 return err;
188}
189
190/* We know handle. Find qdisc among all qdisc's attached to device
191 (root qdisc, all its children, children of children etc.)
192 */
193
Patrick McHardy43effa12006-11-29 17:35:48 -0800194static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
195{
196 struct Qdisc *q;
197
198 list_for_each_entry(q, &dev->qdisc_list, list) {
199 if (q->handle == handle)
200 return q;
201 }
202 return NULL;
203}
204
Linus Torvalds1da177e2005-04-16 15:20:36 -0700205struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
206{
207 struct Qdisc *q;
208
Patrick McHardy85670cc2006-09-27 16:45:45 -0700209 read_lock(&qdisc_tree_lock);
Patrick McHardy43effa12006-11-29 17:35:48 -0800210 q = __qdisc_lookup(dev, handle);
Patrick McHardy85670cc2006-09-27 16:45:45 -0700211 read_unlock(&qdisc_tree_lock);
Patrick McHardy43effa12006-11-29 17:35:48 -0800212 return q;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700213}
214
215static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
216{
217 unsigned long cl;
218 struct Qdisc *leaf;
219 struct Qdisc_class_ops *cops = p->ops->cl_ops;
220
221 if (cops == NULL)
222 return NULL;
223 cl = cops->get(p, classid);
224
225 if (cl == 0)
226 return NULL;
227 leaf = cops->leaf(p, cl);
228 cops->put(p, cl);
229 return leaf;
230}
231
232/* Find queueing discipline by name */
233
234static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
235{
236 struct Qdisc_ops *q = NULL;
237
238 if (kind) {
239 read_lock(&qdisc_mod_lock);
240 for (q = qdisc_base; q; q = q->next) {
241 if (rtattr_strcmp(kind, q->id) == 0) {
242 if (!try_module_get(q->owner))
243 q = NULL;
244 break;
245 }
246 }
247 read_unlock(&qdisc_mod_lock);
248 }
249 return q;
250}
251
252static struct qdisc_rate_table *qdisc_rtab_list;
253
254struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
255{
256 struct qdisc_rate_table *rtab;
257
258 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
259 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
260 rtab->refcnt++;
261 return rtab;
262 }
263 }
264
265 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
266 return NULL;
267
268 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
269 if (rtab) {
270 rtab->rate = *r;
271 rtab->refcnt = 1;
272 memcpy(rtab->data, RTA_DATA(tab), 1024);
273 rtab->next = qdisc_rtab_list;
274 qdisc_rtab_list = rtab;
275 }
276 return rtab;
277}
278
279void qdisc_put_rtab(struct qdisc_rate_table *tab)
280{
281 struct qdisc_rate_table *rtab, **rtabp;
282
283 if (!tab || --tab->refcnt)
284 return;
285
286 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
287 if (rtab == tab) {
288 *rtabp = rtab->next;
289 kfree(rtab);
290 return;
291 }
292 }
293}
294
295
296/* Allocate an unique handle from space managed by kernel */
297
298static u32 qdisc_alloc_handle(struct net_device *dev)
299{
300 int i = 0x10000;
301 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
302
303 do {
304 autohandle += TC_H_MAKE(0x10000U, 0);
305 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
306 autohandle = TC_H_MAKE(0x80000000U, 0);
307 } while (qdisc_lookup(dev, autohandle) && --i > 0);
308
309 return i>0 ? autohandle : 0;
310}
311
312/* Attach toplevel qdisc to device dev */
313
314static struct Qdisc *
315dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
316{
317 struct Qdisc *oqdisc;
318
319 if (dev->flags & IFF_UP)
320 dev_deactivate(dev);
321
322 qdisc_lock_tree(dev);
323 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
324 oqdisc = dev->qdisc_ingress;
325 /* Prune old scheduler */
326 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
327 /* delete */
328 qdisc_reset(oqdisc);
329 dev->qdisc_ingress = NULL;
330 } else { /* new */
331 dev->qdisc_ingress = qdisc;
332 }
333
334 } else {
335
336 oqdisc = dev->qdisc_sleeping;
337
338 /* Prune old scheduler */
339 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
340 qdisc_reset(oqdisc);
341
342 /* ... and graft new one */
343 if (qdisc == NULL)
344 qdisc = &noop_qdisc;
345 dev->qdisc_sleeping = qdisc;
346 dev->qdisc = &noop_qdisc;
347 }
348
349 qdisc_unlock_tree(dev);
350
351 if (dev->flags & IFF_UP)
352 dev_activate(dev);
353
354 return oqdisc;
355}
356
Patrick McHardy43effa12006-11-29 17:35:48 -0800357void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
358{
359 struct Qdisc_class_ops *cops;
360 unsigned long cl;
361 u32 parentid;
362
363 if (n == 0)
364 return;
365 while ((parentid = sch->parent)) {
366 sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
367 cops = sch->ops->cl_ops;
368 if (cops->qlen_notify) {
369 cl = cops->get(sch, parentid);
370 cops->qlen_notify(sch, cl);
371 cops->put(sch, cl);
372 }
373 sch->q.qlen -= n;
374 }
375}
376EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700377
378/* Graft qdisc "new" to class "classid" of qdisc "parent" or
379 to device "dev".
380
381 Old qdisc is not destroyed but returned in *old.
382 */
383
384static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
385 u32 classid,
386 struct Qdisc *new, struct Qdisc **old)
387{
388 int err = 0;
389 struct Qdisc *q = *old;
390
391
392 if (parent == NULL) {
393 if (q && q->flags&TCQ_F_INGRESS) {
394 *old = dev_graft_qdisc(dev, q);
395 } else {
396 *old = dev_graft_qdisc(dev, new);
397 }
398 } else {
399 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
400
401 err = -EINVAL;
402
403 if (cops) {
404 unsigned long cl = cops->get(parent, classid);
405 if (cl) {
406 err = cops->graft(parent, cl, new, old);
407 if (new)
408 new->parent = classid;
409 cops->put(parent, cl);
410 }
411 }
412 }
413 return err;
414}
415
416/*
417 Allocate and initialize new qdisc.
418
419 Parameters are passed via opt.
420 */
421
422static struct Qdisc *
423qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
424{
425 int err;
426 struct rtattr *kind = tca[TCA_KIND-1];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700427 struct Qdisc *sch;
428 struct Qdisc_ops *ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700429
430 ops = qdisc_lookup_ops(kind);
431#ifdef CONFIG_KMOD
432 if (ops == NULL && kind != NULL) {
433 char name[IFNAMSIZ];
434 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
435 /* We dropped the RTNL semaphore in order to
436 * perform the module load. So, even if we
437 * succeeded in loading the module we have to
438 * tell the caller to replay the request. We
439 * indicate this using -EAGAIN.
440 * We replay the request because the device may
441 * go away in the mean time.
442 */
443 rtnl_unlock();
444 request_module("sch_%s", name);
445 rtnl_lock();
446 ops = qdisc_lookup_ops(kind);
447 if (ops != NULL) {
448 /* We will try again qdisc_lookup_ops,
449 * so don't keep a reference.
450 */
451 module_put(ops->owner);
452 err = -EAGAIN;
453 goto err_out;
454 }
455 }
456 }
457#endif
458
Jamal Hadi Salimb9e2cc02006-08-03 16:36:51 -0700459 err = -ENOENT;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700460 if (ops == NULL)
461 goto err_out;
462
Thomas Graf3d54b822005-07-05 14:15:09 -0700463 sch = qdisc_alloc(dev, ops);
464 if (IS_ERR(sch)) {
465 err = PTR_ERR(sch);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700466 goto err_out2;
Thomas Graf3d54b822005-07-05 14:15:09 -0700467 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700468
Thomas Graf3d54b822005-07-05 14:15:09 -0700469 if (handle == TC_H_INGRESS) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700470 sch->flags |= TCQ_F_INGRESS;
Thomas Graf3d54b822005-07-05 14:15:09 -0700471 handle = TC_H_MAKE(TC_H_INGRESS, 0);
472 } else if (handle == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700473 handle = qdisc_alloc_handle(dev);
474 err = -ENOMEM;
475 if (handle == 0)
476 goto err_out3;
477 }
478
Thomas Graf3d54b822005-07-05 14:15:09 -0700479 sch->handle = handle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700480
481 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
Thomas Graf023e09a2005-07-05 14:15:53 -0700482#ifdef CONFIG_NET_ESTIMATOR
483 if (tca[TCA_RATE-1]) {
484 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
485 sch->stats_lock,
486 tca[TCA_RATE-1]);
487 if (err) {
488 /*
489 * Any broken qdiscs that would require
490 * a ops->reset() here? The qdisc was never
491 * in action so it shouldn't be necessary.
492 */
493 if (ops->destroy)
494 ops->destroy(sch);
495 goto err_out3;
496 }
497 }
498#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700499 qdisc_lock_tree(dev);
500 list_add_tail(&sch->list, &dev->qdisc_list);
501 qdisc_unlock_tree(dev);
502
Linus Torvalds1da177e2005-04-16 15:20:36 -0700503 return sch;
504 }
505err_out3:
506 dev_put(dev);
Thomas Graf3d54b822005-07-05 14:15:09 -0700507 kfree((char *) sch - sch->padded);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700508err_out2:
509 module_put(ops->owner);
510err_out:
511 *errp = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700512 return NULL;
513}
514
515static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
516{
517 if (tca[TCA_OPTIONS-1]) {
518 int err;
519
520 if (sch->ops->change == NULL)
521 return -EINVAL;
522 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
523 if (err)
524 return err;
525 }
526#ifdef CONFIG_NET_ESTIMATOR
527 if (tca[TCA_RATE-1])
528 gen_replace_estimator(&sch->bstats, &sch->rate_est,
529 sch->stats_lock, tca[TCA_RATE-1]);
530#endif
531 return 0;
532}
533
534struct check_loop_arg
535{
536 struct qdisc_walker w;
537 struct Qdisc *p;
538 int depth;
539};
540
541static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
542
543static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
544{
545 struct check_loop_arg arg;
546
547 if (q->ops->cl_ops == NULL)
548 return 0;
549
550 arg.w.stop = arg.w.skip = arg.w.count = 0;
551 arg.w.fn = check_loop_fn;
552 arg.depth = depth;
553 arg.p = p;
554 q->ops->cl_ops->walk(q, &arg.w);
555 return arg.w.stop ? -ELOOP : 0;
556}
557
558static int
559check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
560{
561 struct Qdisc *leaf;
562 struct Qdisc_class_ops *cops = q->ops->cl_ops;
563 struct check_loop_arg *arg = (struct check_loop_arg *)w;
564
565 leaf = cops->leaf(q, cl);
566 if (leaf) {
567 if (leaf == arg->p || arg->depth > 7)
568 return -ELOOP;
569 return check_loop(leaf, arg->p, arg->depth + 1);
570 }
571 return 0;
572}
573
574/*
575 * Delete/get qdisc.
576 */
577
578static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
579{
580 struct tcmsg *tcm = NLMSG_DATA(n);
581 struct rtattr **tca = arg;
582 struct net_device *dev;
583 u32 clid = tcm->tcm_parent;
584 struct Qdisc *q = NULL;
585 struct Qdisc *p = NULL;
586 int err;
587
588 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
589 return -ENODEV;
590
591 if (clid) {
592 if (clid != TC_H_ROOT) {
593 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
594 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
595 return -ENOENT;
596 q = qdisc_leaf(p, clid);
597 } else { /* ingress */
598 q = dev->qdisc_ingress;
599 }
600 } else {
601 q = dev->qdisc_sleeping;
602 }
603 if (!q)
604 return -ENOENT;
605
606 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
607 return -EINVAL;
608 } else {
609 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
610 return -ENOENT;
611 }
612
613 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
614 return -EINVAL;
615
616 if (n->nlmsg_type == RTM_DELQDISC) {
617 if (!clid)
618 return -EINVAL;
619 if (q->handle == 0)
620 return -ENOENT;
621 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
622 return err;
623 if (q) {
624 qdisc_notify(skb, n, clid, q, NULL);
625 spin_lock_bh(&dev->queue_lock);
626 qdisc_destroy(q);
627 spin_unlock_bh(&dev->queue_lock);
628 }
629 } else {
630 qdisc_notify(skb, n, clid, NULL, q);
631 }
632 return 0;
633}
634
635/*
636 Create/change qdisc.
637 */
638
639static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
640{
641 struct tcmsg *tcm;
642 struct rtattr **tca;
643 struct net_device *dev;
644 u32 clid;
645 struct Qdisc *q, *p;
646 int err;
647
648replay:
649 /* Reinit, just in case something touches this. */
650 tcm = NLMSG_DATA(n);
651 tca = arg;
652 clid = tcm->tcm_parent;
653 q = p = NULL;
654
655 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
656 return -ENODEV;
657
658 if (clid) {
659 if (clid != TC_H_ROOT) {
660 if (clid != TC_H_INGRESS) {
661 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
662 return -ENOENT;
663 q = qdisc_leaf(p, clid);
664 } else { /*ingress */
665 q = dev->qdisc_ingress;
666 }
667 } else {
668 q = dev->qdisc_sleeping;
669 }
670
671 /* It may be default qdisc, ignore it */
672 if (q && q->handle == 0)
673 q = NULL;
674
675 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
676 if (tcm->tcm_handle) {
677 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
678 return -EEXIST;
679 if (TC_H_MIN(tcm->tcm_handle))
680 return -EINVAL;
681 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
682 goto create_n_graft;
683 if (n->nlmsg_flags&NLM_F_EXCL)
684 return -EEXIST;
685 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
686 return -EINVAL;
687 if (q == p ||
688 (p && check_loop(q, p, 0)))
689 return -ELOOP;
690 atomic_inc(&q->refcnt);
691 goto graft;
692 } else {
693 if (q == NULL)
694 goto create_n_graft;
695
696 /* This magic test requires explanation.
697 *
698 * We know, that some child q is already
699 * attached to this parent and have choice:
700 * either to change it or to create/graft new one.
701 *
702 * 1. We are allowed to create/graft only
703 * if CREATE and REPLACE flags are set.
704 *
705 * 2. If EXCL is set, requestor wanted to say,
706 * that qdisc tcm_handle is not expected
707 * to exist, so that we choose create/graft too.
708 *
709 * 3. The last case is when no flags are set.
710 * Alas, it is sort of hole in API, we
711 * cannot decide what to do unambiguously.
712 * For now we select create/graft, if
713 * user gave KIND, which does not match existing.
714 */
715 if ((n->nlmsg_flags&NLM_F_CREATE) &&
716 (n->nlmsg_flags&NLM_F_REPLACE) &&
717 ((n->nlmsg_flags&NLM_F_EXCL) ||
718 (tca[TCA_KIND-1] &&
719 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
720 goto create_n_graft;
721 }
722 }
723 } else {
724 if (!tcm->tcm_handle)
725 return -EINVAL;
726 q = qdisc_lookup(dev, tcm->tcm_handle);
727 }
728
729 /* Change qdisc parameters */
730 if (q == NULL)
731 return -ENOENT;
732 if (n->nlmsg_flags&NLM_F_EXCL)
733 return -EEXIST;
734 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
735 return -EINVAL;
736 err = qdisc_change(q, tca);
737 if (err == 0)
738 qdisc_notify(skb, n, clid, NULL, q);
739 return err;
740
741create_n_graft:
742 if (!(n->nlmsg_flags&NLM_F_CREATE))
743 return -ENOENT;
744 if (clid == TC_H_INGRESS)
745 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
746 else
747 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
748 if (q == NULL) {
749 if (err == -EAGAIN)
750 goto replay;
751 return err;
752 }
753
754graft:
755 if (1) {
756 struct Qdisc *old_q = NULL;
757 err = qdisc_graft(dev, p, clid, q, &old_q);
758 if (err) {
759 if (q) {
760 spin_lock_bh(&dev->queue_lock);
761 qdisc_destroy(q);
762 spin_unlock_bh(&dev->queue_lock);
763 }
764 return err;
765 }
766 qdisc_notify(skb, n, clid, old_q, q);
767 if (old_q) {
768 spin_lock_bh(&dev->queue_lock);
769 qdisc_destroy(old_q);
770 spin_unlock_bh(&dev->queue_lock);
771 }
772 }
773 return 0;
774}
775
776static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700777 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700778{
779 struct tcmsg *tcm;
780 struct nlmsghdr *nlh;
781 unsigned char *b = skb->tail;
782 struct gnet_dump d;
783
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700784 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700785 tcm = NLMSG_DATA(nlh);
786 tcm->tcm_family = AF_UNSPEC;
Patrick McHardy9ef1d4c2005-06-28 12:55:30 -0700787 tcm->tcm__pad1 = 0;
788 tcm->tcm__pad2 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700789 tcm->tcm_ifindex = q->dev->ifindex;
790 tcm->tcm_parent = clid;
791 tcm->tcm_handle = q->handle;
792 tcm->tcm_info = atomic_read(&q->refcnt);
793 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
794 if (q->ops->dump && q->ops->dump(q, skb) < 0)
795 goto rtattr_failure;
796 q->qstats.qlen = q->q.qlen;
797
798 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
799 TCA_XSTATS, q->stats_lock, &d) < 0)
800 goto rtattr_failure;
801
802 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
803 goto rtattr_failure;
804
805 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
806#ifdef CONFIG_NET_ESTIMATOR
807 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
808#endif
809 gnet_stats_copy_queue(&d, &q->qstats) < 0)
810 goto rtattr_failure;
811
812 if (gnet_stats_finish_copy(&d) < 0)
813 goto rtattr_failure;
814
815 nlh->nlmsg_len = skb->tail - b;
816 return skb->len;
817
818nlmsg_failure:
819rtattr_failure:
820 skb_trim(skb, b - skb->data);
821 return -1;
822}
823
824static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
825 u32 clid, struct Qdisc *old, struct Qdisc *new)
826{
827 struct sk_buff *skb;
828 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
829
830 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
831 if (!skb)
832 return -ENOBUFS;
833
834 if (old && old->handle) {
835 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
836 goto err_out;
837 }
838 if (new) {
839 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
840 goto err_out;
841 }
842
843 if (skb->len)
Patrick McHardyac6d4392005-08-14 19:29:52 -0700844 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700845
846err_out:
847 kfree_skb(skb);
848 return -EINVAL;
849}
850
851static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
852{
853 int idx, q_idx;
854 int s_idx, s_q_idx;
855 struct net_device *dev;
856 struct Qdisc *q;
857
858 s_idx = cb->args[0];
859 s_q_idx = q_idx = cb->args[1];
860 read_lock(&dev_base_lock);
861 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
862 if (idx < s_idx)
863 continue;
864 if (idx > s_idx)
865 s_q_idx = 0;
Patrick McHardy85670cc2006-09-27 16:45:45 -0700866 read_lock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700867 q_idx = 0;
868 list_for_each_entry(q, &dev->qdisc_list, list) {
869 if (q_idx < s_q_idx) {
870 q_idx++;
871 continue;
872 }
873 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
874 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
Patrick McHardy85670cc2006-09-27 16:45:45 -0700875 read_unlock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700876 goto done;
877 }
878 q_idx++;
879 }
Patrick McHardy85670cc2006-09-27 16:45:45 -0700880 read_unlock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700881 }
882
883done:
884 read_unlock(&dev_base_lock);
885
886 cb->args[0] = idx;
887 cb->args[1] = q_idx;
888
889 return skb->len;
890}
891
892
893
894/************************************************
895 * Traffic classes manipulation. *
896 ************************************************/
897
898
899
900static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
901{
902 struct tcmsg *tcm = NLMSG_DATA(n);
903 struct rtattr **tca = arg;
904 struct net_device *dev;
905 struct Qdisc *q = NULL;
906 struct Qdisc_class_ops *cops;
907 unsigned long cl = 0;
908 unsigned long new_cl;
909 u32 pid = tcm->tcm_parent;
910 u32 clid = tcm->tcm_handle;
911 u32 qid = TC_H_MAJ(clid);
912 int err;
913
914 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
915 return -ENODEV;
916
917 /*
918 parent == TC_H_UNSPEC - unspecified parent.
919 parent == TC_H_ROOT - class is root, which has no parent.
920 parent == X:0 - parent is root class.
921 parent == X:Y - parent is a node in hierarchy.
922 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
923
924 handle == 0:0 - generate handle from kernel pool.
925 handle == 0:Y - class is X:Y, where X:0 is qdisc.
926 handle == X:Y - clear.
927 handle == X:0 - root class.
928 */
929
930 /* Step 1. Determine qdisc handle X:0 */
931
932 if (pid != TC_H_ROOT) {
933 u32 qid1 = TC_H_MAJ(pid);
934
935 if (qid && qid1) {
936 /* If both majors are known, they must be identical. */
937 if (qid != qid1)
938 return -EINVAL;
939 } else if (qid1) {
940 qid = qid1;
941 } else if (qid == 0)
942 qid = dev->qdisc_sleeping->handle;
943
944 /* Now qid is genuine qdisc handle consistent
945 both with parent and child.
946
947 TC_H_MAJ(pid) still may be unspecified, complete it now.
948 */
949 if (pid)
950 pid = TC_H_MAKE(qid, pid);
951 } else {
952 if (qid == 0)
953 qid = dev->qdisc_sleeping->handle;
954 }
955
956 /* OK. Locate qdisc */
957 if ((q = qdisc_lookup(dev, qid)) == NULL)
958 return -ENOENT;
959
960 /* An check that it supports classes */
961 cops = q->ops->cl_ops;
962 if (cops == NULL)
963 return -EINVAL;
964
965 /* Now try to get class */
966 if (clid == 0) {
967 if (pid == TC_H_ROOT)
968 clid = qid;
969 } else
970 clid = TC_H_MAKE(qid, clid);
971
972 if (clid)
973 cl = cops->get(q, clid);
974
975 if (cl == 0) {
976 err = -ENOENT;
977 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
978 goto out;
979 } else {
980 switch (n->nlmsg_type) {
981 case RTM_NEWTCLASS:
982 err = -EEXIST;
983 if (n->nlmsg_flags&NLM_F_EXCL)
984 goto out;
985 break;
986 case RTM_DELTCLASS:
987 err = cops->delete(q, cl);
988 if (err == 0)
989 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
990 goto out;
991 case RTM_GETTCLASS:
992 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
993 goto out;
994 default:
995 err = -EINVAL;
996 goto out;
997 }
998 }
999
1000 new_cl = cl;
1001 err = cops->change(q, clid, pid, tca, &new_cl);
1002 if (err == 0)
1003 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1004
1005out:
1006 if (cl)
1007 cops->put(q, cl);
1008
1009 return err;
1010}
1011
1012
1013static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1014 unsigned long cl,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -07001015 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001016{
1017 struct tcmsg *tcm;
1018 struct nlmsghdr *nlh;
1019 unsigned char *b = skb->tail;
1020 struct gnet_dump d;
1021 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1022
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -07001023 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001024 tcm = NLMSG_DATA(nlh);
1025 tcm->tcm_family = AF_UNSPEC;
1026 tcm->tcm_ifindex = q->dev->ifindex;
1027 tcm->tcm_parent = q->handle;
1028 tcm->tcm_handle = q->handle;
1029 tcm->tcm_info = 0;
1030 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1031 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1032 goto rtattr_failure;
1033
1034 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1035 TCA_XSTATS, q->stats_lock, &d) < 0)
1036 goto rtattr_failure;
1037
1038 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1039 goto rtattr_failure;
1040
1041 if (gnet_stats_finish_copy(&d) < 0)
1042 goto rtattr_failure;
1043
1044 nlh->nlmsg_len = skb->tail - b;
1045 return skb->len;
1046
1047nlmsg_failure:
1048rtattr_failure:
1049 skb_trim(skb, b - skb->data);
1050 return -1;
1051}
1052
1053static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1054 struct Qdisc *q, unsigned long cl, int event)
1055{
1056 struct sk_buff *skb;
1057 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1058
1059 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1060 if (!skb)
1061 return -ENOBUFS;
1062
1063 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1064 kfree_skb(skb);
1065 return -EINVAL;
1066 }
1067
Patrick McHardyac6d4392005-08-14 19:29:52 -07001068 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001069}
1070
1071struct qdisc_dump_args
1072{
1073 struct qdisc_walker w;
1074 struct sk_buff *skb;
1075 struct netlink_callback *cb;
1076};
1077
1078static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1079{
1080 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1081
1082 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1083 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1084}
1085
1086static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1087{
1088 int t;
1089 int s_t;
1090 struct net_device *dev;
1091 struct Qdisc *q;
1092 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1093 struct qdisc_dump_args arg;
1094
1095 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1096 return 0;
1097 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1098 return 0;
1099
1100 s_t = cb->args[0];
1101 t = 0;
1102
Patrick McHardy85670cc2006-09-27 16:45:45 -07001103 read_lock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001104 list_for_each_entry(q, &dev->qdisc_list, list) {
1105 if (t < s_t || !q->ops->cl_ops ||
1106 (tcm->tcm_parent &&
1107 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1108 t++;
1109 continue;
1110 }
1111 if (t > s_t)
1112 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1113 arg.w.fn = qdisc_class_dump;
1114 arg.skb = skb;
1115 arg.cb = cb;
1116 arg.w.stop = 0;
1117 arg.w.skip = cb->args[1];
1118 arg.w.count = 0;
1119 q->ops->cl_ops->walk(q, &arg.w);
1120 cb->args[1] = arg.w.count;
1121 if (arg.w.stop)
1122 break;
1123 t++;
1124 }
Patrick McHardy85670cc2006-09-27 16:45:45 -07001125 read_unlock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001126
1127 cb->args[0] = t;
1128
1129 dev_put(dev);
1130 return skb->len;
1131}
1132
1133/* Main classifier routine: scans classifier chain attached
1134 to this qdisc, (optionally) tests for protocol and asks
1135 specific classifiers.
1136 */
1137int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1138 struct tcf_result *res)
1139{
1140 int err = 0;
Al Viro66c6f522006-11-20 18:07:51 -08001141 __be16 protocol = skb->protocol;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001142#ifdef CONFIG_NET_CLS_ACT
1143 struct tcf_proto *otp = tp;
1144reclassify:
1145#endif
1146 protocol = skb->protocol;
1147
1148 for ( ; tp; tp = tp->next) {
1149 if ((tp->protocol == protocol ||
1150 tp->protocol == __constant_htons(ETH_P_ALL)) &&
1151 (err = tp->classify(skb, tp, res)) >= 0) {
1152#ifdef CONFIG_NET_CLS_ACT
1153 if ( TC_ACT_RECLASSIFY == err) {
1154 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1155 tp = otp;
1156
1157 if (MAX_REC_LOOP < verd++) {
1158 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1159 tp->prio&0xffff, ntohs(tp->protocol));
1160 return TC_ACT_SHOT;
1161 }
1162 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1163 goto reclassify;
1164 } else {
1165 if (skb->tc_verd)
1166 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1167 return err;
1168 }
1169#else
1170
1171 return err;
1172#endif
1173 }
1174
1175 }
1176 return -1;
1177}
1178
1179static int psched_us_per_tick = 1;
1180static int psched_tick_per_us = 1;
1181
1182#ifdef CONFIG_PROC_FS
1183static int psched_show(struct seq_file *seq, void *v)
1184{
1185 seq_printf(seq, "%08x %08x %08x %08x\n",
1186 psched_tick_per_us, psched_us_per_tick,
1187 1000000, HZ);
1188
1189 return 0;
1190}
1191
1192static int psched_open(struct inode *inode, struct file *file)
1193{
1194 return single_open(file, psched_show, PDE(inode)->data);
1195}
1196
1197static struct file_operations psched_fops = {
1198 .owner = THIS_MODULE,
1199 .open = psched_open,
1200 .read = seq_read,
1201 .llseek = seq_lseek,
1202 .release = single_release,
1203};
1204#endif
1205
1206#ifdef CONFIG_NET_SCH_CLK_CPU
1207psched_tdiff_t psched_clock_per_hz;
1208int psched_clock_scale;
1209EXPORT_SYMBOL(psched_clock_per_hz);
1210EXPORT_SYMBOL(psched_clock_scale);
1211
1212psched_time_t psched_time_base;
1213cycles_t psched_time_mark;
1214EXPORT_SYMBOL(psched_time_mark);
1215EXPORT_SYMBOL(psched_time_base);
1216
1217/*
1218 * Periodically adjust psched_time_base to avoid overflow
1219 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
1220 */
1221static void psched_tick(unsigned long);
Ingo Molnar8d06afa2005-09-09 13:10:40 -07001222static DEFINE_TIMER(psched_timer, psched_tick, 0, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001223
1224static void psched_tick(unsigned long dummy)
1225{
1226 if (sizeof(cycles_t) == sizeof(u32)) {
1227 psched_time_t dummy_stamp;
1228 PSCHED_GET_TIME(dummy_stamp);
1229 psched_timer.expires = jiffies + 1*HZ;
1230 add_timer(&psched_timer);
1231 }
1232}
1233
1234int __init psched_calibrate_clock(void)
1235{
1236 psched_time_t stamp, stamp1;
1237 struct timeval tv, tv1;
1238 psched_tdiff_t delay;
1239 long rdelay;
1240 unsigned long stop;
1241
1242 psched_tick(0);
1243 stop = jiffies + HZ/10;
1244 PSCHED_GET_TIME(stamp);
1245 do_gettimeofday(&tv);
1246 while (time_before(jiffies, stop)) {
1247 barrier();
1248 cpu_relax();
1249 }
1250 PSCHED_GET_TIME(stamp1);
1251 do_gettimeofday(&tv1);
1252
1253 delay = PSCHED_TDIFF(stamp1, stamp);
1254 rdelay = tv1.tv_usec - tv.tv_usec;
1255 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1256 if (rdelay > delay)
1257 return -1;
1258 delay /= rdelay;
1259 psched_tick_per_us = delay;
1260 while ((delay>>=1) != 0)
1261 psched_clock_scale++;
1262 psched_us_per_tick = 1<<psched_clock_scale;
1263 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1264 return 0;
1265}
1266#endif
1267
1268static int __init pktsched_init(void)
1269{
1270 struct rtnetlink_link *link_p;
1271
1272#ifdef CONFIG_NET_SCH_CLK_CPU
1273 if (psched_calibrate_clock() < 0)
1274 return -1;
1275#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1276 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1277 psched_us_per_tick = 1000000;
1278#endif
1279
1280 link_p = rtnetlink_links[PF_UNSPEC];
1281
1282 /* Setup rtnetlink links. It is made here to avoid
1283 exporting large number of public symbols.
1284 */
1285
1286 if (link_p) {
1287 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1288 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1289 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1290 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1291 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1292 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1293 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1294 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1295 }
1296
1297 register_qdisc(&pfifo_qdisc_ops);
1298 register_qdisc(&bfifo_qdisc_ops);
1299 proc_net_fops_create("psched", 0, &psched_fops);
1300
1301 return 0;
1302}
1303
1304subsys_initcall(pktsched_init);
1305
Linus Torvalds1da177e2005-04-16 15:20:36 -07001306EXPORT_SYMBOL(qdisc_get_rtab);
1307EXPORT_SYMBOL(qdisc_put_rtab);
1308EXPORT_SYMBOL(register_qdisc);
1309EXPORT_SYMBOL(unregister_qdisc);
1310EXPORT_SYMBOL(tc_classify);