blob: d71bf79eb80b2fb1402d361436534372e7491f22 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
Linus Torvalds1da177e2005-04-16 15:20:36 -070018#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070021#include <linux/string.h>
22#include <linux/mm.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/in.h>
26#include <linux/errno.h>
27#include <linux/interrupt.h>
28#include <linux/netdevice.h>
29#include <linux/skbuff.h>
30#include <linux/rtnetlink.h>
31#include <linux/init.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/kmod.h>
35#include <linux/list.h>
36#include <linux/bitops.h>
37
38#include <net/sock.h>
39#include <net/pkt_sched.h>
40
41#include <asm/processor.h>
42#include <asm/uaccess.h>
43#include <asm/system.h>
44
45static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
46 struct Qdisc *old, struct Qdisc *new);
47static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
48 struct Qdisc *q, unsigned long cl, int event);
49
50/*
51
52 Short review.
53 -------------
54
55 This file consists of two interrelated parts:
56
57 1. queueing disciplines manager frontend.
58 2. traffic classes manager frontend.
59
60 Generally, queueing discipline ("qdisc") is a black box,
61 which is able to enqueue packets and to dequeue them (when
62 device is ready to send something) in order and at times
63 determined by algorithm hidden in it.
64
65 qdisc's are divided to two categories:
66 - "queues", which have no internal structure visible from outside.
67 - "schedulers", which split all the packets to "traffic classes",
68 using "packet classifiers" (look at cls_api.c)
69
70 In turn, classes may have child qdiscs (as rule, queues)
71 attached to them etc. etc. etc.
72
73 The goal of the routines in this file is to translate
74 information supplied by user in the form of handles
75 to more intelligible for kernel form, to make some sanity
76 checks and part of work, which is common to all qdiscs
77 and to provide rtnetlink notifications.
78
79 All real intelligent work is done inside qdisc modules.
80
81
82
83 Every discipline has two major routines: enqueue and dequeue.
84
85 ---dequeue
86
87 dequeue usually returns a skb to send. It is allowed to return NULL,
88 but it does not mean that queue is empty, it just means that
89 discipline does not want to send anything this time.
90 Queue is really empty if q->q.qlen == 0.
91 For complicated disciplines with multiple queues q->q is not
92 real packet queue, but however q->q.qlen must be valid.
93
94 ---enqueue
95
96 enqueue returns 0, if packet was enqueued successfully.
97 If packet (this one or another one) was dropped, it returns
98 not zero error code.
99 NET_XMIT_DROP - this packet dropped
100 Expected action: do not backoff, but wait until queue will clear.
101 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
102 Expected action: backoff or ignore
103 NET_XMIT_POLICED - dropped by police.
104 Expected action: backoff or error to real-time apps.
105
106 Auxiliary routines:
107
108 ---requeue
109
110 requeues once dequeued packet. It is used for non-standard or
111 just buggy devices, which can defer output even if dev->tbusy=0.
112
113 ---reset
114
115 returns qdisc to initial state: purge all buffers, clear all
116 timers, counters (except for statistics) etc.
117
118 ---init
119
120 initializes newly created qdisc.
121
122 ---destroy
123
124 destroys resources allocated by init and during lifetime of qdisc.
125
126 ---change
127
128 changes qdisc parameters.
129 */
130
131/* Protects list of registered TC modules. It is pure SMP lock. */
132static DEFINE_RWLOCK(qdisc_mod_lock);
133
134
135/************************************************
136 * Queueing disciplines manipulation. *
137 ************************************************/
138
139
140/* The list of all installed queueing disciplines. */
141
142static struct Qdisc_ops *qdisc_base;
143
144/* Register/uregister queueing discipline */
145
146int register_qdisc(struct Qdisc_ops *qops)
147{
148 struct Qdisc_ops *q, **qp;
149 int rc = -EEXIST;
150
151 write_lock(&qdisc_mod_lock);
152 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
153 if (!strcmp(qops->id, q->id))
154 goto out;
155
156 if (qops->enqueue == NULL)
157 qops->enqueue = noop_qdisc_ops.enqueue;
158 if (qops->requeue == NULL)
159 qops->requeue = noop_qdisc_ops.requeue;
160 if (qops->dequeue == NULL)
161 qops->dequeue = noop_qdisc_ops.dequeue;
162
163 qops->next = NULL;
164 *qp = qops;
165 rc = 0;
166out:
167 write_unlock(&qdisc_mod_lock);
168 return rc;
169}
170
171int unregister_qdisc(struct Qdisc_ops *qops)
172{
173 struct Qdisc_ops *q, **qp;
174 int err = -ENOENT;
175
176 write_lock(&qdisc_mod_lock);
177 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
178 if (q == qops)
179 break;
180 if (q) {
181 *qp = q->next;
182 q->next = NULL;
183 err = 0;
184 }
185 write_unlock(&qdisc_mod_lock);
186 return err;
187}
188
189/* We know handle. Find qdisc among all qdisc's attached to device
190 (root qdisc, all its children, children of children etc.)
191 */
192
Patrick McHardy43effa12006-11-29 17:35:48 -0800193static struct Qdisc *__qdisc_lookup(struct net_device *dev, u32 handle)
194{
195 struct Qdisc *q;
196
197 list_for_each_entry(q, &dev->qdisc_list, list) {
198 if (q->handle == handle)
199 return q;
200 }
201 return NULL;
202}
203
Linus Torvalds1da177e2005-04-16 15:20:36 -0700204struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
205{
206 struct Qdisc *q;
207
Patrick McHardy85670cc2006-09-27 16:45:45 -0700208 read_lock(&qdisc_tree_lock);
Patrick McHardy43effa12006-11-29 17:35:48 -0800209 q = __qdisc_lookup(dev, handle);
Patrick McHardy85670cc2006-09-27 16:45:45 -0700210 read_unlock(&qdisc_tree_lock);
Patrick McHardy43effa12006-11-29 17:35:48 -0800211 return q;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700212}
213
214static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
215{
216 unsigned long cl;
217 struct Qdisc *leaf;
218 struct Qdisc_class_ops *cops = p->ops->cl_ops;
219
220 if (cops == NULL)
221 return NULL;
222 cl = cops->get(p, classid);
223
224 if (cl == 0)
225 return NULL;
226 leaf = cops->leaf(p, cl);
227 cops->put(p, cl);
228 return leaf;
229}
230
231/* Find queueing discipline by name */
232
233static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
234{
235 struct Qdisc_ops *q = NULL;
236
237 if (kind) {
238 read_lock(&qdisc_mod_lock);
239 for (q = qdisc_base; q; q = q->next) {
240 if (rtattr_strcmp(kind, q->id) == 0) {
241 if (!try_module_get(q->owner))
242 q = NULL;
243 break;
244 }
245 }
246 read_unlock(&qdisc_mod_lock);
247 }
248 return q;
249}
250
251static struct qdisc_rate_table *qdisc_rtab_list;
252
253struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
254{
255 struct qdisc_rate_table *rtab;
256
257 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
258 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
259 rtab->refcnt++;
260 return rtab;
261 }
262 }
263
264 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
265 return NULL;
266
267 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
268 if (rtab) {
269 rtab->rate = *r;
270 rtab->refcnt = 1;
271 memcpy(rtab->data, RTA_DATA(tab), 1024);
272 rtab->next = qdisc_rtab_list;
273 qdisc_rtab_list = rtab;
274 }
275 return rtab;
276}
277
278void qdisc_put_rtab(struct qdisc_rate_table *tab)
279{
280 struct qdisc_rate_table *rtab, **rtabp;
281
282 if (!tab || --tab->refcnt)
283 return;
284
285 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
286 if (rtab == tab) {
287 *rtabp = rtab->next;
288 kfree(rtab);
289 return;
290 }
291 }
292}
293
294
295/* Allocate an unique handle from space managed by kernel */
296
297static u32 qdisc_alloc_handle(struct net_device *dev)
298{
299 int i = 0x10000;
300 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
301
302 do {
303 autohandle += TC_H_MAKE(0x10000U, 0);
304 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
305 autohandle = TC_H_MAKE(0x80000000U, 0);
306 } while (qdisc_lookup(dev, autohandle) && --i > 0);
307
308 return i>0 ? autohandle : 0;
309}
310
311/* Attach toplevel qdisc to device dev */
312
313static struct Qdisc *
314dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
315{
316 struct Qdisc *oqdisc;
317
318 if (dev->flags & IFF_UP)
319 dev_deactivate(dev);
320
321 qdisc_lock_tree(dev);
322 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
323 oqdisc = dev->qdisc_ingress;
324 /* Prune old scheduler */
325 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
326 /* delete */
327 qdisc_reset(oqdisc);
328 dev->qdisc_ingress = NULL;
329 } else { /* new */
330 dev->qdisc_ingress = qdisc;
331 }
332
333 } else {
334
335 oqdisc = dev->qdisc_sleeping;
336
337 /* Prune old scheduler */
338 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
339 qdisc_reset(oqdisc);
340
341 /* ... and graft new one */
342 if (qdisc == NULL)
343 qdisc = &noop_qdisc;
344 dev->qdisc_sleeping = qdisc;
345 dev->qdisc = &noop_qdisc;
346 }
347
348 qdisc_unlock_tree(dev);
349
350 if (dev->flags & IFF_UP)
351 dev_activate(dev);
352
353 return oqdisc;
354}
355
Patrick McHardy43effa12006-11-29 17:35:48 -0800356void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
357{
358 struct Qdisc_class_ops *cops;
359 unsigned long cl;
360 u32 parentid;
361
362 if (n == 0)
363 return;
364 while ((parentid = sch->parent)) {
365 sch = __qdisc_lookup(sch->dev, TC_H_MAJ(parentid));
366 cops = sch->ops->cl_ops;
367 if (cops->qlen_notify) {
368 cl = cops->get(sch, parentid);
369 cops->qlen_notify(sch, cl);
370 cops->put(sch, cl);
371 }
372 sch->q.qlen -= n;
373 }
374}
375EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700376
377/* Graft qdisc "new" to class "classid" of qdisc "parent" or
378 to device "dev".
379
380 Old qdisc is not destroyed but returned in *old.
381 */
382
383static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
384 u32 classid,
385 struct Qdisc *new, struct Qdisc **old)
386{
387 int err = 0;
388 struct Qdisc *q = *old;
389
390
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900391 if (parent == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700392 if (q && q->flags&TCQ_F_INGRESS) {
393 *old = dev_graft_qdisc(dev, q);
394 } else {
395 *old = dev_graft_qdisc(dev, new);
396 }
397 } else {
398 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
399
400 err = -EINVAL;
401
402 if (cops) {
403 unsigned long cl = cops->get(parent, classid);
404 if (cl) {
405 err = cops->graft(parent, cl, new, old);
406 if (new)
407 new->parent = classid;
408 cops->put(parent, cl);
409 }
410 }
411 }
412 return err;
413}
414
415/*
416 Allocate and initialize new qdisc.
417
418 Parameters are passed via opt.
419 */
420
421static struct Qdisc *
422qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
423{
424 int err;
425 struct rtattr *kind = tca[TCA_KIND-1];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700426 struct Qdisc *sch;
427 struct Qdisc_ops *ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700428
429 ops = qdisc_lookup_ops(kind);
430#ifdef CONFIG_KMOD
431 if (ops == NULL && kind != NULL) {
432 char name[IFNAMSIZ];
433 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
434 /* We dropped the RTNL semaphore in order to
435 * perform the module load. So, even if we
436 * succeeded in loading the module we have to
437 * tell the caller to replay the request. We
438 * indicate this using -EAGAIN.
439 * We replay the request because the device may
440 * go away in the mean time.
441 */
442 rtnl_unlock();
443 request_module("sch_%s", name);
444 rtnl_lock();
445 ops = qdisc_lookup_ops(kind);
446 if (ops != NULL) {
447 /* We will try again qdisc_lookup_ops,
448 * so don't keep a reference.
449 */
450 module_put(ops->owner);
451 err = -EAGAIN;
452 goto err_out;
453 }
454 }
455 }
456#endif
457
Jamal Hadi Salimb9e2cc02006-08-03 16:36:51 -0700458 err = -ENOENT;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700459 if (ops == NULL)
460 goto err_out;
461
Thomas Graf3d54b822005-07-05 14:15:09 -0700462 sch = qdisc_alloc(dev, ops);
463 if (IS_ERR(sch)) {
464 err = PTR_ERR(sch);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700465 goto err_out2;
Thomas Graf3d54b822005-07-05 14:15:09 -0700466 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700467
Thomas Graf3d54b822005-07-05 14:15:09 -0700468 if (handle == TC_H_INGRESS) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700469 sch->flags |= TCQ_F_INGRESS;
Thomas Graf3d54b822005-07-05 14:15:09 -0700470 handle = TC_H_MAKE(TC_H_INGRESS, 0);
471 } else if (handle == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700472 handle = qdisc_alloc_handle(dev);
473 err = -ENOMEM;
474 if (handle == 0)
475 goto err_out3;
476 }
477
Thomas Graf3d54b822005-07-05 14:15:09 -0700478 sch->handle = handle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700479
480 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
Thomas Graf023e09a2005-07-05 14:15:53 -0700481#ifdef CONFIG_NET_ESTIMATOR
482 if (tca[TCA_RATE-1]) {
483 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
484 sch->stats_lock,
485 tca[TCA_RATE-1]);
486 if (err) {
487 /*
488 * Any broken qdiscs that would require
489 * a ops->reset() here? The qdisc was never
490 * in action so it shouldn't be necessary.
491 */
492 if (ops->destroy)
493 ops->destroy(sch);
494 goto err_out3;
495 }
496 }
497#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700498 qdisc_lock_tree(dev);
499 list_add_tail(&sch->list, &dev->qdisc_list);
500 qdisc_unlock_tree(dev);
501
Linus Torvalds1da177e2005-04-16 15:20:36 -0700502 return sch;
503 }
504err_out3:
505 dev_put(dev);
Thomas Graf3d54b822005-07-05 14:15:09 -0700506 kfree((char *) sch - sch->padded);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700507err_out2:
508 module_put(ops->owner);
509err_out:
510 *errp = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700511 return NULL;
512}
513
514static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
515{
516 if (tca[TCA_OPTIONS-1]) {
517 int err;
518
519 if (sch->ops->change == NULL)
520 return -EINVAL;
521 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
522 if (err)
523 return err;
524 }
525#ifdef CONFIG_NET_ESTIMATOR
526 if (tca[TCA_RATE-1])
527 gen_replace_estimator(&sch->bstats, &sch->rate_est,
528 sch->stats_lock, tca[TCA_RATE-1]);
529#endif
530 return 0;
531}
532
533struct check_loop_arg
534{
535 struct qdisc_walker w;
536 struct Qdisc *p;
537 int depth;
538};
539
540static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
541
542static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
543{
544 struct check_loop_arg arg;
545
546 if (q->ops->cl_ops == NULL)
547 return 0;
548
549 arg.w.stop = arg.w.skip = arg.w.count = 0;
550 arg.w.fn = check_loop_fn;
551 arg.depth = depth;
552 arg.p = p;
553 q->ops->cl_ops->walk(q, &arg.w);
554 return arg.w.stop ? -ELOOP : 0;
555}
556
557static int
558check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
559{
560 struct Qdisc *leaf;
561 struct Qdisc_class_ops *cops = q->ops->cl_ops;
562 struct check_loop_arg *arg = (struct check_loop_arg *)w;
563
564 leaf = cops->leaf(q, cl);
565 if (leaf) {
566 if (leaf == arg->p || arg->depth > 7)
567 return -ELOOP;
568 return check_loop(leaf, arg->p, arg->depth + 1);
569 }
570 return 0;
571}
572
573/*
574 * Delete/get qdisc.
575 */
576
577static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
578{
579 struct tcmsg *tcm = NLMSG_DATA(n);
580 struct rtattr **tca = arg;
581 struct net_device *dev;
582 u32 clid = tcm->tcm_parent;
583 struct Qdisc *q = NULL;
584 struct Qdisc *p = NULL;
585 int err;
586
587 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
588 return -ENODEV;
589
590 if (clid) {
591 if (clid != TC_H_ROOT) {
592 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
593 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
594 return -ENOENT;
595 q = qdisc_leaf(p, clid);
596 } else { /* ingress */
597 q = dev->qdisc_ingress;
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900598 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700599 } else {
600 q = dev->qdisc_sleeping;
601 }
602 if (!q)
603 return -ENOENT;
604
605 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
606 return -EINVAL;
607 } else {
608 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
609 return -ENOENT;
610 }
611
612 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
613 return -EINVAL;
614
615 if (n->nlmsg_type == RTM_DELQDISC) {
616 if (!clid)
617 return -EINVAL;
618 if (q->handle == 0)
619 return -ENOENT;
620 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
621 return err;
622 if (q) {
623 qdisc_notify(skb, n, clid, q, NULL);
624 spin_lock_bh(&dev->queue_lock);
625 qdisc_destroy(q);
626 spin_unlock_bh(&dev->queue_lock);
627 }
628 } else {
629 qdisc_notify(skb, n, clid, NULL, q);
630 }
631 return 0;
632}
633
634/*
635 Create/change qdisc.
636 */
637
638static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
639{
640 struct tcmsg *tcm;
641 struct rtattr **tca;
642 struct net_device *dev;
643 u32 clid;
644 struct Qdisc *q, *p;
645 int err;
646
647replay:
648 /* Reinit, just in case something touches this. */
649 tcm = NLMSG_DATA(n);
650 tca = arg;
651 clid = tcm->tcm_parent;
652 q = p = NULL;
653
654 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
655 return -ENODEV;
656
657 if (clid) {
658 if (clid != TC_H_ROOT) {
659 if (clid != TC_H_INGRESS) {
660 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
661 return -ENOENT;
662 q = qdisc_leaf(p, clid);
663 } else { /*ingress */
664 q = dev->qdisc_ingress;
665 }
666 } else {
667 q = dev->qdisc_sleeping;
668 }
669
670 /* It may be default qdisc, ignore it */
671 if (q && q->handle == 0)
672 q = NULL;
673
674 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
675 if (tcm->tcm_handle) {
676 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
677 return -EEXIST;
678 if (TC_H_MIN(tcm->tcm_handle))
679 return -EINVAL;
680 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
681 goto create_n_graft;
682 if (n->nlmsg_flags&NLM_F_EXCL)
683 return -EEXIST;
684 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
685 return -EINVAL;
686 if (q == p ||
687 (p && check_loop(q, p, 0)))
688 return -ELOOP;
689 atomic_inc(&q->refcnt);
690 goto graft;
691 } else {
692 if (q == NULL)
693 goto create_n_graft;
694
695 /* This magic test requires explanation.
696 *
697 * We know, that some child q is already
698 * attached to this parent and have choice:
699 * either to change it or to create/graft new one.
700 *
701 * 1. We are allowed to create/graft only
702 * if CREATE and REPLACE flags are set.
703 *
704 * 2. If EXCL is set, requestor wanted to say,
705 * that qdisc tcm_handle is not expected
706 * to exist, so that we choose create/graft too.
707 *
708 * 3. The last case is when no flags are set.
709 * Alas, it is sort of hole in API, we
710 * cannot decide what to do unambiguously.
711 * For now we select create/graft, if
712 * user gave KIND, which does not match existing.
713 */
714 if ((n->nlmsg_flags&NLM_F_CREATE) &&
715 (n->nlmsg_flags&NLM_F_REPLACE) &&
716 ((n->nlmsg_flags&NLM_F_EXCL) ||
717 (tca[TCA_KIND-1] &&
718 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
719 goto create_n_graft;
720 }
721 }
722 } else {
723 if (!tcm->tcm_handle)
724 return -EINVAL;
725 q = qdisc_lookup(dev, tcm->tcm_handle);
726 }
727
728 /* Change qdisc parameters */
729 if (q == NULL)
730 return -ENOENT;
731 if (n->nlmsg_flags&NLM_F_EXCL)
732 return -EEXIST;
733 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
734 return -EINVAL;
735 err = qdisc_change(q, tca);
736 if (err == 0)
737 qdisc_notify(skb, n, clid, NULL, q);
738 return err;
739
740create_n_graft:
741 if (!(n->nlmsg_flags&NLM_F_CREATE))
742 return -ENOENT;
743 if (clid == TC_H_INGRESS)
744 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900745 else
Linus Torvalds1da177e2005-04-16 15:20:36 -0700746 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
747 if (q == NULL) {
748 if (err == -EAGAIN)
749 goto replay;
750 return err;
751 }
752
753graft:
754 if (1) {
755 struct Qdisc *old_q = NULL;
756 err = qdisc_graft(dev, p, clid, q, &old_q);
757 if (err) {
758 if (q) {
759 spin_lock_bh(&dev->queue_lock);
760 qdisc_destroy(q);
761 spin_unlock_bh(&dev->queue_lock);
762 }
763 return err;
764 }
765 qdisc_notify(skb, n, clid, old_q, q);
766 if (old_q) {
767 spin_lock_bh(&dev->queue_lock);
768 qdisc_destroy(old_q);
769 spin_unlock_bh(&dev->queue_lock);
770 }
771 }
772 return 0;
773}
774
775static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700776 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700777{
778 struct tcmsg *tcm;
779 struct nlmsghdr *nlh;
780 unsigned char *b = skb->tail;
781 struct gnet_dump d;
782
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700783 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700784 tcm = NLMSG_DATA(nlh);
785 tcm->tcm_family = AF_UNSPEC;
Patrick McHardy9ef1d4c2005-06-28 12:55:30 -0700786 tcm->tcm__pad1 = 0;
787 tcm->tcm__pad2 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700788 tcm->tcm_ifindex = q->dev->ifindex;
789 tcm->tcm_parent = clid;
790 tcm->tcm_handle = q->handle;
791 tcm->tcm_info = atomic_read(&q->refcnt);
792 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
793 if (q->ops->dump && q->ops->dump(q, skb) < 0)
794 goto rtattr_failure;
795 q->qstats.qlen = q->q.qlen;
796
797 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
798 TCA_XSTATS, q->stats_lock, &d) < 0)
799 goto rtattr_failure;
800
801 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
802 goto rtattr_failure;
803
804 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
805#ifdef CONFIG_NET_ESTIMATOR
806 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
807#endif
808 gnet_stats_copy_queue(&d, &q->qstats) < 0)
809 goto rtattr_failure;
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900810
Linus Torvalds1da177e2005-04-16 15:20:36 -0700811 if (gnet_stats_finish_copy(&d) < 0)
812 goto rtattr_failure;
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900813
Linus Torvalds1da177e2005-04-16 15:20:36 -0700814 nlh->nlmsg_len = skb->tail - b;
815 return skb->len;
816
817nlmsg_failure:
818rtattr_failure:
819 skb_trim(skb, b - skb->data);
820 return -1;
821}
822
823static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
824 u32 clid, struct Qdisc *old, struct Qdisc *new)
825{
826 struct sk_buff *skb;
827 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
828
829 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
830 if (!skb)
831 return -ENOBUFS;
832
833 if (old && old->handle) {
834 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
835 goto err_out;
836 }
837 if (new) {
838 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
839 goto err_out;
840 }
841
842 if (skb->len)
Patrick McHardyac6d4392005-08-14 19:29:52 -0700843 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700844
845err_out:
846 kfree_skb(skb);
847 return -EINVAL;
848}
849
850static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
851{
852 int idx, q_idx;
853 int s_idx, s_q_idx;
854 struct net_device *dev;
855 struct Qdisc *q;
856
857 s_idx = cb->args[0];
858 s_q_idx = q_idx = cb->args[1];
859 read_lock(&dev_base_lock);
860 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
861 if (idx < s_idx)
862 continue;
863 if (idx > s_idx)
864 s_q_idx = 0;
Patrick McHardy85670cc2006-09-27 16:45:45 -0700865 read_lock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700866 q_idx = 0;
867 list_for_each_entry(q, &dev->qdisc_list, list) {
868 if (q_idx < s_q_idx) {
869 q_idx++;
870 continue;
871 }
872 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
873 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
Patrick McHardy85670cc2006-09-27 16:45:45 -0700874 read_unlock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700875 goto done;
876 }
877 q_idx++;
878 }
Patrick McHardy85670cc2006-09-27 16:45:45 -0700879 read_unlock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700880 }
881
882done:
883 read_unlock(&dev_base_lock);
884
885 cb->args[0] = idx;
886 cb->args[1] = q_idx;
887
888 return skb->len;
889}
890
891
892
893/************************************************
894 * Traffic classes manipulation. *
895 ************************************************/
896
897
898
899static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
900{
901 struct tcmsg *tcm = NLMSG_DATA(n);
902 struct rtattr **tca = arg;
903 struct net_device *dev;
904 struct Qdisc *q = NULL;
905 struct Qdisc_class_ops *cops;
906 unsigned long cl = 0;
907 unsigned long new_cl;
908 u32 pid = tcm->tcm_parent;
909 u32 clid = tcm->tcm_handle;
910 u32 qid = TC_H_MAJ(clid);
911 int err;
912
913 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
914 return -ENODEV;
915
916 /*
917 parent == TC_H_UNSPEC - unspecified parent.
918 parent == TC_H_ROOT - class is root, which has no parent.
919 parent == X:0 - parent is root class.
920 parent == X:Y - parent is a node in hierarchy.
921 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
922
923 handle == 0:0 - generate handle from kernel pool.
924 handle == 0:Y - class is X:Y, where X:0 is qdisc.
925 handle == X:Y - clear.
926 handle == X:0 - root class.
927 */
928
929 /* Step 1. Determine qdisc handle X:0 */
930
931 if (pid != TC_H_ROOT) {
932 u32 qid1 = TC_H_MAJ(pid);
933
934 if (qid && qid1) {
935 /* If both majors are known, they must be identical. */
936 if (qid != qid1)
937 return -EINVAL;
938 } else if (qid1) {
939 qid = qid1;
940 } else if (qid == 0)
941 qid = dev->qdisc_sleeping->handle;
942
943 /* Now qid is genuine qdisc handle consistent
944 both with parent and child.
945
946 TC_H_MAJ(pid) still may be unspecified, complete it now.
947 */
948 if (pid)
949 pid = TC_H_MAKE(qid, pid);
950 } else {
951 if (qid == 0)
952 qid = dev->qdisc_sleeping->handle;
953 }
954
955 /* OK. Locate qdisc */
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900956 if ((q = qdisc_lookup(dev, qid)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700957 return -ENOENT;
958
959 /* An check that it supports classes */
960 cops = q->ops->cl_ops;
961 if (cops == NULL)
962 return -EINVAL;
963
964 /* Now try to get class */
965 if (clid == 0) {
966 if (pid == TC_H_ROOT)
967 clid = qid;
968 } else
969 clid = TC_H_MAKE(qid, clid);
970
971 if (clid)
972 cl = cops->get(q, clid);
973
974 if (cl == 0) {
975 err = -ENOENT;
976 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
977 goto out;
978 } else {
979 switch (n->nlmsg_type) {
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900980 case RTM_NEWTCLASS:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700981 err = -EEXIST;
982 if (n->nlmsg_flags&NLM_F_EXCL)
983 goto out;
984 break;
985 case RTM_DELTCLASS:
986 err = cops->delete(q, cl);
987 if (err == 0)
988 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
989 goto out;
990 case RTM_GETTCLASS:
991 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
992 goto out;
993 default:
994 err = -EINVAL;
995 goto out;
996 }
997 }
998
999 new_cl = cl;
1000 err = cops->change(q, clid, pid, tca, &new_cl);
1001 if (err == 0)
1002 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1003
1004out:
1005 if (cl)
1006 cops->put(q, cl);
1007
1008 return err;
1009}
1010
1011
1012static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1013 unsigned long cl,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -07001014 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001015{
1016 struct tcmsg *tcm;
1017 struct nlmsghdr *nlh;
1018 unsigned char *b = skb->tail;
1019 struct gnet_dump d;
1020 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1021
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -07001022 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001023 tcm = NLMSG_DATA(nlh);
1024 tcm->tcm_family = AF_UNSPEC;
1025 tcm->tcm_ifindex = q->dev->ifindex;
1026 tcm->tcm_parent = q->handle;
1027 tcm->tcm_handle = q->handle;
1028 tcm->tcm_info = 0;
1029 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1030 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1031 goto rtattr_failure;
1032
1033 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1034 TCA_XSTATS, q->stats_lock, &d) < 0)
1035 goto rtattr_failure;
1036
1037 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1038 goto rtattr_failure;
1039
1040 if (gnet_stats_finish_copy(&d) < 0)
1041 goto rtattr_failure;
1042
1043 nlh->nlmsg_len = skb->tail - b;
1044 return skb->len;
1045
1046nlmsg_failure:
1047rtattr_failure:
1048 skb_trim(skb, b - skb->data);
1049 return -1;
1050}
1051
1052static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1053 struct Qdisc *q, unsigned long cl, int event)
1054{
1055 struct sk_buff *skb;
1056 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1057
1058 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1059 if (!skb)
1060 return -ENOBUFS;
1061
1062 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1063 kfree_skb(skb);
1064 return -EINVAL;
1065 }
1066
Patrick McHardyac6d4392005-08-14 19:29:52 -07001067 return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001068}
1069
1070struct qdisc_dump_args
1071{
1072 struct qdisc_walker w;
1073 struct sk_buff *skb;
1074 struct netlink_callback *cb;
1075};
1076
1077static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1078{
1079 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1080
1081 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1082 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1083}
1084
1085static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1086{
1087 int t;
1088 int s_t;
1089 struct net_device *dev;
1090 struct Qdisc *q;
1091 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1092 struct qdisc_dump_args arg;
1093
1094 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1095 return 0;
1096 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1097 return 0;
1098
1099 s_t = cb->args[0];
1100 t = 0;
1101
Patrick McHardy85670cc2006-09-27 16:45:45 -07001102 read_lock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001103 list_for_each_entry(q, &dev->qdisc_list, list) {
1104 if (t < s_t || !q->ops->cl_ops ||
1105 (tcm->tcm_parent &&
1106 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1107 t++;
1108 continue;
1109 }
1110 if (t > s_t)
1111 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1112 arg.w.fn = qdisc_class_dump;
1113 arg.skb = skb;
1114 arg.cb = cb;
1115 arg.w.stop = 0;
1116 arg.w.skip = cb->args[1];
1117 arg.w.count = 0;
1118 q->ops->cl_ops->walk(q, &arg.w);
1119 cb->args[1] = arg.w.count;
1120 if (arg.w.stop)
1121 break;
1122 t++;
1123 }
Patrick McHardy85670cc2006-09-27 16:45:45 -07001124 read_unlock(&qdisc_tree_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001125
1126 cb->args[0] = t;
1127
1128 dev_put(dev);
1129 return skb->len;
1130}
1131
1132/* Main classifier routine: scans classifier chain attached
1133 to this qdisc, (optionally) tests for protocol and asks
1134 specific classifiers.
1135 */
1136int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1137 struct tcf_result *res)
1138{
1139 int err = 0;
Al Viro66c6f522006-11-20 18:07:51 -08001140 __be16 protocol = skb->protocol;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001141#ifdef CONFIG_NET_CLS_ACT
1142 struct tcf_proto *otp = tp;
1143reclassify:
1144#endif
1145 protocol = skb->protocol;
1146
1147 for ( ; tp; tp = tp->next) {
1148 if ((tp->protocol == protocol ||
YOSHIFUJI Hideakib6d9bcb2007-03-07 14:21:20 +09001149 tp->protocol == htons(ETH_P_ALL)) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001150 (err = tp->classify(skb, tp, res)) >= 0) {
1151#ifdef CONFIG_NET_CLS_ACT
1152 if ( TC_ACT_RECLASSIFY == err) {
1153 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1154 tp = otp;
1155
1156 if (MAX_REC_LOOP < verd++) {
1157 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1158 tp->prio&0xffff, ntohs(tp->protocol));
1159 return TC_ACT_SHOT;
1160 }
1161 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1162 goto reclassify;
1163 } else {
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +09001164 if (skb->tc_verd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001165 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1166 return err;
1167 }
1168#else
1169
1170 return err;
1171#endif
1172 }
1173
1174 }
1175 return -1;
1176}
1177
Linus Torvalds1da177e2005-04-16 15:20:36 -07001178#ifdef CONFIG_PROC_FS
1179static int psched_show(struct seq_file *seq, void *v)
1180{
1181 seq_printf(seq, "%08x %08x %08x %08x\n",
Patrick McHardy641b9e02007-03-16 01:18:42 -07001182 (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
1183 1000000, HZ);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001184
1185 return 0;
1186}
1187
1188static int psched_open(struct inode *inode, struct file *file)
1189{
1190 return single_open(file, psched_show, PDE(inode)->data);
1191}
1192
Arjan van de Venda7071d2007-02-12 00:55:36 -08001193static const struct file_operations psched_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001194 .owner = THIS_MODULE,
1195 .open = psched_open,
1196 .read = seq_read,
1197 .llseek = seq_lseek,
1198 .release = single_release,
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +09001199};
Linus Torvalds1da177e2005-04-16 15:20:36 -07001200#endif
1201
Linus Torvalds1da177e2005-04-16 15:20:36 -07001202static int __init pktsched_init(void)
1203{
1204 struct rtnetlink_link *link_p;
1205
Linus Torvalds1da177e2005-04-16 15:20:36 -07001206 link_p = rtnetlink_links[PF_UNSPEC];
1207
1208 /* Setup rtnetlink links. It is made here to avoid
1209 exporting large number of public symbols.
1210 */
1211
1212 if (link_p) {
1213 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1214 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1215 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1216 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1217 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1218 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1219 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1220 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1221 }
1222
1223 register_qdisc(&pfifo_qdisc_ops);
1224 register_qdisc(&bfifo_qdisc_ops);
1225 proc_net_fops_create("psched", 0, &psched_fops);
1226
1227 return 0;
1228}
1229
1230subsys_initcall(pktsched_init);
1231
Linus Torvalds1da177e2005-04-16 15:20:36 -07001232EXPORT_SYMBOL(qdisc_get_rtab);
1233EXPORT_SYMBOL(qdisc_put_rtab);
1234EXPORT_SYMBOL(register_qdisc);
1235EXPORT_SYMBOL(unregister_qdisc);
1236EXPORT_SYMBOL(tc_classify);