blob: 74924893ef7f7f8f28ef1ff24f6a6ca2dbe1bfe4 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
Linus Torvalds1da177e2005-04-16 15:20:36 -070018#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070021#include <linux/string.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/errno.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070023#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070024#include <linux/init.h>
25#include <linux/proc_fs.h>
26#include <linux/seq_file.h>
27#include <linux/kmod.h>
28#include <linux/list.h>
Patrick McHardy41794772007-03-16 01:19:15 -070029#include <linux/hrtimer.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070030
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020031#include <net/net_namespace.h>
Denis V. Lunevb8542722007-12-01 00:21:31 +110032#include <net/sock.h>
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -070033#include <net/netlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include <net/pkt_sched.h>
35
Linus Torvalds1da177e2005-04-16 15:20:36 -070036static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
37 struct Qdisc *old, struct Qdisc *new);
38static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
39 struct Qdisc *q, unsigned long cl, int event);
40
41/*
42
43 Short review.
44 -------------
45
46 This file consists of two interrelated parts:
47
48 1. queueing disciplines manager frontend.
49 2. traffic classes manager frontend.
50
51 Generally, queueing discipline ("qdisc") is a black box,
52 which is able to enqueue packets and to dequeue them (when
53 device is ready to send something) in order and at times
54 determined by algorithm hidden in it.
55
 56   qdiscs are divided into two categories:
57 - "queues", which have no internal structure visible from outside.
58 - "schedulers", which split all the packets to "traffic classes",
59 using "packet classifiers" (look at cls_api.c)
60
61 In turn, classes may have child qdiscs (as rule, queues)
62 attached to them etc. etc. etc.
63
64 The goal of the routines in this file is to translate
65 information supplied by user in the form of handles
66 to more intelligible for kernel form, to make some sanity
67 checks and part of work, which is common to all qdiscs
68 and to provide rtnetlink notifications.
69
70 All real intelligent work is done inside qdisc modules.
71
72
73
74 Every discipline has two major routines: enqueue and dequeue.
75
76 ---dequeue
77
78 dequeue usually returns a skb to send. It is allowed to return NULL,
79 but it does not mean that queue is empty, it just means that
80 discipline does not want to send anything this time.
81 Queue is really empty if q->q.qlen == 0.
82 For complicated disciplines with multiple queues q->q is not
83 real packet queue, but however q->q.qlen must be valid.
84
85 ---enqueue
86
87 enqueue returns 0, if packet was enqueued successfully.
88 If packet (this one or another one) was dropped, it returns
89 not zero error code.
90 NET_XMIT_DROP - this packet dropped
91 Expected action: do not backoff, but wait until queue will clear.
92 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
93 Expected action: backoff or ignore
94 NET_XMIT_POLICED - dropped by police.
95 Expected action: backoff or error to real-time apps.
96
97 Auxiliary routines:
98
99 ---requeue
100
101 requeues once dequeued packet. It is used for non-standard or
David S. Millere65d22e2008-07-08 16:46:01 -0700102 just buggy devices, which can defer output even if netif_queue_stopped()=0.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700103
104 ---reset
105
106 returns qdisc to initial state: purge all buffers, clear all
107 timers, counters (except for statistics) etc.
108
109 ---init
110
111 initializes newly created qdisc.
112
113 ---destroy
114
115 destroys resources allocated by init and during lifetime of qdisc.
116
117 ---change
118
119 changes qdisc parameters.
120 */
121
122/* Protects list of registered TC modules. It is pure SMP lock. */
123static DEFINE_RWLOCK(qdisc_mod_lock);
124
125
126/************************************************
127 * Queueing disciplines manipulation. *
128 ************************************************/
129
130
131/* The list of all installed queueing disciplines. */
132
133static struct Qdisc_ops *qdisc_base;
134
 135/* Register/unregister queueing discipline */
136
137int register_qdisc(struct Qdisc_ops *qops)
138{
139 struct Qdisc_ops *q, **qp;
140 int rc = -EEXIST;
141
142 write_lock(&qdisc_mod_lock);
143 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
144 if (!strcmp(qops->id, q->id))
145 goto out;
146
147 if (qops->enqueue == NULL)
148 qops->enqueue = noop_qdisc_ops.enqueue;
149 if (qops->requeue == NULL)
150 qops->requeue = noop_qdisc_ops.requeue;
151 if (qops->dequeue == NULL)
152 qops->dequeue = noop_qdisc_ops.dequeue;
153
154 qops->next = NULL;
155 *qp = qops;
156 rc = 0;
157out:
158 write_unlock(&qdisc_mod_lock);
159 return rc;
160}
Patrick McHardy62e3ba12008-01-22 22:10:23 -0800161EXPORT_SYMBOL(register_qdisc);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700162
163int unregister_qdisc(struct Qdisc_ops *qops)
164{
165 struct Qdisc_ops *q, **qp;
166 int err = -ENOENT;
167
168 write_lock(&qdisc_mod_lock);
169 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
170 if (q == qops)
171 break;
172 if (q) {
173 *qp = q->next;
174 q->next = NULL;
175 err = 0;
176 }
177 write_unlock(&qdisc_mod_lock);
178 return err;
179}
Patrick McHardy62e3ba12008-01-22 22:10:23 -0800180EXPORT_SYMBOL(unregister_qdisc);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700181
182/* We know handle. Find qdisc among all qdisc's attached to device
183 (root qdisc, all its children, children of children etc.)
184 */
185
David S. Milleread81cc2008-07-17 00:50:32 -0700186struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
Patrick McHardy43effa12006-11-29 17:35:48 -0800187{
188 struct Qdisc *q;
189
David S. Milleread81cc2008-07-17 00:50:32 -0700190 list_for_each_entry(q, &dev->qdisc_list, list) {
Patrick McHardy43effa12006-11-29 17:35:48 -0800191 if (q->handle == handle)
192 return q;
193 }
194 return NULL;
195}
196
Linus Torvalds1da177e2005-04-16 15:20:36 -0700197static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
198{
199 unsigned long cl;
200 struct Qdisc *leaf;
Eric Dumazet20fea082007-11-14 01:44:41 -0800201 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700202
203 if (cops == NULL)
204 return NULL;
205 cl = cops->get(p, classid);
206
207 if (cl == 0)
208 return NULL;
209 leaf = cops->leaf(p, cl);
210 cops->put(p, cl);
211 return leaf;
212}
213
214/* Find queueing discipline by name */
215
Patrick McHardy1e904742008-01-22 22:11:17 -0800216static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700217{
218 struct Qdisc_ops *q = NULL;
219
220 if (kind) {
221 read_lock(&qdisc_mod_lock);
222 for (q = qdisc_base; q; q = q->next) {
Patrick McHardy1e904742008-01-22 22:11:17 -0800223 if (nla_strcmp(kind, q->id) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700224 if (!try_module_get(q->owner))
225 q = NULL;
226 break;
227 }
228 }
229 read_unlock(&qdisc_mod_lock);
230 }
231 return q;
232}
233
234static struct qdisc_rate_table *qdisc_rtab_list;
235
Patrick McHardy1e904742008-01-22 22:11:17 -0800236struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700237{
238 struct qdisc_rate_table *rtab;
239
240 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
241 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
242 rtab->refcnt++;
243 return rtab;
244 }
245 }
246
Patrick McHardy5feb5e12008-01-23 20:35:19 -0800247 if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
248 nla_len(tab) != TC_RTAB_SIZE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700249 return NULL;
250
251 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
252 if (rtab) {
253 rtab->rate = *r;
254 rtab->refcnt = 1;
Patrick McHardy1e904742008-01-22 22:11:17 -0800255 memcpy(rtab->data, nla_data(tab), 1024);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700256 rtab->next = qdisc_rtab_list;
257 qdisc_rtab_list = rtab;
258 }
259 return rtab;
260}
Patrick McHardy62e3ba12008-01-22 22:10:23 -0800261EXPORT_SYMBOL(qdisc_get_rtab);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700262
263void qdisc_put_rtab(struct qdisc_rate_table *tab)
264{
265 struct qdisc_rate_table *rtab, **rtabp;
266
267 if (!tab || --tab->refcnt)
268 return;
269
270 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
271 if (rtab == tab) {
272 *rtabp = rtab->next;
273 kfree(rtab);
274 return;
275 }
276 }
277}
Patrick McHardy62e3ba12008-01-22 22:10:23 -0800278EXPORT_SYMBOL(qdisc_put_rtab);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700279
Patrick McHardy41794772007-03-16 01:19:15 -0700280static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
281{
282 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
283 timer);
284
285 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
Stephen Hemminger11274e52007-03-22 12:17:42 -0700286 smp_wmb();
David S. Miller37437bb2008-07-16 02:15:04 -0700287 __netif_schedule(wd->qdisc);
Stephen Hemminger19365022007-03-22 12:18:35 -0700288
Patrick McHardy41794772007-03-16 01:19:15 -0700289 return HRTIMER_NORESTART;
290}
291
292void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
293{
294 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
295 wd->timer.function = qdisc_watchdog;
296 wd->qdisc = qdisc;
297}
298EXPORT_SYMBOL(qdisc_watchdog_init);
299
300void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
301{
302 ktime_t time;
303
304 wd->qdisc->flags |= TCQ_F_THROTTLED;
305 time = ktime_set(0, 0);
306 time = ktime_add_ns(time, PSCHED_US2NS(expires));
307 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
308}
309EXPORT_SYMBOL(qdisc_watchdog_schedule);
310
311void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
312{
313 hrtimer_cancel(&wd->timer);
314 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
315}
316EXPORT_SYMBOL(qdisc_watchdog_cancel);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700317
Patrick McHardy6fe1c7a2008-07-05 23:21:31 -0700318struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
319{
320 unsigned int size = n * sizeof(struct hlist_head), i;
321 struct hlist_head *h;
322
323 if (size <= PAGE_SIZE)
324 h = kmalloc(size, GFP_KERNEL);
325 else
326 h = (struct hlist_head *)
327 __get_free_pages(GFP_KERNEL, get_order(size));
328
329 if (h != NULL) {
330 for (i = 0; i < n; i++)
331 INIT_HLIST_HEAD(&h[i]);
332 }
333 return h;
334}
335
336static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
337{
338 unsigned int size = n * sizeof(struct hlist_head);
339
340 if (size <= PAGE_SIZE)
341 kfree(h);
342 else
343 free_pages((unsigned long)h, get_order(size));
344}
345
346void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
347{
348 struct Qdisc_class_common *cl;
349 struct hlist_node *n, *next;
350 struct hlist_head *nhash, *ohash;
351 unsigned int nsize, nmask, osize;
352 unsigned int i, h;
353
354 /* Rehash when load factor exceeds 0.75 */
355 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
356 return;
357 nsize = clhash->hashsize * 2;
358 nmask = nsize - 1;
359 nhash = qdisc_class_hash_alloc(nsize);
360 if (nhash == NULL)
361 return;
362
363 ohash = clhash->hash;
364 osize = clhash->hashsize;
365
366 sch_tree_lock(sch);
367 for (i = 0; i < osize; i++) {
368 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
369 h = qdisc_class_hash(cl->classid, nmask);
370 hlist_add_head(&cl->hnode, &nhash[h]);
371 }
372 }
373 clhash->hash = nhash;
374 clhash->hashsize = nsize;
375 clhash->hashmask = nmask;
376 sch_tree_unlock(sch);
377
378 qdisc_class_hash_free(ohash, osize);
379}
380EXPORT_SYMBOL(qdisc_class_hash_grow);
381
382int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
383{
384 unsigned int size = 4;
385
386 clhash->hash = qdisc_class_hash_alloc(size);
387 if (clhash->hash == NULL)
388 return -ENOMEM;
389 clhash->hashsize = size;
390 clhash->hashmask = size - 1;
391 clhash->hashelems = 0;
392 return 0;
393}
394EXPORT_SYMBOL(qdisc_class_hash_init);
395
396void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
397{
398 qdisc_class_hash_free(clhash->hash, clhash->hashsize);
399}
400EXPORT_SYMBOL(qdisc_class_hash_destroy);
401
402void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
403 struct Qdisc_class_common *cl)
404{
405 unsigned int h;
406
407 INIT_HLIST_NODE(&cl->hnode);
408 h = qdisc_class_hash(cl->classid, clhash->hashmask);
409 hlist_add_head(&cl->hnode, &clhash->hash[h]);
410 clhash->hashelems++;
411}
412EXPORT_SYMBOL(qdisc_class_hash_insert);
413
414void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
415 struct Qdisc_class_common *cl)
416{
417 hlist_del(&cl->hnode);
418 clhash->hashelems--;
419}
420EXPORT_SYMBOL(qdisc_class_hash_remove);
421
Linus Torvalds1da177e2005-04-16 15:20:36 -0700422/* Allocate a unique handle from space managed by kernel */
423
424static u32 qdisc_alloc_handle(struct net_device *dev)
425{
426 int i = 0x10000;
427 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
428
429 do {
430 autohandle += TC_H_MAKE(0x10000U, 0);
431 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
432 autohandle = TC_H_MAKE(0x80000000U, 0);
433 } while (qdisc_lookup(dev, autohandle) && --i > 0);
434
435 return i>0 ? autohandle : 0;
436}
437
438/* Attach toplevel qdisc to device dev */
439
/* Attach @qdisc as the top-level discipline of @dev (or as the ingress
 * qdisc when @qdisc carries TCQ_F_INGRESS) and return the qdisc it
 * replaced.
 *
 * The device is deactivated around the switch and the root qdisc lock
 * is held while the queue pointers are rewritten.  An old qdisc whose
 * refcount shows no other users is reset here; the caller owns the
 * returned pointer and is responsible for destroying it.
 */
static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct netdev_queue *dev_queue;
	spinlock_t *root_lock;
	struct Qdisc *oqdisc;
	int ingress;

	/* Quiesce the device so no packets flow during the switch. */
	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	ingress = 0;
	if (qdisc && qdisc->flags&TCQ_F_INGRESS)
		ingress = 1;

	/* Ingress qdiscs hang off rx_queue; egress off TX queue 0. */
	if (ingress) {
		dev_queue = &dev->rx_queue;
		oqdisc = dev_queue->qdisc;
	} else {
		dev_queue = netdev_get_tx_queue(dev, 0);
		oqdisc = dev_queue->qdisc_sleeping;
	}

	root_lock = qdisc_root_lock(oqdisc);
	spin_lock_bh(root_lock);

	if (ingress) {
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete */
			qdisc_reset(oqdisc);
			dev_queue->qdisc = NULL;
		} else { /* new */
			dev_queue->qdisc = qdisc;
		}

	} else {
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one; NULL means "no qdisc", which is
		 * represented by the shared noop_qdisc. */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		dev_queue->qdisc_sleeping = qdisc;
		dev_queue->qdisc = &noop_qdisc;
	}

	spin_unlock_bh(root_lock);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}
495
/* Propagate a queue length decrease of @n packets up the qdisc tree.
 *
 * Called when packets vanish below a parent (e.g. dropped by a child);
 * walks from @sch toward the root, invoking each ancestor class's
 * qlen_notify() callback (so an emptied class can be deactivated) and
 * shrinking each ancestor's qlen.  Stops silently at the ingress
 * pseudo-parent, which has no hierarchy to fix up.
 */
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			/* Only the root may legitimately be unresolvable. */
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			/* Let the class react to its leaf losing packets. */
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
522EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700523
524/* Graft qdisc "new" to class "classid" of qdisc "parent" or
525 to device "dev".
526
527 Old qdisc is not destroyed but returned in *old.
528 */
529
530static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
531 u32 classid,
532 struct Qdisc *new, struct Qdisc **old)
533{
534 int err = 0;
535 struct Qdisc *q = *old;
536
537
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900538 if (parent == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700539 if (q && q->flags&TCQ_F_INGRESS) {
540 *old = dev_graft_qdisc(dev, q);
541 } else {
542 *old = dev_graft_qdisc(dev, new);
543 }
544 } else {
Eric Dumazet20fea082007-11-14 01:44:41 -0800545 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700546
547 err = -EINVAL;
548
549 if (cops) {
550 unsigned long cl = cops->get(parent, classid);
551 if (cl) {
552 err = cops->graft(parent, cl, new, old);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700553 cops->put(parent, cl);
554 }
555 }
556 }
557 return err;
558}
559
560/*
561 Allocate and initialize new qdisc.
562
563 Parameters are passed via opt.
564 */
565
/* Allocate and initialise a new qdisc of the kind named in
 * @tca[TCA_KIND] and bind it to @dev_queue.
 *
 * May drop and re-acquire the RTNL lock to autoload the scheduler
 * module; in that case -EAGAIN is reported via @errp and the caller
 * must replay the whole request.  On any failure NULL is returned and
 * *errp holds a negative errno.
 */
static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_KMOD
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			/* Caller did not pick a handle; allocate one. */
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_RATE]) {
			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						qdisc_root_lock(sch),
						tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
		/* Publish the new qdisc on the device list under its lock. */
		spin_lock_bh(&dev->qdisc_list_lock);
		list_add_tail(&sch->list, &dev->qdisc_list);
		spin_unlock_bh(&dev->qdisc_list_lock);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
661
Patrick McHardy1e904742008-01-22 22:11:17 -0800662static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700663{
Patrick McHardy1e904742008-01-22 22:11:17 -0800664 if (tca[TCA_OPTIONS]) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700665 int err;
666
667 if (sch->ops->change == NULL)
668 return -EINVAL;
Patrick McHardy1e904742008-01-22 22:11:17 -0800669 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700670 if (err)
671 return err;
672 }
Patrick McHardy1e904742008-01-22 22:11:17 -0800673 if (tca[TCA_RATE])
Linus Torvalds1da177e2005-04-16 15:20:36 -0700674 gen_replace_estimator(&sch->bstats, &sch->rate_est,
David S. Miller7698b4f2008-07-16 01:42:40 -0700675 qdisc_root_lock(sch), tca[TCA_RATE]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700676 return 0;
677}
678
679struct check_loop_arg
680{
681 struct qdisc_walker w;
682 struct Qdisc *p;
683 int depth;
684};
685
686static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
687
688static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
689{
690 struct check_loop_arg arg;
691
692 if (q->ops->cl_ops == NULL)
693 return 0;
694
695 arg.w.stop = arg.w.skip = arg.w.count = 0;
696 arg.w.fn = check_loop_fn;
697 arg.depth = depth;
698 arg.p = p;
699 q->ops->cl_ops->walk(q, &arg.w);
700 return arg.w.stop ? -ELOOP : 0;
701}
702
703static int
704check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
705{
706 struct Qdisc *leaf;
Eric Dumazet20fea082007-11-14 01:44:41 -0800707 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700708 struct check_loop_arg *arg = (struct check_loop_arg *)w;
709
710 leaf = cops->leaf(q, cl);
711 if (leaf) {
712 if (leaf == arg->p || arg->depth > 7)
713 return -ELOOP;
714 return check_loop(leaf, arg->p, arg->depth + 1);
715 }
716 return 0;
717}
718
719/*
720 * Delete/get qdisc.
721 */
722
723static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
724{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +0900725 struct net *net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700726 struct tcmsg *tcm = NLMSG_DATA(n);
Patrick McHardy1e904742008-01-22 22:11:17 -0800727 struct nlattr *tca[TCA_MAX + 1];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700728 struct net_device *dev;
729 u32 clid = tcm->tcm_parent;
730 struct Qdisc *q = NULL;
731 struct Qdisc *p = NULL;
732 int err;
733
Denis V. Lunevb8542722007-12-01 00:21:31 +1100734 if (net != &init_net)
735 return -EINVAL;
736
Eric W. Biederman881d9662007-09-17 11:56:21 -0700737 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700738 return -ENODEV;
739
Patrick McHardy1e904742008-01-22 22:11:17 -0800740 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
741 if (err < 0)
742 return err;
743
Linus Torvalds1da177e2005-04-16 15:20:36 -0700744 if (clid) {
745 if (clid != TC_H_ROOT) {
746 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
747 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
748 return -ENOENT;
749 q = qdisc_leaf(p, clid);
750 } else { /* ingress */
David S. Miller816f3252008-07-08 22:49:00 -0700751 q = dev->rx_queue.qdisc;
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900752 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700753 } else {
David S. Millere8a04642008-07-17 00:34:19 -0700754 struct netdev_queue *dev_queue;
755 dev_queue = netdev_get_tx_queue(dev, 0);
David S. Millerb0e1e642008-07-08 17:42:10 -0700756 q = dev_queue->qdisc_sleeping;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700757 }
758 if (!q)
759 return -ENOENT;
760
761 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
762 return -EINVAL;
763 } else {
764 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
765 return -ENOENT;
766 }
767
Patrick McHardy1e904742008-01-22 22:11:17 -0800768 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700769 return -EINVAL;
770
771 if (n->nlmsg_type == RTM_DELQDISC) {
772 if (!clid)
773 return -EINVAL;
774 if (q->handle == 0)
775 return -ENOENT;
776 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
777 return err;
778 if (q) {
David S. Miller53049972008-07-16 03:00:19 -0700779 spinlock_t *root_lock = qdisc_root_lock(q);
780
Linus Torvalds1da177e2005-04-16 15:20:36 -0700781 qdisc_notify(skb, n, clid, q, NULL);
David S. Miller53049972008-07-16 03:00:19 -0700782 spin_unlock_bh(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700783 qdisc_destroy(q);
David S. Miller53049972008-07-16 03:00:19 -0700784 spin_unlock_bh(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700785 }
786 } else {
787 qdisc_notify(skb, n, clid, NULL, q);
788 }
789 return 0;
790}
791
792/*
793 Create/change qdisc.
794 */
795
/* Handle RTM_NEWQDISC netlink requests: create a new qdisc, replace an
 * existing one, or change its parameters, depending on tcm_parent,
 * tcm_handle and the NLM_F_CREATE/REPLACE/EXCL flags.
 *
 * When qdisc_create() had to drop RTNL to autoload a module it returns
 * -EAGAIN, and the whole request is replayed from scratch because the
 * device may have changed in the meantime.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	/* Only the initial namespace is supported here. */
	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		/* Locate the existing qdisc at the requested position. */
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /*ingress */
				q = dev->rx_queue.qdisc;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				/* Grafting q under p must not create a cycle. */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know, that some child q is already
				 * attached to this parent and have choice:
				 * either to change it or to create/graft new one.
				 *
				 * 1. We are allowed to create/graft only
				 * if CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, requestor wanted to say,
				 * that qdisc tcm_handle is not expected
				 * to exist, so that we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * Alas, it is sort of hole in API, we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft, if
				 * user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	if (q == NULL) {
		/* -EAGAIN: RTNL was dropped for module load; replay. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		spinlock_t *root_lock;

		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			/* Graft failed: tear down the qdisc we created. */
			if (q) {
				root_lock = qdisc_root_lock(q);
				spin_lock_bh(root_lock);
				qdisc_destroy(q);
				spin_unlock_bh(root_lock);
			}
			return err;
		}
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			/* Destroy the displaced qdisc under its root lock. */
			root_lock = qdisc_root_lock(old_q);
			spin_lock_bh(root_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(root_lock);
		}
	}
	return 0;
}
949
950static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700951 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700952{
953 struct tcmsg *tcm;
954 struct nlmsghdr *nlh;
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -0700955 unsigned char *b = skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700956 struct gnet_dump d;
957
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700958 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700959 tcm = NLMSG_DATA(nlh);
960 tcm->tcm_family = AF_UNSPEC;
Patrick McHardy9ef1d4c2005-06-28 12:55:30 -0700961 tcm->tcm__pad1 = 0;
962 tcm->tcm__pad2 = 0;
David S. Miller5ce2d482008-07-08 17:06:30 -0700963 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700964 tcm->tcm_parent = clid;
965 tcm->tcm_handle = q->handle;
966 tcm->tcm_info = atomic_read(&q->refcnt);
Patrick McHardy57e1c482008-01-23 20:34:28 -0800967 NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700968 if (q->ops->dump && q->ops->dump(q, skb) < 0)
Patrick McHardy1e904742008-01-22 22:11:17 -0800969 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700970 q->qstats.qlen = q->q.qlen;
971
972 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
David S. Miller7698b4f2008-07-16 01:42:40 -0700973 TCA_XSTATS, qdisc_root_lock(q), &d) < 0)
Patrick McHardy1e904742008-01-22 22:11:17 -0800974 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700975
976 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
Patrick McHardy1e904742008-01-22 22:11:17 -0800977 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700978
979 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700980 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700981 gnet_stats_copy_queue(&d, &q->qstats) < 0)
Patrick McHardy1e904742008-01-22 22:11:17 -0800982 goto nla_put_failure;
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900983
Linus Torvalds1da177e2005-04-16 15:20:36 -0700984 if (gnet_stats_finish_copy(&d) < 0)
Patrick McHardy1e904742008-01-22 22:11:17 -0800985 goto nla_put_failure;
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900986
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -0700987 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700988 return skb->len;
989
990nlmsg_failure:
Patrick McHardy1e904742008-01-22 22:11:17 -0800991nla_put_failure:
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -0700992 nlmsg_trim(skb, b);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700993 return -1;
994}
995
996static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
997 u32 clid, struct Qdisc *old, struct Qdisc *new)
998{
999 struct sk_buff *skb;
1000 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1001
1002 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1003 if (!skb)
1004 return -ENOBUFS;
1005
1006 if (old && old->handle) {
1007 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1008 goto err_out;
1009 }
1010 if (new) {
1011 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1012 goto err_out;
1013 }
1014
1015 if (skb->len)
Denis V. Lunev97c53ca2007-11-19 22:26:51 -08001016 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001017
1018err_out:
1019 kfree_skb(skb);
1020 return -EINVAL;
1021}
1022
1023static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1024{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001025 struct net *net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001026 int idx, q_idx;
1027 int s_idx, s_q_idx;
1028 struct net_device *dev;
1029 struct Qdisc *q;
1030
Denis V. Lunevb8542722007-12-01 00:21:31 +11001031 if (net != &init_net)
1032 return 0;
1033
Linus Torvalds1da177e2005-04-16 15:20:36 -07001034 s_idx = cb->args[0];
1035 s_q_idx = q_idx = cb->args[1];
1036 read_lock(&dev_base_lock);
Pavel Emelianov7562f872007-05-03 15:13:45 -07001037 idx = 0;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001038 for_each_netdev(&init_net, dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001039 if (idx < s_idx)
Pavel Emelianov7562f872007-05-03 15:13:45 -07001040 goto cont;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001041 if (idx > s_idx)
1042 s_q_idx = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001043 q_idx = 0;
David S. Milleread81cc2008-07-17 00:50:32 -07001044 list_for_each_entry(q, &dev->qdisc_list, list) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001045 if (q_idx < s_q_idx) {
1046 q_idx++;
1047 continue;
1048 }
1049 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
Patrick McHardy0463d4a2007-04-16 17:02:10 -07001050 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001051 goto done;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001052 q_idx++;
1053 }
Pavel Emelianov7562f872007-05-03 15:13:45 -07001054cont:
1055 idx++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001056 }
1057
1058done:
1059 read_unlock(&dev_base_lock);
1060
1061 cb->args[0] = idx;
1062 cb->args[1] = q_idx;
1063
1064 return skb->len;
1065}
1066
1067
1068
1069/************************************************
1070 * Traffic classes manipulation. *
1071 ************************************************/
1072
1073
1074
1075static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1076{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001077 struct net *net = sock_net(skb->sk);
David S. Millerb0e1e642008-07-08 17:42:10 -07001078 struct netdev_queue *dev_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001079 struct tcmsg *tcm = NLMSG_DATA(n);
Patrick McHardy1e904742008-01-22 22:11:17 -08001080 struct nlattr *tca[TCA_MAX + 1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07001081 struct net_device *dev;
1082 struct Qdisc *q = NULL;
Eric Dumazet20fea082007-11-14 01:44:41 -08001083 const struct Qdisc_class_ops *cops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001084 unsigned long cl = 0;
1085 unsigned long new_cl;
1086 u32 pid = tcm->tcm_parent;
1087 u32 clid = tcm->tcm_handle;
1088 u32 qid = TC_H_MAJ(clid);
1089 int err;
1090
Denis V. Lunevb8542722007-12-01 00:21:31 +11001091 if (net != &init_net)
1092 return -EINVAL;
1093
Eric W. Biederman881d9662007-09-17 11:56:21 -07001094 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001095 return -ENODEV;
1096
Patrick McHardy1e904742008-01-22 22:11:17 -08001097 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1098 if (err < 0)
1099 return err;
1100
Linus Torvalds1da177e2005-04-16 15:20:36 -07001101 /*
1102 parent == TC_H_UNSPEC - unspecified parent.
1103 parent == TC_H_ROOT - class is root, which has no parent.
1104 parent == X:0 - parent is root class.
1105 parent == X:Y - parent is a node in hierarchy.
1106 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
1107
1108 handle == 0:0 - generate handle from kernel pool.
1109 handle == 0:Y - class is X:Y, where X:0 is qdisc.
1110 handle == X:Y - clear.
1111 handle == X:0 - root class.
1112 */
1113
1114 /* Step 1. Determine qdisc handle X:0 */
1115
David S. Millere8a04642008-07-17 00:34:19 -07001116 dev_queue = netdev_get_tx_queue(dev, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001117 if (pid != TC_H_ROOT) {
1118 u32 qid1 = TC_H_MAJ(pid);
1119
1120 if (qid && qid1) {
1121 /* If both majors are known, they must be identical. */
1122 if (qid != qid1)
1123 return -EINVAL;
1124 } else if (qid1) {
1125 qid = qid1;
1126 } else if (qid == 0)
David S. Millerb0e1e642008-07-08 17:42:10 -07001127 qid = dev_queue->qdisc_sleeping->handle;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001128
1129 /* Now qid is genuine qdisc handle consistent
1130 both with parent and child.
1131
1132 TC_H_MAJ(pid) still may be unspecified, complete it now.
1133 */
1134 if (pid)
1135 pid = TC_H_MAKE(qid, pid);
1136 } else {
1137 if (qid == 0)
David S. Millerb0e1e642008-07-08 17:42:10 -07001138 qid = dev_queue->qdisc_sleeping->handle;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001139 }
1140
1141 /* OK. Locate qdisc */
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +09001142 if ((q = qdisc_lookup(dev, qid)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001143 return -ENOENT;
1144
1145 /* An check that it supports classes */
1146 cops = q->ops->cl_ops;
1147 if (cops == NULL)
1148 return -EINVAL;
1149
1150 /* Now try to get class */
1151 if (clid == 0) {
1152 if (pid == TC_H_ROOT)
1153 clid = qid;
1154 } else
1155 clid = TC_H_MAKE(qid, clid);
1156
1157 if (clid)
1158 cl = cops->get(q, clid);
1159
1160 if (cl == 0) {
1161 err = -ENOENT;
1162 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1163 goto out;
1164 } else {
1165 switch (n->nlmsg_type) {
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +09001166 case RTM_NEWTCLASS:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001167 err = -EEXIST;
1168 if (n->nlmsg_flags&NLM_F_EXCL)
1169 goto out;
1170 break;
1171 case RTM_DELTCLASS:
1172 err = cops->delete(q, cl);
1173 if (err == 0)
1174 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
1175 goto out;
1176 case RTM_GETTCLASS:
1177 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
1178 goto out;
1179 default:
1180 err = -EINVAL;
1181 goto out;
1182 }
1183 }
1184
1185 new_cl = cl;
1186 err = cops->change(q, clid, pid, tca, &new_cl);
1187 if (err == 0)
1188 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
1189
1190out:
1191 if (cl)
1192 cops->put(q, cl);
1193
1194 return err;
1195}
1196
1197
1198static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1199 unsigned long cl,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -07001200 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001201{
1202 struct tcmsg *tcm;
1203 struct nlmsghdr *nlh;
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001204 unsigned char *b = skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001205 struct gnet_dump d;
Eric Dumazet20fea082007-11-14 01:44:41 -08001206 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001207
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -07001208 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001209 tcm = NLMSG_DATA(nlh);
1210 tcm->tcm_family = AF_UNSPEC;
David S. Miller5ce2d482008-07-08 17:06:30 -07001211 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001212 tcm->tcm_parent = q->handle;
1213 tcm->tcm_handle = q->handle;
1214 tcm->tcm_info = 0;
Patrick McHardy57e1c482008-01-23 20:34:28 -08001215 NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001216 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
Patrick McHardy1e904742008-01-22 22:11:17 -08001217 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001218
1219 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
David S. Miller7698b4f2008-07-16 01:42:40 -07001220 TCA_XSTATS, qdisc_root_lock(q), &d) < 0)
Patrick McHardy1e904742008-01-22 22:11:17 -08001221 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001222
1223 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
Patrick McHardy1e904742008-01-22 22:11:17 -08001224 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001225
1226 if (gnet_stats_finish_copy(&d) < 0)
Patrick McHardy1e904742008-01-22 22:11:17 -08001227 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001228
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001229 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001230 return skb->len;
1231
1232nlmsg_failure:
Patrick McHardy1e904742008-01-22 22:11:17 -08001233nla_put_failure:
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -07001234 nlmsg_trim(skb, b);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001235 return -1;
1236}
1237
1238static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1239 struct Qdisc *q, unsigned long cl, int event)
1240{
1241 struct sk_buff *skb;
1242 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1243
1244 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1245 if (!skb)
1246 return -ENOBUFS;
1247
1248 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1249 kfree_skb(skb);
1250 return -EINVAL;
1251 }
1252
Denis V. Lunev97c53ca2007-11-19 22:26:51 -08001253 return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001254}
1255
1256struct qdisc_dump_args
1257{
1258 struct qdisc_walker w;
1259 struct sk_buff *skb;
1260 struct netlink_callback *cb;
1261};
1262
1263static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1264{
1265 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1266
1267 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1268 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1269}
1270
1271static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1272{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001273 struct net *net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001274 int t;
1275 int s_t;
1276 struct net_device *dev;
1277 struct Qdisc *q;
1278 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1279 struct qdisc_dump_args arg;
1280
Denis V. Lunevb8542722007-12-01 00:21:31 +11001281 if (net != &init_net)
1282 return 0;
1283
Linus Torvalds1da177e2005-04-16 15:20:36 -07001284 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1285 return 0;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001286 if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001287 return 0;
1288
1289 s_t = cb->args[0];
1290 t = 0;
1291
David S. Milleread81cc2008-07-17 00:50:32 -07001292 list_for_each_entry(q, &dev->qdisc_list, list) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001293 if (t < s_t || !q->ops->cl_ops ||
1294 (tcm->tcm_parent &&
1295 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1296 t++;
1297 continue;
1298 }
1299 if (t > s_t)
1300 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1301 arg.w.fn = qdisc_class_dump;
1302 arg.skb = skb;
1303 arg.cb = cb;
1304 arg.w.stop = 0;
1305 arg.w.skip = cb->args[1];
1306 arg.w.count = 0;
1307 q->ops->cl_ops->walk(q, &arg.w);
1308 cb->args[1] = arg.w.count;
1309 if (arg.w.stop)
1310 break;
1311 t++;
1312 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001313
1314 cb->args[0] = t;
1315
1316 dev_put(dev);
1317 return skb->len;
1318}
1319
1320/* Main classifier routine: scans classifier chain attached
1321 to this qdisc, (optionally) tests for protocol and asks
1322 specific classifiers.
1323 */
Patrick McHardy73ca4912007-07-15 00:02:31 -07001324int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1325 struct tcf_result *res)
1326{
1327 __be16 protocol = skb->protocol;
1328 int err = 0;
1329
1330 for (; tp; tp = tp->next) {
1331 if ((tp->protocol == protocol ||
1332 tp->protocol == htons(ETH_P_ALL)) &&
1333 (err = tp->classify(skb, tp, res)) >= 0) {
1334#ifdef CONFIG_NET_CLS_ACT
1335 if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1336 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1337#endif
1338 return err;
1339 }
1340 }
1341 return -1;
1342}
1343EXPORT_SYMBOL(tc_classify_compat);
1344
/*
 * Main classifier entry point: run tc_classify_compat() and, with
 * CONFIG_NET_CLS_ACT, restart from the head of the chain on
 * TC_ACT_RECLASSIFY, dropping the packet (TC_ACT_SHOT) once the
 * reclassification counter reaches MAX_REC_LOOP to break loops.
 *
 * Fixes vs. original:
 *  - printk() lacked a KERN_* log level; added KERN_WARNING.
 *  - removed the local '__be16 protocol', which was assigned from
 *    skb->protocol but never read (dead store / set-but-unused warning);
 *    tc_classify_compat() reads skb->protocol itself.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;	/* head of chain, for restarts */
reclassify:
#endif

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk(KERN_WARNING
			       "rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio & 0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001375
Patrick McHardya48b5a62007-03-23 11:29:43 -07001376void tcf_destroy(struct tcf_proto *tp)
1377{
1378 tp->ops->destroy(tp);
1379 module_put(tp->ops->owner);
1380 kfree(tp);
1381}
1382
Patrick McHardyff31ab52008-07-01 19:52:38 -07001383void tcf_destroy_chain(struct tcf_proto **fl)
Patrick McHardya48b5a62007-03-23 11:29:43 -07001384{
1385 struct tcf_proto *tp;
1386
Patrick McHardyff31ab52008-07-01 19:52:38 -07001387 while ((tp = *fl) != NULL) {
1388 *fl = tp->next;
Patrick McHardya48b5a62007-03-23 11:29:43 -07001389 tcf_destroy(tp);
1390 }
1391}
1392EXPORT_SYMBOL(tcf_destroy_chain);
1393
Linus Torvalds1da177e2005-04-16 15:20:36 -07001394#ifdef CONFIG_PROC_FS
1395static int psched_show(struct seq_file *seq, void *v)
1396{
Patrick McHardy3c0cfc12007-10-10 16:32:41 -07001397 struct timespec ts;
1398
1399 hrtimer_get_res(CLOCK_MONOTONIC, &ts);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001400 seq_printf(seq, "%08x %08x %08x %08x\n",
Patrick McHardy641b9e02007-03-16 01:18:42 -07001401 (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
Patrick McHardy514bca32007-03-16 12:34:52 -07001402 1000000,
Patrick McHardy3c0cfc12007-10-10 16:32:41 -07001403 (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001404
1405 return 0;
1406}
1407
1408static int psched_open(struct inode *inode, struct file *file)
1409{
1410 return single_open(file, psched_show, PDE(inode)->data);
1411}
1412
Arjan van de Venda7071d2007-02-12 00:55:36 -08001413static const struct file_operations psched_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001414 .owner = THIS_MODULE,
1415 .open = psched_open,
1416 .read = seq_read,
1417 .llseek = seq_lseek,
1418 .release = single_release,
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +09001419};
Linus Torvalds1da177e2005-04-16 15:20:36 -07001420#endif
1421
Linus Torvalds1da177e2005-04-16 15:20:36 -07001422static int __init pktsched_init(void)
1423{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001424 register_qdisc(&pfifo_qdisc_ops);
1425 register_qdisc(&bfifo_qdisc_ops);
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02001426 proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001427
Thomas Grafbe577dd2007-03-22 11:55:50 -07001428 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1429 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1430 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1431 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1432 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1433 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1434
Linus Torvalds1da177e2005-04-16 15:20:36 -07001435 return 0;
1436}
1437
1438subsys_initcall(pktsched_init);