/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>

#include <net/pkt_sched.h>

/* Network Emulation Queuing algorithm.
   ====================================

   Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		Network Emulation Tool"
	    [2] Luigi Rizzo, DummyNet for FreeBSD

   ----------------------------------------------------------------

   This started out as a simple way to delay outgoing packets to
   test TCP but has grown to include most of the functionality
   of a full blown network emulator like NISTnet. It can delay
   packets and add random jitter (and correlation). The random
   distribution can be loaded from a table as well to provide
   normal, Pareto, or experimental curves. Packet loss,
   duplication, and reordering can also be emulated.

   This qdisc does not do classification; that can be handled by
   layering other disciplines. It does not need to do bandwidth
   control either, since that can be handled by using a token
   bucket or other rate control.

   The simulator is limited by the Linux timer resolution
   and will create packet bursts on the HZ boundary (1ms).
*/
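
/*
 * Illustrative userspace usage via the iproute2 tc tool (not part of this
 * file; shown only to orient readers, using the netem option syntax as
 * commonly documented):
 *
 *	# 100ms delay, +/- 10ms jitter, 25% correlation between samples
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *
 *	# add 0.1% random packet loss to the existing settings
 *	tc qdisc change dev eth0 root netem loss 0.1%
 */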

struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct sk_buff_head delayed;
	struct timer_list timer;

	u32 latency;
	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 jitter;
	u32 duplicate;

	struct crndstate {
		unsigned long last;
		unsigned long rho;
	} delay_cor, loss_cor, dup_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};


/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static unsigned long get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
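
/*
 * Worked example of the fixed-point blend above (illustrative only):
 * rho is interpreted as a fraction of 2^32, so with state->rho around
 * 1ul<<31 (i.e. 50% correlation) the expression reduces to roughly
 *
 *	answer ~= (value * 2^31 + last * 2^31) >> 32 = (value + last) / 2
 *
 * i.e. each new value is pulled halfway toward the previous one.
 */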

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static long tabledist(unsigned long mu, long sigma,
		      struct crndstate *state, const struct disttable *dist)
{
	long t, x;
	unsigned long rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
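
/*
 * Sketch of what the fixed-point code above computes, assuming the table
 * entries are samples of the target distribution scaled by NETEM_DIST_SCALE
 * (which is how the userspace-generated delay tables are laid out):
 *
 *	result ~= mu + (t * sigma) / NETEM_DIST_SCALE
 *
 * sigma is split into its quotient and remainder modulo NETEM_DIST_SCALE so
 * the multiplication by t cannot overflow, and the +/- NETEM_DIST_SCALE/2
 * adjustment rounds the remainder term to nearest rather than truncating.
 */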

/* Put skb in the private delayed queue. */
static int netem_delay(struct Qdisc *sch, struct sk_buff *skb)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	psched_tdiff_t td;
	psched_time_t now;

	PSCHED_GET_TIME(now);
	td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);

	/* Always queue at tail to keep packets in order */
	if (likely(q->delayed.qlen < q->limit)) {
		struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;

		PSCHED_TADD2(now, td, cb->time_to_send);

		pr_debug("netem_delay: skb=%p now=%llu tosend=%llu\n", skb,
			 now, cb->time_to_send);

		__skb_queue_tail(&q->delayed, skb);
		return NET_XMIT_SUCCESS;
	}

	pr_debug("netem_delay: queue over limit %d\n", q->limit);
	sch->qstats.overlimits++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}
168
Stephen Hemminger771018e2005-05-03 16:24:32 -0700169/*
170 * Move a packet that is ready to send from the delay holding
171 * list to the underlying qdisc.
172 */
173static int netem_run(struct Qdisc *sch)
174{
175 struct netem_sched_data *q = qdisc_priv(sch);
176 struct sk_buff *skb;
177 psched_time_t now;
178
179 PSCHED_GET_TIME(now);
180
181 skb = skb_peek(&q->delayed);
182 if (skb) {
183 const struct netem_skb_cb *cb
184 = (const struct netem_skb_cb *)skb->cb;
185 long delay
186 = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
187 pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
188
189 /* if more time remaining? */
190 if (delay > 0) {
191 mod_timer(&q->timer, jiffies + delay);
192 return 1;
193 }
194
195 __skb_unlink(skb, &q->delayed);
196
197 if (q->qdisc->enqueue(skb, q->qdisc)) {
198 sch->q.qlen--;
199 sch->qstats.drops++;
200 }
201 }
202
203 return 0;
204}

static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb2;
	int ret;

	pr_debug("netem_enqueue skb=%p\n", skb);

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
		pr_debug("netem_enqueue: random loss\n");
		sch->qstats.drops++;
		kfree_skb(skb);
		return 0;	/* lie about loss so TCP doesn't know */
	}

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)
	    && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		pr_debug("netem_enqueue: dup %p\n", skb2);

		if (netem_delay(sch, skb2) == NET_XMIT_SUCCESS) {
			sch->q.qlen++;
			sch->bstats.bytes += skb2->len;
			sch->bstats.packets++;
		} else
			sch->qstats.drops++;
	}

	/* If doing simple delay then gap == 0, so all packets
	 * go into the delayed holding queue.
	 * Otherwise, when doing out-of-order delivery, only
	 * "1 out of gap" packets will be delayed.
	 */
	if (q->counter < q->gap) {
		++q->counter;
		ret = q->qdisc->enqueue(skb, q->qdisc);
	} else {
		q->counter = 0;
		ret = netem_delay(sch, skb);
		netem_run(sch);
	}

	if (likely(ret == NET_XMIT_SUCCESS)) {
		sch->q.qlen++;
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
	} else
		sch->qstats.drops++;

	return ret;
}
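
/*
 * Illustrative behaviour of the gap/counter logic above: with gap == 0
 * every packet takes the netem_delay() path, so the result is a plain
 * (possibly jittered) delay with ordering preserved.  With a non-zero gap,
 * runs of packets bypass the delay queue and only one packet per run is
 * held back; because it becomes eligible later than packets submitted
 * after it, it pops out of order, which is the intended reordering effect.
 */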

/* Requeue packets but don't change time stamp */
static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int pending;

	pending = netem_run(sch);

	skb = q->qdisc->dequeue(q->qdisc);
	if (skb) {
		pr_debug("netem_dequeue: return skb=%p\n", skb);
		sch->q.qlen--;
		sch->flags &= ~TCQ_F_THROTTLED;
	} else if (pending) {
		pr_debug("netem_dequeue: throttling\n");
		sch->flags |= TCQ_F_THROTTLED;
	}

	return skb;
}

static void netem_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;

	pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen);
	sch->flags &= ~TCQ_F_THROTTLED;
	netif_schedule(sch->dev);
}
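
/*
 * Summary of how the delay path above fits together: when netem_run() sees
 * that the head of the delay queue is not yet due, it arms q->timer and
 * returns nonzero, and netem_dequeue() then sets TCQ_F_THROTTLED so the
 * device stops polling this qdisc.  When the timer fires, netem_watchdog()
 * clears the throttle flag and calls netif_schedule(), so the qdisc is run
 * again and the now-due packet can be moved to the inner qdisc and sent.
 */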

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	skb_queue_purge(&q->delayed);

	sch->q.qlen = 0;
	sch->flags &= ~TCQ_F_THROTTLED;
	del_timer_sync(&q->timer);
}

static int set_fifo_limit(struct Qdisc *q, int limit)
{
	struct rtattr *rta;
	int ret = -ENOMEM;

	rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
	if (rta) {
		rta->rta_type = RTM_NEWQDISC;
		rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
		((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

		ret = q->ops->change(q, rta);
		kfree(rta);
	}
	return ret;
}
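
/*
 * Note on set_fifo_limit() above: it resizes the inner pfifo by hand-building
 * the same rtattr payload a netlink "change qdisc" request would carry and
 * passing it to the child's ->change() op, so no fifo-specific API is needed.
 */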

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
	const __s16 *data = RTA_DATA(attr);
	struct disttable *d;
	int i;

	if (n > 65536)
		return -EINVAL;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	spin_lock_bh(&sch->dev->queue_lock);
	d = xchg(&q->delay_dist, d);
	spin_unlock_bh(&sch->dev->queue_lock);

	kfree(d);
	return 0;
}
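
/*
 * For orientation (not used by the kernel code): the delay distribution
 * table arrives from userspace, where the iproute2 tc tool ships
 * pre-generated distribution files (normal, Pareto, etc.) and sends the
 * chosen one down as the TCA_NETEM_DELAY_DIST attribute parsed here.
 * File names and install paths vary with the iproute2 build.
 */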

static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*c))
		return -EINVAL;

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
	return 0;
}

static int netem_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = RTA_DATA(opt);
	ret = set_fifo_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_debug("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* Handle nested options after initial queue options.
	 * Should have put all options in nested format but too late now.
	 */
	if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
		struct rtattr *tb[TCA_NETEM_MAX];
		if (rtattr_parse(tb, TCA_NETEM_MAX,
				 RTA_DATA(opt) + sizeof(*qopt),
				 RTA_PAYLOAD(opt) - sizeof(*qopt)))
			return -EINVAL;

		if (tb[TCA_NETEM_CORR-1]) {
			ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_DELAY_DIST-1]) {
			ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
			if (ret)
				return ret;
		}
	}

	return 0;
}

static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	skb_queue_head_init(&q->delayed);
	init_timer(&q->timer);
	q->timer.function = netem_watchdog;
	q->timer.data = (unsigned long) sch;
	q->counter = 0;

	q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
	if (!q->qdisc) {
		pr_debug("netem: qdisc create failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_debug("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	del_timer_sync(&q->timer);
	qdisc_destroy(q->qdisc);
	kfree(q->delay_dist);
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb->tail;
	struct rtattr *rta = (struct rtattr *) b;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			    struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		       struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_reset(*old);
	sch->q.qlen = 0;
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			      struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int netem_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

static struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.change		=	netem_change_class,
	.delete		=	netem_delete,
	.walk		=	netem_walk,
	.tcf_chain	=	netem_find_tcf,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.requeue	=	netem_requeue,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");