blob: 31c29deb139d3ed49cf1d7e70bd778438ee8541a [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * net/sched/sch_netem.c Network emulator
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Many of the algorithms and ideas for this came from
10 * NIST Net which is not copyrighted.
11 *
12 * Authors: Stephen Hemminger <shemminger@osdl.org>
13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/errno.h>
22#include <linux/netdevice.h>
23#include <linux/skbuff.h>
24#include <linux/rtnetlink.h>
25
26#include <net/pkt_sched.h>
27
28/* Network Emulation Queuing algorithm.
29 ====================================
30
31 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
32 Network Emulation Tool
33 [2] Luigi Rizzo, DummyNet for FreeBSD
34
35 ----------------------------------------------------------------
36
37 This started out as a simple way to delay outgoing packets to
38 test TCP but has grown to include most of the functionality
39 of a full blown network emulator like NISTnet. It can delay
40 packets and add random jitter (and correlation). The random
41 distribution can be loaded from a table as well to provide
42 normal, Pareto, or experimental curves. Packet loss,
43 duplication, and reordering can also be emulated.
44
45 This qdisc does not do classification that can be handled in
46 layering other disciplines. It does not need to do bandwidth
47 control either since that can be handled by using token
48 bucket or other rate control.
49
50 The simulator is limited by the Linux timer resolution
51 and will create packet bursts on the HZ boundary (1ms).
52*/
53
54struct netem_sched_data {
55 struct Qdisc *qdisc;
56 struct sk_buff_head delayed;
57 struct timer_list timer;
58
59 u32 latency;
60 u32 loss;
61 u32 limit;
62 u32 counter;
63 u32 gap;
64 u32 jitter;
65 u32 duplicate;
66
67 struct crndstate {
68 unsigned long last;
69 unsigned long rho;
70 } delay_cor, loss_cor, dup_cor;
71
72 struct disttable {
73 u32 size;
74 s16 table[0];
75 } *delay_dist;
76};
77
78/* Time stamp put into socket buffer control block */
79struct netem_skb_cb {
80 psched_time_t time_to_send;
81};
82
83/* init_crandom - initialize correlated random number generator
84 * Use entropy source for initial seed.
85 */
86static void init_crandom(struct crndstate *state, unsigned long rho)
87{
88 state->rho = rho;
89 state->last = net_random();
90}
91
92/* get_crandom - correlated random number generator
93 * Next number depends on last value.
94 * rho is scaled to avoid floating point.
95 */
96static unsigned long get_crandom(struct crndstate *state)
97{
98 u64 value, rho;
99 unsigned long answer;
100
101 if (state->rho == 0) /* no correllation */
102 return net_random();
103
104 value = net_random();
105 rho = (u64)state->rho + 1;
106 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
107 state->last = answer;
108 return answer;
109}
110
111/* tabledist - return a pseudo-randomly distributed value with mean mu and
112 * std deviation sigma. Uses table lookup to approximate the desired
113 * distribution, and a uniformly-distributed pseudo-random source.
114 */
115static long tabledist(unsigned long mu, long sigma,
116 struct crndstate *state, const struct disttable *dist)
117{
118 long t, x;
119 unsigned long rnd;
120
121 if (sigma == 0)
122 return mu;
123
124 rnd = get_crandom(state);
125
126 /* default uniform distribution */
127 if (dist == NULL)
128 return (rnd % (2*sigma)) - sigma + mu;
129
130 t = dist->table[rnd % dist->size];
131 x = (sigma % NETEM_DIST_SCALE) * t;
132 if (x >= 0)
133 x += NETEM_DIST_SCALE/2;
134 else
135 x -= NETEM_DIST_SCALE/2;
136
137 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
138}
139
140/* Put skb in the private delayed queue. */
141static int delay_skb(struct Qdisc *sch, struct sk_buff *skb)
142{
143 struct netem_sched_data *q = qdisc_priv(sch);
144 struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
145 psched_tdiff_t td;
146 psched_time_t now;
147
148 PSCHED_GET_TIME(now);
149 td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
150 PSCHED_TADD2(now, td, cb->time_to_send);
151
152 /* Always queue at tail to keep packets in order */
153 if (likely(q->delayed.qlen < q->limit)) {
154 __skb_queue_tail(&q->delayed, skb);
155 if (!timer_pending(&q->timer)) {
156 q->timer.expires = jiffies + PSCHED_US2JIFFIE(td);
157 add_timer(&q->timer);
158 }
159 return NET_XMIT_SUCCESS;
160 }
161
162 kfree_skb(skb);
163 return NET_XMIT_DROP;
164}
165
166static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
167{
168 struct netem_sched_data *q = qdisc_priv(sch);
169 struct sk_buff *skb2;
170 int ret;
171
172 pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies);
173
174 /* Random packet drop 0 => none, ~0 => all */
175 if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
176 pr_debug("netem_enqueue: random loss\n");
177 sch->qstats.drops++;
178 kfree_skb(skb);
179 return 0; /* lie about loss so TCP doesn't know */
180 }
181
182 /* Random duplication */
183 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)
184 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
185 pr_debug("netem_enqueue: dup %p\n", skb2);
186
187 if (delay_skb(sch, skb2)) {
188 sch->q.qlen++;
189 sch->bstats.bytes += skb2->len;
190 sch->bstats.packets++;
191 } else
192 sch->qstats.drops++;
193 }
194
195 /* If doing simple delay then gap == 0 so all packets
196 * go into the delayed holding queue
197 * otherwise if doing out of order only "1 out of gap"
198 * packets will be delayed.
199 */
200 if (q->counter < q->gap) {
201 ++q->counter;
202 ret = q->qdisc->enqueue(skb, q->qdisc);
203 } else {
204 q->counter = 0;
205 ret = delay_skb(sch, skb);
206 }
207
208 if (likely(ret == NET_XMIT_SUCCESS)) {
209 sch->q.qlen++;
210 sch->bstats.bytes += skb->len;
211 sch->bstats.packets++;
212 } else
213 sch->qstats.drops++;
214
215 return ret;
216}
217
218/* Requeue packets but don't change time stamp */
219static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
220{
221 struct netem_sched_data *q = qdisc_priv(sch);
222 int ret;
223
224 if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
225 sch->q.qlen++;
226 sch->qstats.requeues++;
227 }
228
229 return ret;
230}
231
232static unsigned int netem_drop(struct Qdisc* sch)
233{
234 struct netem_sched_data *q = qdisc_priv(sch);
235 unsigned int len;
236
237 if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
238 sch->q.qlen--;
239 sch->qstats.drops++;
240 }
241 return len;
242}
243
244/* Dequeue packet.
245 * Move all packets that are ready to send from the delay holding
246 * list to the underlying qdisc, then just call dequeue
247 */
248static struct sk_buff *netem_dequeue(struct Qdisc *sch)
249{
250 struct netem_sched_data *q = qdisc_priv(sch);
251 struct sk_buff *skb;
252
253 skb = q->qdisc->dequeue(q->qdisc);
254 if (skb)
255 sch->q.qlen--;
256 return skb;
257}
258
259static void netem_watchdog(unsigned long arg)
260{
261 struct Qdisc *sch = (struct Qdisc *)arg;
262 struct netem_sched_data *q = qdisc_priv(sch);
263 struct net_device *dev = sch->dev;
264 struct sk_buff *skb;
265 psched_time_t now;
266
267 pr_debug("netem_watchdog: fired @%lu\n", jiffies);
268
269 spin_lock_bh(&dev->queue_lock);
270 PSCHED_GET_TIME(now);
271
272 while ((skb = skb_peek(&q->delayed)) != NULL) {
273 const struct netem_skb_cb *cb
274 = (const struct netem_skb_cb *)skb->cb;
275 long delay
276 = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
277 pr_debug("netem_watchdog: skb %p@%lu %ld\n",
278 skb, jiffies, delay);
279
280 /* if more time remaining? */
281 if (delay > 0) {
282 mod_timer(&q->timer, jiffies + delay);
283 break;
284 }
285 __skb_unlink(skb, &q->delayed);
286
287 if (q->qdisc->enqueue(skb, q->qdisc)) {
288 sch->q.qlen--;
289 sch->qstats.drops++;
290 }
291 }
292 qdisc_run(dev);
293 spin_unlock_bh(&dev->queue_lock);
294}
295
296static void netem_reset(struct Qdisc *sch)
297{
298 struct netem_sched_data *q = qdisc_priv(sch);
299
300 qdisc_reset(q->qdisc);
301 skb_queue_purge(&q->delayed);
302
303 sch->q.qlen = 0;
304 del_timer_sync(&q->timer);
305}
306
307static int set_fifo_limit(struct Qdisc *q, int limit)
308{
309 struct rtattr *rta;
310 int ret = -ENOMEM;
311
312 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
313 if (rta) {
314 rta->rta_type = RTM_NEWQDISC;
315 rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
316 ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
317
318 ret = q->ops->change(q, rta);
319 kfree(rta);
320 }
321 return ret;
322}
323
324/*
325 * Distribution data is a variable size payload containing
326 * signed 16 bit values.
327 */
328static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
329{
330 struct netem_sched_data *q = qdisc_priv(sch);
331 unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
332 const __s16 *data = RTA_DATA(attr);
333 struct disttable *d;
334 int i;
335
336 if (n > 65536)
337 return -EINVAL;
338
339 d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
340 if (!d)
341 return -ENOMEM;
342
343 d->size = n;
344 for (i = 0; i < n; i++)
345 d->table[i] = data[i];
346
347 spin_lock_bh(&sch->dev->queue_lock);
348 d = xchg(&q->delay_dist, d);
349 spin_unlock_bh(&sch->dev->queue_lock);
350
351 kfree(d);
352 return 0;
353}
354
355static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
356{
357 struct netem_sched_data *q = qdisc_priv(sch);
358 const struct tc_netem_corr *c = RTA_DATA(attr);
359
360 if (RTA_PAYLOAD(attr) != sizeof(*c))
361 return -EINVAL;
362
363 init_crandom(&q->delay_cor, c->delay_corr);
364 init_crandom(&q->loss_cor, c->loss_corr);
365 init_crandom(&q->dup_cor, c->dup_corr);
366 return 0;
367}
368
369static int netem_change(struct Qdisc *sch, struct rtattr *opt)
370{
371 struct netem_sched_data *q = qdisc_priv(sch);
372 struct tc_netem_qopt *qopt;
373 int ret;
374
375 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
376 return -EINVAL;
377
378 qopt = RTA_DATA(opt);
379 ret = set_fifo_limit(q->qdisc, qopt->limit);
380 if (ret) {
381 pr_debug("netem: can't set fifo limit\n");
382 return ret;
383 }
384
385 q->latency = qopt->latency;
386 q->jitter = qopt->jitter;
387 q->limit = qopt->limit;
388 q->gap = qopt->gap;
389 q->loss = qopt->loss;
390 q->duplicate = qopt->duplicate;
391
392 /* Handle nested options after initial queue options.
393 * Should have put all options in nested format but too late now.
394 */
395 if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
396 struct rtattr *tb[TCA_NETEM_MAX];
397 if (rtattr_parse(tb, TCA_NETEM_MAX,
398 RTA_DATA(opt) + sizeof(*qopt),
399 RTA_PAYLOAD(opt) - sizeof(*qopt)))
400 return -EINVAL;
401
402 if (tb[TCA_NETEM_CORR-1]) {
403 ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
404 if (ret)
405 return ret;
406 }
407
408 if (tb[TCA_NETEM_DELAY_DIST-1]) {
409 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
410 if (ret)
411 return ret;
412 }
413 }
414
415
416 return 0;
417}
418
419static int netem_init(struct Qdisc *sch, struct rtattr *opt)
420{
421 struct netem_sched_data *q = qdisc_priv(sch);
422 int ret;
423
424 if (!opt)
425 return -EINVAL;
426
427 skb_queue_head_init(&q->delayed);
428 init_timer(&q->timer);
429 q->timer.function = netem_watchdog;
430 q->timer.data = (unsigned long) sch;
431 q->counter = 0;
432
433 q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
434 if (!q->qdisc) {
435 pr_debug("netem: qdisc create failed\n");
436 return -ENOMEM;
437 }
438
439 ret = netem_change(sch, opt);
440 if (ret) {
441 pr_debug("netem: change failed\n");
442 qdisc_destroy(q->qdisc);
443 }
444 return ret;
445}
446
447static void netem_destroy(struct Qdisc *sch)
448{
449 struct netem_sched_data *q = qdisc_priv(sch);
450
451 del_timer_sync(&q->timer);
452 qdisc_destroy(q->qdisc);
453 kfree(q->delay_dist);
454}
455
456static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
457{
458 const struct netem_sched_data *q = qdisc_priv(sch);
459 unsigned char *b = skb->tail;
460 struct rtattr *rta = (struct rtattr *) b;
461 struct tc_netem_qopt qopt;
462 struct tc_netem_corr cor;
463
464 qopt.latency = q->latency;
465 qopt.jitter = q->jitter;
466 qopt.limit = q->limit;
467 qopt.loss = q->loss;
468 qopt.gap = q->gap;
469 qopt.duplicate = q->duplicate;
470 RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
471
472 cor.delay_corr = q->delay_cor.rho;
473 cor.loss_corr = q->loss_cor.rho;
474 cor.dup_corr = q->dup_cor.rho;
475 RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
476 rta->rta_len = skb->tail - b;
477
478 return skb->len;
479
480rtattr_failure:
481 skb_trim(skb, b - skb->data);
482 return -1;
483}
484
485static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
486 struct sk_buff *skb, struct tcmsg *tcm)
487{
488 struct netem_sched_data *q = qdisc_priv(sch);
489
490 if (cl != 1) /* only one class */
491 return -ENOENT;
492
493 tcm->tcm_handle |= TC_H_MIN(1);
494 tcm->tcm_info = q->qdisc->handle;
495
496 return 0;
497}
498
499static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
500 struct Qdisc **old)
501{
502 struct netem_sched_data *q = qdisc_priv(sch);
503
504 if (new == NULL)
505 new = &noop_qdisc;
506
507 sch_tree_lock(sch);
508 *old = xchg(&q->qdisc, new);
509 qdisc_reset(*old);
510 sch->q.qlen = 0;
511 sch_tree_unlock(sch);
512
513 return 0;
514}
515
516static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
517{
518 struct netem_sched_data *q = qdisc_priv(sch);
519 return q->qdisc;
520}
521
522static unsigned long netem_get(struct Qdisc *sch, u32 classid)
523{
524 return 1;
525}
526
527static void netem_put(struct Qdisc *sch, unsigned long arg)
528{
529}
530
531static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
532 struct rtattr **tca, unsigned long *arg)
533{
534 return -ENOSYS;
535}
536
537static int netem_delete(struct Qdisc *sch, unsigned long arg)
538{
539 return -ENOSYS;
540}
541
542static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
543{
544 if (!walker->stop) {
545 if (walker->count >= walker->skip)
546 if (walker->fn(sch, 1, walker) < 0) {
547 walker->stop = 1;
548 return;
549 }
550 walker->count++;
551 }
552}
553
554static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
555{
556 return NULL;
557}
558
559static struct Qdisc_class_ops netem_class_ops = {
560 .graft = netem_graft,
561 .leaf = netem_leaf,
562 .get = netem_get,
563 .put = netem_put,
564 .change = netem_change_class,
565 .delete = netem_delete,
566 .walk = netem_walk,
567 .tcf_chain = netem_find_tcf,
568 .dump = netem_dump_class,
569};
570
571static struct Qdisc_ops netem_qdisc_ops = {
572 .id = "netem",
573 .cl_ops = &netem_class_ops,
574 .priv_size = sizeof(struct netem_sched_data),
575 .enqueue = netem_enqueue,
576 .dequeue = netem_dequeue,
577 .requeue = netem_requeue,
578 .drop = netem_drop,
579 .init = netem_init,
580 .reset = netem_reset,
581 .destroy = netem_destroy,
582 .change = netem_change,
583 .dump = netem_dump,
584 .owner = THIS_MODULE,
585};
586
587
588static int __init netem_module_init(void)
589{
590 return register_qdisc(&netem_qdisc_ops);
591}
592static void __exit netem_module_exit(void)
593{
594 unregister_qdisc(&netem_qdisc_ops);
595}
596module_init(netem_module_init)
597module_exit(netem_module_exit)
598MODULE_LICENSE("GPL");