[NETEM]: Support time based reordering
Change netem to support packets getting reordered because of variations in
delay. Introduce a special case version of FIFO that queues packets in order
based on the netem delay.
Since netem is classful, those users that don't want jitter based reordering
can just insert a pfifo instead of the default.
This required changes to generic skbuff code to allow finer grain manipulation
of sk_buff_head. Insertion into the middle and reverse walk.
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4286d83..fdfb8fe 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -603,6 +603,30 @@
*/
/**
+ * __skb_queue_after - queue a buffer at the list head
+ * @list: list to use
+ * @prev: place after this buffer
+ * @newsk: buffer to queue
+ *
+ * Queue a buffer int the middle of a list. This function takes no locks
+ * and you must therefore hold required locks before calling it.
+ *
+ * A buffer cannot be placed on two lists at the same time.
+ */
+static inline void __skb_queue_after(struct sk_buff_head *list,
+ struct sk_buff *prev,
+ struct sk_buff *newsk)
+{
+ struct sk_buff *next;
+ list->qlen++;
+
+ next = prev->next;
+ newsk->next = next;
+ newsk->prev = prev;
+ next->prev = prev->next = newsk;
+}
+
+/**
* __skb_queue_head - queue a buffer at the list head
* @list: list to use
* @newsk: buffer to queue
@@ -616,14 +640,7 @@
static inline void __skb_queue_head(struct sk_buff_head *list,
struct sk_buff *newsk)
{
- struct sk_buff *prev, *next;
-
- list->qlen++;
- prev = (struct sk_buff *)list;
- next = prev->next;
- newsk->next = next;
- newsk->prev = prev;
- next->prev = prev->next = newsk;
+ __skb_queue_after(list, (struct sk_buff *)list, newsk);
}
/**
@@ -1203,6 +1220,11 @@
prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \
skb = skb->next)
+#define skb_queue_reverse_walk(queue, skb) \
+ for (skb = (queue)->prev; \
+ prefetch(skb->prev), (skb != (struct sk_buff *)(queue)); \
+ skb = skb->prev)
+
extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
int noblock, int *err);
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index d871fe7..7c10ef3 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -300,11 +300,16 @@
del_timer_sync(&q->timer);
}
+/* Pass size change message down to embedded FIFO */
static int set_fifo_limit(struct Qdisc *q, int limit)
{
struct rtattr *rta;
int ret = -ENOMEM;
+ /* Hack to avoid sending change message to non-FIFO */
+ if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
+ return 0;
+
rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
if (rta) {
rta->rta_type = RTM_NEWQDISC;
@@ -436,6 +441,84 @@
return 0;
}
+/*
+ * Special case version of FIFO queue for use by netem.
+ * It queues in order based on timestamps in skb's
+ */
+struct fifo_sched_data {
+ u32 limit;
+};
+
+static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+ struct fifo_sched_data *q = qdisc_priv(sch);
+ struct sk_buff_head *list = &sch->q;
+ const struct netem_skb_cb *ncb
+ = (const struct netem_skb_cb *)nskb->cb;
+ struct sk_buff *skb;
+
+ if (likely(skb_queue_len(list) < q->limit)) {
+ skb_queue_reverse_walk(list, skb) {
+ const struct netem_skb_cb *cb
+ = (const struct netem_skb_cb *)skb->cb;
+
+ if (PSCHED_TLESS(cb->time_to_send, ncb->time_to_send))
+ break;
+ }
+
+ __skb_queue_after(list, skb, nskb);
+
+ sch->qstats.backlog += nskb->len;
+ sch->bstats.bytes += nskb->len;
+ sch->bstats.packets++;
+
+ return NET_XMIT_SUCCESS;
+ }
+
+ return qdisc_drop(nskb, sch);
+}
+
+static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
+{
+ struct fifo_sched_data *q = qdisc_priv(sch);
+
+ if (opt) {
+ struct tc_fifo_qopt *ctl = RTA_DATA(opt);
+ if (RTA_PAYLOAD(opt) < sizeof(*ctl))
+ return -EINVAL;
+
+ q->limit = ctl->limit;
+ } else
+ q->limit = max_t(u32, sch->dev->tx_queue_len, 1);
+
+ return 0;
+}
+
+static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fifo_sched_data *q = qdisc_priv(sch);
+ struct tc_fifo_qopt opt = { .limit = q->limit };
+
+ RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+ return skb->len;
+
+rtattr_failure:
+ return -1;
+}
+
+static struct Qdisc_ops tfifo_qdisc_ops = {
+ .id = "tfifo",
+ .priv_size = sizeof(struct fifo_sched_data),
+ .enqueue = tfifo_enqueue,
+ .dequeue = qdisc_dequeue_head,
+ .requeue = qdisc_requeue,
+ .drop = qdisc_queue_drop,
+ .init = tfifo_init,
+ .reset = qdisc_reset_queue,
+ .change = tfifo_init,
+ .dump = tfifo_dump,
+};
+
static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
struct netem_sched_data *q = qdisc_priv(sch);
@@ -448,7 +531,7 @@
q->timer.function = netem_watchdog;
q->timer.data = (unsigned long) sch;
- q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+ q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops);
if (!q->qdisc) {
pr_debug("netem: qdisc create failed\n");
return -ENOMEM;