Blame - net/sched/sch_fq.c - kernel/msm-4.9

blob: 48501a2baf75ed050a2c7da081d458c39f116f60 [file] [log] [blame]

Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	1	/*
				2	* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
				3	*
				4	* Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version
				9	* 2 of the License, or (at your option) any later version.
				10	*
				11	* Meant to be mostly used for locally generated traffic :
				12	* Fast classification depends on skb->sk being set before reaching us.
				13	* If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
				14	* All packets belonging to a socket are considered as a 'flow'.
				15	*
				16	* Flows are dynamically allocated and stored in a hash table of RB trees
				17	* They are also part of one Round Robin 'queues' (new or old flows)
				18	*
				19	* Burst avoidance (aka pacing) capability :
				20	*
				21	* Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
				22	* bunch of packets, and this packet scheduler adds delay between
				23	* packets to respect rate limitation.
				24	*
				25	* enqueue() :
				26	* - lookup one RB tree (out of 1024 or more) to find the flow.
				27	* If non existent flow, create it, add it to the tree.
				28	* Add skb to the per flow list of skb (fifo).
				29	* - Use a special fifo for high prio packets
				30	*
				31	* dequeue() : serves flows in Round Robin
				32	* Note : When a flow becomes empty, we do not immediately remove it from
				33	* rb trees, for performance reasons (it's expected to send additional packets,
				34	* or SLAB cache will reuse socket for another flow)
				35	*/
				36
				37	#include <linux/module.h>
				38	#include <linux/types.h>
				39	#include <linux/kernel.h>
				40	#include <linux/jiffies.h>
				41	#include <linux/string.h>
				42	#include <linux/in.h>
				43	#include <linux/errno.h>
				44	#include <linux/init.h>
				45	#include <linux/skbuff.h>
				46	#include <linux/slab.h>
				47	#include <linux/rbtree.h>
				48	#include <linux/hash.h>
Eric Dumazet	08f89b9	2013-08-30 09:46:43 -0700	[diff] [blame]	49	#include <linux/prefetch.h>
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	50	#include <net/netlink.h>
				51	#include <net/pkt_sched.h>
				52	#include <net/sock.h>
				53	#include <net/tcp_states.h>
				54
				55	/*
				56	* Per flow structure, dynamically allocated
				57	*/
				58	struct fq_flow {
				59	struct sk_buff head; / list of skbs for this flow : first skb */
				60	union {
				61	struct sk_buff tail; / last skb in the list */
				62	unsigned long age; /* jiffies when flow was emptied, for gc */
				63	};
				64	struct rb_node fq_node; /* anchor in fq_root[] trees */
				65	struct sock *sk;
				66	int qlen; /* number of packets in flow queue */
				67	int credit;
				68	u32 socket_hash; /* sk_hash */
				69	struct fq_flow next; / next pointer in RR lists, or &detached */
				70
				71	struct rb_node rate_node; /* anchor in q->delayed tree */
				72	u64 time_next_packet;
				73	};
				74
				75	struct fq_flow_head {
				76	struct fq_flow *first;
				77	struct fq_flow *last;
				78	};
				79
				80	struct fq_sched_data {
				81	struct fq_flow_head new_flows;
				82
				83	struct fq_flow_head old_flows;
				84
				85	struct rb_root delayed; /* for rate limited flows */
				86	u64 time_next_delayed_flow;
				87
				88	struct fq_flow internal; /* for non classified or high prio packets */
				89	u32 quantum;
				90	u32 initial_quantum;
				91	u32 flow_default_rate;/* rate per flow : bytes per second */
				92	u32 flow_max_rate; /* optional max rate per flow */
				93	u32 flow_plimit; /* max packets per flow */
				94	struct rb_root *fq_root;
				95	u8 rate_enable;
				96	u8 fq_trees_log;
				97
				98	u32 flows;
				99	u32 inactive_flows;
				100	u32 throttled_flows;
				101
				102	u64 stat_gc_flows;
				103	u64 stat_internal_packets;
				104	u64 stat_tcp_retrans;
				105	u64 stat_throttled;
				106	u64 stat_flows_plimit;
				107	u64 stat_pkts_too_long;
				108	u64 stat_allocation_errors;
				109	struct qdisc_watchdog watchdog;
				110	};
				111
				112	/* special value to mark a detached flow (not on old/new list) */
				113	static struct fq_flow detached, throttled;
				114
				115	static void fq_flow_set_detached(struct fq_flow *f)
				116	{
				117	f->next = &detached;
				118	}
				119
				120	static bool fq_flow_is_detached(const struct fq_flow *f)
				121	{
				122	return f->next == &detached;
				123	}
				124
				125	static void fq_flow_set_throttled(struct fq_sched_data q, struct fq_flow f)
				126	{
				127	struct rb_node *p = &q->delayed.rb_node, parent = NULL;
				128
				129	while (*p) {
				130	struct fq_flow *aux;
				131
				132	parent = *p;
				133	aux = container_of(parent, struct fq_flow, rate_node);
				134	if (f->time_next_packet >= aux->time_next_packet)
				135	p = &parent->rb_right;
				136	else
				137	p = &parent->rb_left;
				138	}
				139	rb_link_node(&f->rate_node, parent, p);
				140	rb_insert_color(&f->rate_node, &q->delayed);
				141	q->throttled_flows++;
				142	q->stat_throttled++;
				143
				144	f->next = &throttled;
				145	if (q->time_next_delayed_flow > f->time_next_packet)
				146	q->time_next_delayed_flow = f->time_next_packet;
				147	}
				148
				149
				150	static struct kmem_cache *fq_flow_cachep __read_mostly;
				151
				152	static void fq_flow_add_tail(struct fq_flow_head head, struct fq_flow flow)
				153	{
				154	if (head->first)
				155	head->last->next = flow;
				156	else
				157	head->first = flow;
				158	head->last = flow;
				159	flow->next = NULL;
				160	}
				161
				162	/* limit number of collected flows per round */
				163	#define FQ_GC_MAX 8
				164	#define FQ_GC_AGE (3*HZ)
				165
				166	static bool fq_gc_candidate(const struct fq_flow *f)
				167	{
				168	return fq_flow_is_detached(f) &&
				169	time_after(jiffies, f->age + FQ_GC_AGE);
				170	}
				171
				172	static void fq_gc(struct fq_sched_data *q,
				173	struct rb_root *root,
				174	struct sock *sk)
				175	{
				176	struct fq_flow f, tofree[FQ_GC_MAX];
				177	struct rb_node *p, parent;
				178	int fcnt = 0;
				179
				180	p = &root->rb_node;
				181	parent = NULL;
				182	while (*p) {
				183	parent = *p;
				184
				185	f = container_of(parent, struct fq_flow, fq_node);
				186	if (f->sk == sk)
				187	break;
				188
				189	if (fq_gc_candidate(f)) {
				190	tofree[fcnt++] = f;
				191	if (fcnt == FQ_GC_MAX)
				192	break;
				193	}
				194
				195	if (f->sk > sk)
				196	p = &parent->rb_right;
				197	else
				198	p = &parent->rb_left;
				199	}
				200
				201	q->flows -= fcnt;
				202	q->inactive_flows -= fcnt;
				203	q->stat_gc_flows += fcnt;
				204	while (fcnt) {
				205	struct fq_flow *f = tofree[--fcnt];
				206
				207	rb_erase(&f->fq_node, root);
				208	kmem_cache_free(fq_flow_cachep, f);
				209	}
				210	}
				211
				212	static const u8 prio2band[TC_PRIO_MAX + 1] = {
				213	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
				214	};
				215
				216	static struct fq_flow fq_classify(struct sk_buff skb, struct fq_sched_data *q)
				217	{
				218	struct rb_node *p, parent;
				219	struct sock *sk = skb->sk;
				220	struct rb_root *root;
				221	struct fq_flow *f;
				222	int band;
				223
				224	/* warning: no starvation prevention... */
				225	band = prio2band[skb->priority & TC_PRIO_MAX];
				226	if (unlikely(band == 0))
				227	return &q->internal;
				228
				229	if (unlikely(!sk)) {
				230	/* By forcing low order bit to 1, we make sure to not
				231	* collide with a local flow (socket pointers are word aligned)
				232	*/
				233	sk = (struct sock *)(skb_get_rxhash(skb) \| 1L);
				234	}
				235
				236	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
				237
				238	if (q->flows >= (2U << q->fq_trees_log) &&
				239	q->inactive_flows > q->flows/2)
				240	fq_gc(q, root, sk);
				241
				242	p = &root->rb_node;
				243	parent = NULL;
				244	while (*p) {
				245	parent = *p;
				246
				247	f = container_of(parent, struct fq_flow, fq_node);
				248	if (f->sk == sk) {
				249	/* socket might have been reallocated, so check
				250	* if its sk_hash is the same.
				251	* It not, we need to refill credit with
				252	* initial quantum
				253	*/
				254	if (unlikely(skb->sk &&
				255	f->socket_hash != sk->sk_hash)) {
				256	f->credit = q->initial_quantum;
				257	f->socket_hash = sk->sk_hash;
				258	}
				259	return f;
				260	}
				261	if (f->sk > sk)
				262	p = &parent->rb_right;
				263	else
				264	p = &parent->rb_left;
				265	}
				266
				267	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC \| __GFP_NOWARN);
				268	if (unlikely(!f)) {
				269	q->stat_allocation_errors++;
				270	return &q->internal;
				271	}
				272	fq_flow_set_detached(f);
				273	f->sk = sk;
				274	if (skb->sk)
				275	f->socket_hash = sk->sk_hash;
				276	f->credit = q->initial_quantum;
				277
				278	rb_link_node(&f->fq_node, parent, p);
				279	rb_insert_color(&f->fq_node, root);
				280
				281	q->flows++;
				282	q->inactive_flows++;
				283	return f;
				284	}
				285
				286
				287	/* remove one skb from head of flow queue */
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	288	static struct sk_buff fq_dequeue_head(struct Qdisc sch, struct fq_flow *flow)
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	289	{
				290	struct sk_buff *skb = flow->head;
				291
				292	if (skb) {
				293	flow->head = skb->next;
				294	skb->next = NULL;
				295	flow->qlen--;
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	296	sch->qstats.backlog -= qdisc_pkt_len(skb);
				297	sch->q.qlen--;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	298	}
				299	return skb;
				300	}
				301
				302	/* We might add in the future detection of retransmits
				303	* For the time being, just return false
				304	*/
				305	static bool skb_is_retransmit(struct sk_buff *skb)
				306	{
				307	return false;
				308	}
				309
				310	/* add skb to flow queue
				311	* flow queue is a linked list, kind of FIFO, except for TCP retransmits
				312	* We special case tcp retransmits to be transmitted before other packets.
				313	* We rely on fact that TCP retransmits are unlikely, so we do not waste
				314	* a separate queue or a pointer.
				315	* head-> [retrans pkt 1]
				316	* [retrans pkt 2]
				317	* [ normal pkt 1]
				318	* [ normal pkt 2]
				319	* [ normal pkt 3]
				320	* tail-> [ normal pkt 4]
				321	*/
				322	static void flow_queue_add(struct fq_flow flow, struct sk_buff skb)
				323	{
				324	struct sk_buff prev, head = flow->head;
				325
				326	skb->next = NULL;
				327	if (!head) {
				328	flow->head = skb;
				329	flow->tail = skb;
				330	return;
				331	}
				332	if (likely(!skb_is_retransmit(skb))) {
				333	flow->tail->next = skb;
				334	flow->tail = skb;
				335	return;
				336	}
				337
				338	/* This skb is a tcp retransmit,
				339	* find the last retrans packet in the queue
				340	*/
				341	prev = NULL;
				342	while (skb_is_retransmit(head)) {
				343	prev = head;
				344	head = head->next;
				345	if (!head)
				346	break;
				347	}
				348	if (!prev) { /* no rtx packet in queue, become the new head */
				349	skb->next = flow->head;
				350	flow->head = skb;
				351	} else {
				352	if (prev == flow->tail)
				353	flow->tail = skb;
				354	else
				355	skb->next = prev->next;
				356	prev->next = skb;
				357	}
				358	}
				359
				360	static int fq_enqueue(struct sk_buff skb, struct Qdisc sch)
				361	{
				362	struct fq_sched_data *q = qdisc_priv(sch);
				363	struct fq_flow *f;
				364
				365	if (unlikely(sch->q.qlen >= sch->limit))
				366	return qdisc_drop(skb, sch);
				367
				368	f = fq_classify(skb, q);
				369	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
				370	q->stat_flows_plimit++;
				371	return qdisc_drop(skb, sch);
				372	}
				373
				374	f->qlen++;
				375	flow_queue_add(f, skb);
				376	if (skb_is_retransmit(skb))
				377	q->stat_tcp_retrans++;
				378	sch->qstats.backlog += qdisc_pkt_len(skb);
				379	if (fq_flow_is_detached(f)) {
				380	fq_flow_add_tail(&q->new_flows, f);
				381	if (q->quantum > f->credit)
				382	f->credit = q->quantum;
				383	q->inactive_flows--;
				384	qdisc_unthrottled(sch);
				385	}
				386	if (unlikely(f == &q->internal)) {
				387	q->stat_internal_packets++;
				388	qdisc_unthrottled(sch);
				389	}
				390	sch->q.qlen++;
				391
				392	return NET_XMIT_SUCCESS;
				393	}
				394
				395	static void fq_check_throttled(struct fq_sched_data *q, u64 now)
				396	{
				397	struct rb_node *p;
				398
				399	if (q->time_next_delayed_flow > now)
				400	return;
				401
				402	q->time_next_delayed_flow = ~0ULL;
				403	while ((p = rb_first(&q->delayed)) != NULL) {
				404	struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
				405
				406	if (f->time_next_packet > now) {
				407	q->time_next_delayed_flow = f->time_next_packet;
				408	break;
				409	}
				410	rb_erase(p, &q->delayed);
				411	q->throttled_flows--;
				412	fq_flow_add_tail(&q->old_flows, f);
				413	}
				414	}
				415
				416	static struct sk_buff fq_dequeue(struct Qdisc sch)
				417	{
				418	struct fq_sched_data *q = qdisc_priv(sch);
				419	u64 now = ktime_to_ns(ktime_get());
				420	struct fq_flow_head *head;
				421	struct sk_buff *skb;
				422	struct fq_flow *f;
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	423	u32 rate;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	424
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	425	skb = fq_dequeue_head(sch, &q->internal);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	426	if (skb)
				427	goto out;
				428	fq_check_throttled(q, now);
				429	begin:
				430	head = &q->new_flows;
				431	if (!head->first) {
				432	head = &q->old_flows;
				433	if (!head->first) {
				434	if (q->time_next_delayed_flow != ~0ULL)
				435	qdisc_watchdog_schedule_ns(&q->watchdog,
				436	q->time_next_delayed_flow);
				437	return NULL;
				438	}
				439	}
				440	f = head->first;
				441
				442	if (f->credit <= 0) {
				443	f->credit += q->quantum;
				444	head->first = f->next;
				445	fq_flow_add_tail(&q->old_flows, f);
				446	goto begin;
				447	}
				448
				449	if (unlikely(f->head && now < f->time_next_packet)) {
				450	head->first = f->next;
				451	fq_flow_set_throttled(q, f);
				452	goto begin;
				453	}
				454
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	455	skb = fq_dequeue_head(sch, f);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	456	if (!skb) {
				457	head->first = f->next;
				458	/* force a pass through old_flows to prevent starvation */
				459	if ((head == &q->new_flows) && q->old_flows.first) {
				460	fq_flow_add_tail(&q->old_flows, f);
				461	} else {
				462	fq_flow_set_detached(f);
				463	f->age = jiffies;
				464	q->inactive_flows++;
				465	}
				466	goto begin;
				467	}
Eric Dumazet	08f89b9	2013-08-30 09:46:43 -0700	[diff] [blame]	468	prefetch(&skb->end);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	469	f->time_next_packet = now;
				470	f->credit -= qdisc_pkt_len(skb);
				471
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	472	if (f->credit > 0 \|\| !q->rate_enable)
				473	goto out;
				474
				475	if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
				476	rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	477
				478	rate = min(rate, q->flow_max_rate);
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	479	} else {
				480	rate = q->flow_max_rate;
				481	if (rate == ~0U)
				482	goto out;
				483	}
				484	if (rate) {
				485	u32 plen = max(qdisc_pkt_len(skb), q->quantum);
				486	u64 len = (u64)plen * NSEC_PER_SEC;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	487
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	488	do_div(len, rate);
				489	/* Since socket rate can change later,
				490	* clamp the delay to 125 ms.
				491	* TODO: maybe segment the too big skb, as in commit
				492	* e43ac79a4bc ("sch_tbf: segment too big GSO packets")
				493	*/
				494	if (unlikely(len > 125 * NSEC_PER_MSEC)) {
				495	len = 125 * NSEC_PER_MSEC;
				496	q->stat_pkts_too_long++;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	497	}
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	498
				499	f->time_next_packet = now + len;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	500	}
				501	out:
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	502	qdisc_bstats_update(sch, skb);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	503	qdisc_unthrottled(sch);
				504	return skb;
				505	}
				506
				507	static void fq_reset(struct Qdisc *sch)
				508	{
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	509	struct fq_sched_data *q = qdisc_priv(sch);
				510	struct rb_root *root;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	511	struct sk_buff *skb;
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	512	struct rb_node *p;
				513	struct fq_flow *f;
				514	unsigned int idx;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	515
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	516	while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	517	kfree_skb(skb);
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	518
				519	if (!q->fq_root)
				520	return;
				521
				522	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
				523	root = &q->fq_root[idx];
				524	while ((p = rb_first(root)) != NULL) {
				525	f = container_of(p, struct fq_flow, fq_node);
				526	rb_erase(p, root);
				527
				528	while ((skb = fq_dequeue_head(sch, f)) != NULL)
				529	kfree_skb(skb);
				530
				531	kmem_cache_free(fq_flow_cachep, f);
				532	}
				533	}
				534	q->new_flows.first = NULL;
				535	q->old_flows.first = NULL;
				536	q->delayed = RB_ROOT;
				537	q->flows = 0;
				538	q->inactive_flows = 0;
				539	q->throttled_flows = 0;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	540	}
				541
				542	static void fq_rehash(struct fq_sched_data *q,
				543	struct rb_root *old_array, u32 old_log,
				544	struct rb_root *new_array, u32 new_log)
				545	{
				546	struct rb_node op, np, parent;
				547	struct rb_root oroot, nroot;
				548	struct fq_flow of, nf;
				549	int fcnt = 0;
				550	u32 idx;
				551
				552	for (idx = 0; idx < (1U << old_log); idx++) {
				553	oroot = &old_array[idx];
				554	while ((op = rb_first(oroot)) != NULL) {
				555	rb_erase(op, oroot);
				556	of = container_of(op, struct fq_flow, fq_node);
				557	if (fq_gc_candidate(of)) {
				558	fcnt++;
				559	kmem_cache_free(fq_flow_cachep, of);
				560	continue;
				561	}
				562	nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
				563
				564	np = &nroot->rb_node;
				565	parent = NULL;
				566	while (*np) {
				567	parent = *np;
				568
				569	nf = container_of(parent, struct fq_flow, fq_node);
				570	BUG_ON(nf->sk == of->sk);
				571
				572	if (nf->sk > of->sk)
				573	np = &parent->rb_right;
				574	else
				575	np = &parent->rb_left;
				576	}
				577
				578	rb_link_node(&of->fq_node, parent, np);
				579	rb_insert_color(&of->fq_node, nroot);
				580	}
				581	}
				582	q->flows -= fcnt;
				583	q->inactive_flows -= fcnt;
				584	q->stat_gc_flows += fcnt;
				585	}
				586
				587	static int fq_resize(struct fq_sched_data *q, u32 log)
				588	{
				589	struct rb_root *array;
				590	u32 idx;
				591
				592	if (q->fq_root && log == q->fq_trees_log)
				593	return 0;
				594
				595	array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
				596	if (!array)
				597	return -ENOMEM;
				598
				599	for (idx = 0; idx < (1U << log); idx++)
				600	array[idx] = RB_ROOT;
				601
				602	if (q->fq_root) {
				603	fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
				604	kfree(q->fq_root);
				605	}
				606	q->fq_root = array;
				607	q->fq_trees_log = log;
				608
				609	return 0;
				610	}
				611
				612	static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
				613	[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
				614	[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
				615	[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
				616	[TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
				617	[TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
				618	[TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
				619	[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
				620	[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
				621	};
				622
				623	static int fq_change(struct Qdisc sch, struct nlattr opt)
				624	{
				625	struct fq_sched_data *q = qdisc_priv(sch);
				626	struct nlattr *tb[TCA_FQ_MAX + 1];
				627	int err, drop_count = 0;
				628	u32 fq_log;
				629
				630	if (!opt)
				631	return -EINVAL;
				632
				633	err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
				634	if (err < 0)
				635	return err;
				636
				637	sch_tree_lock(sch);
				638
				639	fq_log = q->fq_trees_log;
				640
				641	if (tb[TCA_FQ_BUCKETS_LOG]) {
				642	u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
				643
				644	if (nval >= 1 && nval <= ilog2(256*1024))
				645	fq_log = nval;
				646	else
				647	err = -EINVAL;
				648	}
				649	if (tb[TCA_FQ_PLIMIT])
				650	sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
				651
				652	if (tb[TCA_FQ_FLOW_PLIMIT])
				653	q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
				654
				655	if (tb[TCA_FQ_QUANTUM])
				656	q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
				657
				658	if (tb[TCA_FQ_INITIAL_QUANTUM])
Eric Dumazet	ede869c	2013-10-07 12:50:18 -0700	[diff] [blame^]	659	q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	660
				661	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
				662	q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);
				663
				664	if (tb[TCA_FQ_FLOW_MAX_RATE])
				665	q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
				666
				667	if (tb[TCA_FQ_RATE_ENABLE]) {
				668	u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
				669
				670	if (enable <= 1)
				671	q->rate_enable = enable;
				672	else
				673	err = -EINVAL;
				674	}
				675
				676	if (!err)
				677	err = fq_resize(q, fq_log);
				678
				679	while (sch->q.qlen > sch->limit) {
				680	struct sk_buff *skb = fq_dequeue(sch);
				681
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	682	if (!skb)
				683	break;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	684	kfree_skb(skb);
				685	drop_count++;
				686	}
				687	qdisc_tree_decrease_qlen(sch, drop_count);
				688
				689	sch_tree_unlock(sch);
				690	return err;
				691	}
				692
				693	static void fq_destroy(struct Qdisc *sch)
				694	{
				695	struct fq_sched_data *q = qdisc_priv(sch);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	696
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	697	fq_reset(sch);
				698	kfree(q->fq_root);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	699	qdisc_watchdog_cancel(&q->watchdog);
				700	}
				701
				702	static int fq_init(struct Qdisc sch, struct nlattr opt)
				703	{
				704	struct fq_sched_data *q = qdisc_priv(sch);
				705	int err;
				706
				707	sch->limit = 10000;
				708	q->flow_plimit = 100;
				709	q->quantum = 2 * psched_mtu(qdisc_dev(sch));
				710	q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
				711	q->flow_default_rate = 0;
				712	q->flow_max_rate = ~0U;
				713	q->rate_enable = 1;
				714	q->new_flows.first = NULL;
				715	q->old_flows.first = NULL;
				716	q->delayed = RB_ROOT;
				717	q->fq_root = NULL;
				718	q->fq_trees_log = ilog2(1024);
				719	qdisc_watchdog_init(&q->watchdog, sch);
				720
				721	if (opt)
				722	err = fq_change(sch, opt);
				723	else
				724	err = fq_resize(q, q->fq_trees_log);
				725
				726	return err;
				727	}
				728
				729	static int fq_dump(struct Qdisc sch, struct sk_buff skb)
				730	{
				731	struct fq_sched_data *q = qdisc_priv(sch);
				732	struct nlattr *opts;
				733
				734	opts = nla_nest_start(skb, TCA_OPTIONS);
				735	if (opts == NULL)
				736	goto nla_put_failure;
				737
				738	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) \|\|
				739	nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) \|\|
				740	nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) \|\|
				741	nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) \|\|
				742	nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) \|\|
				743	nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) \|\|
				744	nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) \|\|
				745	nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
				746	goto nla_put_failure;
				747
				748	nla_nest_end(skb, opts);
				749	return skb->len;
				750
				751	nla_put_failure:
				752	return -1;
				753	}
				754
				755	static int fq_dump_stats(struct Qdisc sch, struct gnet_dump d)
				756	{
				757	struct fq_sched_data *q = qdisc_priv(sch);
				758	u64 now = ktime_to_ns(ktime_get());
				759	struct tc_fq_qd_stats st = {
				760	.gc_flows = q->stat_gc_flows,
				761	.highprio_packets = q->stat_internal_packets,
				762	.tcp_retrans = q->stat_tcp_retrans,
				763	.throttled = q->stat_throttled,
				764	.flows_plimit = q->stat_flows_plimit,
				765	.pkts_too_long = q->stat_pkts_too_long,
				766	.allocation_errors = q->stat_allocation_errors,
				767	.flows = q->flows,
				768	.inactive_flows = q->inactive_flows,
				769	.throttled_flows = q->throttled_flows,
				770	.time_next_delayed_flow = q->time_next_delayed_flow - now,
				771	};
				772
				773	return gnet_stats_copy_app(d, &st, sizeof(st));
				774	}
				775
				776	static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
				777	.id = "fq",
				778	.priv_size = sizeof(struct fq_sched_data),
				779
				780	.enqueue = fq_enqueue,
				781	.dequeue = fq_dequeue,
				782	.peek = qdisc_peek_dequeued,
				783	.init = fq_init,
				784	.reset = fq_reset,
				785	.destroy = fq_destroy,
				786	.change = fq_change,
				787	.dump = fq_dump,
				788	.dump_stats = fq_dump_stats,
				789	.owner = THIS_MODULE,
				790	};
				791
				792	static int __init fq_module_init(void)
				793	{
				794	int ret;
				795
				796	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
				797	sizeof(struct fq_flow),
				798	0, 0, NULL);
				799	if (!fq_flow_cachep)
				800	return -ENOMEM;
				801
				802	ret = register_qdisc(&fq_qdisc_ops);
				803	if (ret)
				804	kmem_cache_destroy(fq_flow_cachep);
				805	return ret;
				806	}
				807
				808	static void __exit fq_module_exit(void)
				809	{
				810	unregister_qdisc(&fq_qdisc_ops);
				811	kmem_cache_destroy(fq_flow_cachep);
				812	}
				813
				814	module_init(fq_module_init)
				815	module_exit(fq_module_exit)
				816	MODULE_AUTHOR("Eric Dumazet");
				817	MODULE_LICENSE("GPL");