Blame - net/sched/sch_fq.c - kernel/msm-4.9

blob: d4fa38e4af80888ce50c5c7f7fbcc763c7ec200d [file] [log] [blame]

Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	1	/*
				2	* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
				3	*
				4	* Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version
				9	* 2 of the License, or (at your option) any later version.
				10	*
				11	* Meant to be mostly used for locally generated traffic :
				12	* Fast classification depends on skb->sk being set before reaching us.
				13	* If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
				14	* All packets belonging to a socket are considered as a 'flow'.
				15	*
				16	* Flows are dynamically allocated and stored in a hash table of RB trees
				17	* They are also part of one Round Robin 'queues' (new or old flows)
				18	*
				19	* Burst avoidance (aka pacing) capability :
				20	*
				21	* Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
				22	* bunch of packets, and this packet scheduler adds delay between
				23	* packets to respect rate limitation.
				24	*
				25	* enqueue() :
				26	* - lookup one RB tree (out of 1024 or more) to find the flow.
				27	* If non existent flow, create it, add it to the tree.
				28	* Add skb to the per flow list of skb (fifo).
				29	* - Use a special fifo for high prio packets
				30	*
				31	* dequeue() : serves flows in Round Robin
				32	* Note : When a flow becomes empty, we do not immediately remove it from
				33	* rb trees, for performance reasons (it's expected to send additional packets,
				34	* or SLAB cache will reuse socket for another flow)
				35	*/
				36
				37	#include <linux/module.h>
				38	#include <linux/types.h>
				39	#include <linux/kernel.h>
				40	#include <linux/jiffies.h>
				41	#include <linux/string.h>
				42	#include <linux/in.h>
				43	#include <linux/errno.h>
				44	#include <linux/init.h>
				45	#include <linux/skbuff.h>
				46	#include <linux/slab.h>
				47	#include <linux/rbtree.h>
				48	#include <linux/hash.h>
Eric Dumazet	08f89b9	2013-08-30 09:46:43 -0700	[diff] [blame]	49	#include <linux/prefetch.h>
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	50	#include <net/netlink.h>
				51	#include <net/pkt_sched.h>
				52	#include <net/sock.h>
				53	#include <net/tcp_states.h>
				54
				55	/*
				56	* Per flow structure, dynamically allocated
				57	*/
				58	struct fq_flow {
				59	struct sk_buff head; / list of skbs for this flow : first skb */
				60	union {
				61	struct sk_buff tail; / last skb in the list */
				62	unsigned long age; /* jiffies when flow was emptied, for gc */
				63	};
				64	struct rb_node fq_node; /* anchor in fq_root[] trees */
				65	struct sock *sk;
				66	int qlen; /* number of packets in flow queue */
				67	int credit;
				68	u32 socket_hash; /* sk_hash */
				69	struct fq_flow next; / next pointer in RR lists, or &detached */
				70
				71	struct rb_node rate_node; /* anchor in q->delayed tree */
				72	u64 time_next_packet;
				73	};
				74
				75	struct fq_flow_head {
				76	struct fq_flow *first;
				77	struct fq_flow *last;
				78	};
				79
				80	struct fq_sched_data {
				81	struct fq_flow_head new_flows;
				82
				83	struct fq_flow_head old_flows;
				84
				85	struct rb_root delayed; /* for rate limited flows */
				86	u64 time_next_delayed_flow;
				87
				88	struct fq_flow internal; /* for non classified or high prio packets */
				89	u32 quantum;
				90	u32 initial_quantum;
				91	u32 flow_default_rate;/* rate per flow : bytes per second */
				92	u32 flow_max_rate; /* optional max rate per flow */
				93	u32 flow_plimit; /* max packets per flow */
				94	struct rb_root *fq_root;
				95	u8 rate_enable;
				96	u8 fq_trees_log;
				97
				98	u32 flows;
				99	u32 inactive_flows;
				100	u32 throttled_flows;
				101
				102	u64 stat_gc_flows;
				103	u64 stat_internal_packets;
				104	u64 stat_tcp_retrans;
				105	u64 stat_throttled;
				106	u64 stat_flows_plimit;
				107	u64 stat_pkts_too_long;
				108	u64 stat_allocation_errors;
				109	struct qdisc_watchdog watchdog;
				110	};
				111
				112	/* special value to mark a detached flow (not on old/new list) */
				113	static struct fq_flow detached, throttled;
				114
				115	static void fq_flow_set_detached(struct fq_flow *f)
				116	{
				117	f->next = &detached;
				118	}
				119
				120	static bool fq_flow_is_detached(const struct fq_flow *f)
				121	{
				122	return f->next == &detached;
				123	}
				124
				125	static void fq_flow_set_throttled(struct fq_sched_data q, struct fq_flow f)
				126	{
				127	struct rb_node *p = &q->delayed.rb_node, parent = NULL;
				128
				129	while (*p) {
				130	struct fq_flow *aux;
				131
				132	parent = *p;
				133	aux = container_of(parent, struct fq_flow, rate_node);
				134	if (f->time_next_packet >= aux->time_next_packet)
				135	p = &parent->rb_right;
				136	else
				137	p = &parent->rb_left;
				138	}
				139	rb_link_node(&f->rate_node, parent, p);
				140	rb_insert_color(&f->rate_node, &q->delayed);
				141	q->throttled_flows++;
				142	q->stat_throttled++;
				143
				144	f->next = &throttled;
				145	if (q->time_next_delayed_flow > f->time_next_packet)
				146	q->time_next_delayed_flow = f->time_next_packet;
				147	}
				148
				149
				150	static struct kmem_cache *fq_flow_cachep __read_mostly;
				151
				152	static void fq_flow_add_tail(struct fq_flow_head head, struct fq_flow flow)
				153	{
				154	if (head->first)
				155	head->last->next = flow;
				156	else
				157	head->first = flow;
				158	head->last = flow;
				159	flow->next = NULL;
				160	}
				161
				162	/* limit number of collected flows per round */
				163	#define FQ_GC_MAX 8
				164	#define FQ_GC_AGE (3*HZ)
				165
				166	static bool fq_gc_candidate(const struct fq_flow *f)
				167	{
				168	return fq_flow_is_detached(f) &&
				169	time_after(jiffies, f->age + FQ_GC_AGE);
				170	}
				171
				172	static void fq_gc(struct fq_sched_data *q,
				173	struct rb_root *root,
				174	struct sock *sk)
				175	{
				176	struct fq_flow f, tofree[FQ_GC_MAX];
				177	struct rb_node *p, parent;
				178	int fcnt = 0;
				179
				180	p = &root->rb_node;
				181	parent = NULL;
				182	while (*p) {
				183	parent = *p;
				184
				185	f = container_of(parent, struct fq_flow, fq_node);
				186	if (f->sk == sk)
				187	break;
				188
				189	if (fq_gc_candidate(f)) {
				190	tofree[fcnt++] = f;
				191	if (fcnt == FQ_GC_MAX)
				192	break;
				193	}
				194
				195	if (f->sk > sk)
				196	p = &parent->rb_right;
				197	else
				198	p = &parent->rb_left;
				199	}
				200
				201	q->flows -= fcnt;
				202	q->inactive_flows -= fcnt;
				203	q->stat_gc_flows += fcnt;
				204	while (fcnt) {
				205	struct fq_flow *f = tofree[--fcnt];
				206
				207	rb_erase(&f->fq_node, root);
				208	kmem_cache_free(fq_flow_cachep, f);
				209	}
				210	}
				211
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	212	static struct fq_flow fq_classify(struct sk_buff skb, struct fq_sched_data *q)
				213	{
				214	struct rb_node *p, parent;
				215	struct sock *sk = skb->sk;
				216	struct rb_root *root;
				217	struct fq_flow *f;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	218
				219	/* warning: no starvation prevention... */
Maciej Żenczykowski	2abc2f0	2013-11-14 08:50:43 -0800	[diff] [blame]	220	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	221	return &q->internal;
				222
				223	if (unlikely(!sk)) {
				224	/* By forcing low order bit to 1, we make sure to not
				225	* collide with a local flow (socket pointers are word aligned)
				226	*/
				227	sk = (struct sock *)(skb_get_rxhash(skb) \| 1L);
				228	}
				229
				230	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
				231
				232	if (q->flows >= (2U << q->fq_trees_log) &&
				233	q->inactive_flows > q->flows/2)
				234	fq_gc(q, root, sk);
				235
				236	p = &root->rb_node;
				237	parent = NULL;
				238	while (*p) {
				239	parent = *p;
				240
				241	f = container_of(parent, struct fq_flow, fq_node);
				242	if (f->sk == sk) {
				243	/* socket might have been reallocated, so check
				244	* if its sk_hash is the same.
				245	* It not, we need to refill credit with
				246	* initial quantum
				247	*/
				248	if (unlikely(skb->sk &&
				249	f->socket_hash != sk->sk_hash)) {
				250	f->credit = q->initial_quantum;
				251	f->socket_hash = sk->sk_hash;
Eric Dumazet	fc59d5b	2013-10-27 16:26:43 -0700	[diff] [blame]	252	f->time_next_packet = 0ULL;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	253	}
				254	return f;
				255	}
				256	if (f->sk > sk)
				257	p = &parent->rb_right;
				258	else
				259	p = &parent->rb_left;
				260	}
				261
				262	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC \| __GFP_NOWARN);
				263	if (unlikely(!f)) {
				264	q->stat_allocation_errors++;
				265	return &q->internal;
				266	}
				267	fq_flow_set_detached(f);
				268	f->sk = sk;
				269	if (skb->sk)
				270	f->socket_hash = sk->sk_hash;
				271	f->credit = q->initial_quantum;
				272
				273	rb_link_node(&f->fq_node, parent, p);
				274	rb_insert_color(&f->fq_node, root);
				275
				276	q->flows++;
				277	q->inactive_flows++;
				278	return f;
				279	}
				280
				281
				282	/* remove one skb from head of flow queue */
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	283	static struct sk_buff fq_dequeue_head(struct Qdisc sch, struct fq_flow *flow)
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	284	{
				285	struct sk_buff *skb = flow->head;
				286
				287	if (skb) {
				288	flow->head = skb->next;
				289	skb->next = NULL;
				290	flow->qlen--;
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	291	sch->qstats.backlog -= qdisc_pkt_len(skb);
				292	sch->q.qlen--;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	293	}
				294	return skb;
				295	}
				296
				297	/* We might add in the future detection of retransmits
				298	* For the time being, just return false
				299	*/
				300	static bool skb_is_retransmit(struct sk_buff *skb)
				301	{
				302	return false;
				303	}
				304
				305	/* add skb to flow queue
				306	* flow queue is a linked list, kind of FIFO, except for TCP retransmits
				307	* We special case tcp retransmits to be transmitted before other packets.
				308	* We rely on fact that TCP retransmits are unlikely, so we do not waste
				309	* a separate queue or a pointer.
				310	* head-> [retrans pkt 1]
				311	* [retrans pkt 2]
				312	* [ normal pkt 1]
				313	* [ normal pkt 2]
				314	* [ normal pkt 3]
				315	* tail-> [ normal pkt 4]
				316	*/
				317	static void flow_queue_add(struct fq_flow flow, struct sk_buff skb)
				318	{
				319	struct sk_buff prev, head = flow->head;
				320
				321	skb->next = NULL;
				322	if (!head) {
				323	flow->head = skb;
				324	flow->tail = skb;
				325	return;
				326	}
				327	if (likely(!skb_is_retransmit(skb))) {
				328	flow->tail->next = skb;
				329	flow->tail = skb;
				330	return;
				331	}
				332
				333	/* This skb is a tcp retransmit,
				334	* find the last retrans packet in the queue
				335	*/
				336	prev = NULL;
				337	while (skb_is_retransmit(head)) {
				338	prev = head;
				339	head = head->next;
				340	if (!head)
				341	break;
				342	}
				343	if (!prev) { /* no rtx packet in queue, become the new head */
				344	skb->next = flow->head;
				345	flow->head = skb;
				346	} else {
				347	if (prev == flow->tail)
				348	flow->tail = skb;
				349	else
				350	skb->next = prev->next;
				351	prev->next = skb;
				352	}
				353	}
				354
				355	static int fq_enqueue(struct sk_buff skb, struct Qdisc sch)
				356	{
				357	struct fq_sched_data *q = qdisc_priv(sch);
				358	struct fq_flow *f;
				359
				360	if (unlikely(sch->q.qlen >= sch->limit))
				361	return qdisc_drop(skb, sch);
				362
				363	f = fq_classify(skb, q);
				364	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
				365	q->stat_flows_plimit++;
				366	return qdisc_drop(skb, sch);
				367	}
				368
				369	f->qlen++;
				370	flow_queue_add(f, skb);
				371	if (skb_is_retransmit(skb))
				372	q->stat_tcp_retrans++;
				373	sch->qstats.backlog += qdisc_pkt_len(skb);
				374	if (fq_flow_is_detached(f)) {
				375	fq_flow_add_tail(&q->new_flows, f);
				376	if (q->quantum > f->credit)
				377	f->credit = q->quantum;
				378	q->inactive_flows--;
				379	qdisc_unthrottled(sch);
				380	}
				381	if (unlikely(f == &q->internal)) {
				382	q->stat_internal_packets++;
				383	qdisc_unthrottled(sch);
				384	}
				385	sch->q.qlen++;
				386
				387	return NET_XMIT_SUCCESS;
				388	}
				389
				390	static void fq_check_throttled(struct fq_sched_data *q, u64 now)
				391	{
				392	struct rb_node *p;
				393
				394	if (q->time_next_delayed_flow > now)
				395	return;
				396
				397	q->time_next_delayed_flow = ~0ULL;
				398	while ((p = rb_first(&q->delayed)) != NULL) {
				399	struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
				400
				401	if (f->time_next_packet > now) {
				402	q->time_next_delayed_flow = f->time_next_packet;
				403	break;
				404	}
				405	rb_erase(p, &q->delayed);
				406	q->throttled_flows--;
				407	fq_flow_add_tail(&q->old_flows, f);
				408	}
				409	}
				410
				411	static struct sk_buff fq_dequeue(struct Qdisc sch)
				412	{
				413	struct fq_sched_data *q = qdisc_priv(sch);
				414	u64 now = ktime_to_ns(ktime_get());
				415	struct fq_flow_head *head;
				416	struct sk_buff *skb;
				417	struct fq_flow *f;
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	418	u32 rate;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	419
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	420	skb = fq_dequeue_head(sch, &q->internal);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	421	if (skb)
				422	goto out;
				423	fq_check_throttled(q, now);
				424	begin:
				425	head = &q->new_flows;
				426	if (!head->first) {
				427	head = &q->old_flows;
				428	if (!head->first) {
				429	if (q->time_next_delayed_flow != ~0ULL)
				430	qdisc_watchdog_schedule_ns(&q->watchdog,
				431	q->time_next_delayed_flow);
				432	return NULL;
				433	}
				434	}
				435	f = head->first;
				436
				437	if (f->credit <= 0) {
				438	f->credit += q->quantum;
				439	head->first = f->next;
				440	fq_flow_add_tail(&q->old_flows, f);
				441	goto begin;
				442	}
				443
				444	if (unlikely(f->head && now < f->time_next_packet)) {
				445	head->first = f->next;
				446	fq_flow_set_throttled(q, f);
				447	goto begin;
				448	}
				449
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	450	skb = fq_dequeue_head(sch, f);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	451	if (!skb) {
				452	head->first = f->next;
				453	/* force a pass through old_flows to prevent starvation */
				454	if ((head == &q->new_flows) && q->old_flows.first) {
				455	fq_flow_add_tail(&q->old_flows, f);
				456	} else {
				457	fq_flow_set_detached(f);
				458	f->age = jiffies;
				459	q->inactive_flows++;
				460	}
				461	goto begin;
				462	}
Eric Dumazet	08f89b9	2013-08-30 09:46:43 -0700	[diff] [blame]	463	prefetch(&skb->end);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	464	f->time_next_packet = now;
				465	f->credit -= qdisc_pkt_len(skb);
				466
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	467	if (f->credit > 0 \|\| !q->rate_enable)
				468	goto out;
				469
Eric Dumazet	7eec417	2013-10-08 15:16:00 -0700	[diff] [blame]	470	rate = q->flow_max_rate;
				471	if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT)
				472	rate = min(skb->sk->sk_pacing_rate, rate);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	473
Eric Dumazet	7eec417	2013-10-08 15:16:00 -0700	[diff] [blame]	474	if (rate != ~0U) {
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	475	u32 plen = max(qdisc_pkt_len(skb), q->quantum);
				476	u64 len = (u64)plen * NSEC_PER_SEC;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	477
Eric Dumazet	7eec417	2013-10-08 15:16:00 -0700	[diff] [blame]	478	if (likely(rate))
				479	do_div(len, rate);
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	480	/* Since socket rate can change later,
				481	* clamp the delay to 125 ms.
				482	* TODO: maybe segment the too big skb, as in commit
				483	* e43ac79a4bc ("sch_tbf: segment too big GSO packets")
				484	*/
				485	if (unlikely(len > 125 * NSEC_PER_MSEC)) {
				486	len = 125 * NSEC_PER_MSEC;
				487	q->stat_pkts_too_long++;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	488	}
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	489
				490	f->time_next_packet = now + len;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	491	}
				492	out:
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	493	qdisc_bstats_update(sch, skb);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	494	qdisc_unthrottled(sch);
				495	return skb;
				496	}
				497
				498	static void fq_reset(struct Qdisc *sch)
				499	{
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	500	struct fq_sched_data *q = qdisc_priv(sch);
				501	struct rb_root *root;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	502	struct sk_buff *skb;
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	503	struct rb_node *p;
				504	struct fq_flow *f;
				505	unsigned int idx;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	506
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	507	while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	508	kfree_skb(skb);
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	509
				510	if (!q->fq_root)
				511	return;
				512
				513	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
				514	root = &q->fq_root[idx];
				515	while ((p = rb_first(root)) != NULL) {
				516	f = container_of(p, struct fq_flow, fq_node);
				517	rb_erase(p, root);
				518
				519	while ((skb = fq_dequeue_head(sch, f)) != NULL)
				520	kfree_skb(skb);
				521
				522	kmem_cache_free(fq_flow_cachep, f);
				523	}
				524	}
				525	q->new_flows.first = NULL;
				526	q->old_flows.first = NULL;
				527	q->delayed = RB_ROOT;
				528	q->flows = 0;
				529	q->inactive_flows = 0;
				530	q->throttled_flows = 0;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	531	}
				532
				533	static void fq_rehash(struct fq_sched_data *q,
				534	struct rb_root *old_array, u32 old_log,
				535	struct rb_root *new_array, u32 new_log)
				536	{
				537	struct rb_node op, np, parent;
				538	struct rb_root oroot, nroot;
				539	struct fq_flow of, nf;
				540	int fcnt = 0;
				541	u32 idx;
				542
				543	for (idx = 0; idx < (1U << old_log); idx++) {
				544	oroot = &old_array[idx];
				545	while ((op = rb_first(oroot)) != NULL) {
				546	rb_erase(op, oroot);
				547	of = container_of(op, struct fq_flow, fq_node);
				548	if (fq_gc_candidate(of)) {
				549	fcnt++;
				550	kmem_cache_free(fq_flow_cachep, of);
				551	continue;
				552	}
				553	nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
				554
				555	np = &nroot->rb_node;
				556	parent = NULL;
				557	while (*np) {
				558	parent = *np;
				559
				560	nf = container_of(parent, struct fq_flow, fq_node);
				561	BUG_ON(nf->sk == of->sk);
				562
				563	if (nf->sk > of->sk)
				564	np = &parent->rb_right;
				565	else
				566	np = &parent->rb_left;
				567	}
				568
				569	rb_link_node(&of->fq_node, parent, np);
				570	rb_insert_color(&of->fq_node, nroot);
				571	}
				572	}
				573	q->flows -= fcnt;
				574	q->inactive_flows -= fcnt;
				575	q->stat_gc_flows += fcnt;
				576	}
				577
				578	static int fq_resize(struct fq_sched_data *q, u32 log)
				579	{
				580	struct rb_root *array;
				581	u32 idx;
				582
				583	if (q->fq_root && log == q->fq_trees_log)
				584	return 0;
				585
				586	array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
				587	if (!array)
				588	return -ENOMEM;
				589
				590	for (idx = 0; idx < (1U << log); idx++)
				591	array[idx] = RB_ROOT;
				592
				593	if (q->fq_root) {
				594	fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
				595	kfree(q->fq_root);
				596	}
				597	q->fq_root = array;
				598	q->fq_trees_log = log;
				599
				600	return 0;
				601	}
				602
				603	static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
				604	[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
				605	[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
				606	[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
				607	[TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
				608	[TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
				609	[TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
				610	[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
				611	[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
				612	};
				613
				614	static int fq_change(struct Qdisc sch, struct nlattr opt)
				615	{
				616	struct fq_sched_data *q = qdisc_priv(sch);
				617	struct nlattr *tb[TCA_FQ_MAX + 1];
				618	int err, drop_count = 0;
				619	u32 fq_log;
				620
				621	if (!opt)
				622	return -EINVAL;
				623
				624	err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
				625	if (err < 0)
				626	return err;
				627
				628	sch_tree_lock(sch);
				629
				630	fq_log = q->fq_trees_log;
				631
				632	if (tb[TCA_FQ_BUCKETS_LOG]) {
				633	u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
				634
				635	if (nval >= 1 && nval <= ilog2(256*1024))
				636	fq_log = nval;
				637	else
				638	err = -EINVAL;
				639	}
				640	if (tb[TCA_FQ_PLIMIT])
				641	sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
				642
				643	if (tb[TCA_FQ_FLOW_PLIMIT])
				644	q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
				645
				646	if (tb[TCA_FQ_QUANTUM])
				647	q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
				648
				649	if (tb[TCA_FQ_INITIAL_QUANTUM])
Eric Dumazet	ede869c	2013-10-07 12:50:18 -0700	[diff] [blame]	650	q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	651
				652	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
				653	q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);
				654
				655	if (tb[TCA_FQ_FLOW_MAX_RATE])
				656	q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
				657
				658	if (tb[TCA_FQ_RATE_ENABLE]) {
				659	u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
				660
				661	if (enable <= 1)
				662	q->rate_enable = enable;
				663	else
				664	err = -EINVAL;
				665	}
				666
				667	if (!err)
				668	err = fq_resize(q, fq_log);
				669
				670	while (sch->q.qlen > sch->limit) {
				671	struct sk_buff *skb = fq_dequeue(sch);
				672
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	673	if (!skb)
				674	break;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	675	kfree_skb(skb);
				676	drop_count++;
				677	}
				678	qdisc_tree_decrease_qlen(sch, drop_count);
				679
				680	sch_tree_unlock(sch);
				681	return err;
				682	}
				683
				684	static void fq_destroy(struct Qdisc *sch)
				685	{
				686	struct fq_sched_data *q = qdisc_priv(sch);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	687
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	688	fq_reset(sch);
				689	kfree(q->fq_root);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	690	qdisc_watchdog_cancel(&q->watchdog);
				691	}
				692
				693	static int fq_init(struct Qdisc sch, struct nlattr opt)
				694	{
				695	struct fq_sched_data *q = qdisc_priv(sch);
				696	int err;
				697
				698	sch->limit = 10000;
				699	q->flow_plimit = 100;
				700	q->quantum = 2 * psched_mtu(qdisc_dev(sch));
				701	q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
				702	q->flow_default_rate = 0;
				703	q->flow_max_rate = ~0U;
				704	q->rate_enable = 1;
				705	q->new_flows.first = NULL;
				706	q->old_flows.first = NULL;
				707	q->delayed = RB_ROOT;
				708	q->fq_root = NULL;
				709	q->fq_trees_log = ilog2(1024);
				710	qdisc_watchdog_init(&q->watchdog, sch);
				711
				712	if (opt)
				713	err = fq_change(sch, opt);
				714	else
				715	err = fq_resize(q, q->fq_trees_log);
				716
				717	return err;
				718	}
				719
				720	static int fq_dump(struct Qdisc sch, struct sk_buff skb)
				721	{
				722	struct fq_sched_data *q = qdisc_priv(sch);
				723	struct nlattr *opts;
				724
				725	opts = nla_nest_start(skb, TCA_OPTIONS);
				726	if (opts == NULL)
				727	goto nla_put_failure;
				728
Eric Dumazet	7eec417	2013-10-08 15:16:00 -0700	[diff] [blame]	729	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore,
				730	* do not bother giving its value
				731	*/
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	732	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) \|\|
				733	nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) \|\|
				734	nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) \|\|
				735	nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) \|\|
				736	nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) \|\|
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	737	nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) \|\|
				738	nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
				739	goto nla_put_failure;
				740
				741	nla_nest_end(skb, opts);
				742	return skb->len;
				743
				744	nla_put_failure:
				745	return -1;
				746	}
				747
				748	static int fq_dump_stats(struct Qdisc sch, struct gnet_dump d)
				749	{
				750	struct fq_sched_data *q = qdisc_priv(sch);
				751	u64 now = ktime_to_ns(ktime_get());
				752	struct tc_fq_qd_stats st = {
				753	.gc_flows = q->stat_gc_flows,
				754	.highprio_packets = q->stat_internal_packets,
				755	.tcp_retrans = q->stat_tcp_retrans,
				756	.throttled = q->stat_throttled,
				757	.flows_plimit = q->stat_flows_plimit,
				758	.pkts_too_long = q->stat_pkts_too_long,
				759	.allocation_errors = q->stat_allocation_errors,
				760	.flows = q->flows,
				761	.inactive_flows = q->inactive_flows,
				762	.throttled_flows = q->throttled_flows,
				763	.time_next_delayed_flow = q->time_next_delayed_flow - now,
				764	};
				765
				766	return gnet_stats_copy_app(d, &st, sizeof(st));
				767	}
				768
				769	static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
				770	.id = "fq",
				771	.priv_size = sizeof(struct fq_sched_data),
				772
				773	.enqueue = fq_enqueue,
				774	.dequeue = fq_dequeue,
				775	.peek = qdisc_peek_dequeued,
				776	.init = fq_init,
				777	.reset = fq_reset,
				778	.destroy = fq_destroy,
				779	.change = fq_change,
				780	.dump = fq_dump,
				781	.dump_stats = fq_dump_stats,
				782	.owner = THIS_MODULE,
				783	};
				784
				785	static int __init fq_module_init(void)
				786	{
				787	int ret;
				788
				789	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
				790	sizeof(struct fq_flow),
				791	0, 0, NULL);
				792	if (!fq_flow_cachep)
				793	return -ENOMEM;
				794
				795	ret = register_qdisc(&fq_qdisc_ops);
				796	if (ret)
				797	kmem_cache_destroy(fq_flow_cachep);
				798	return ret;
				799	}
				800
				801	static void __exit fq_module_exit(void)
				802	{
				803	unregister_qdisc(&fq_qdisc_ops);
				804	kmem_cache_destroy(fq_flow_cachep);
				805	}
				806
				807	module_init(fq_module_init)
				808	module_exit(fq_module_exit)
				809	MODULE_AUTHOR("Eric Dumazet");
				810	MODULE_LICENSE("GPL");