Blame - net/sched/sch_fq.c - kernel/msm-4.9

blob: 32ad015ee8ce4a9c5b967c22dd90631881f2362b [file] [log] [blame]

Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	1	/*
				2	* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
				3	*
				4	* Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version
				9	* 2 of the License, or (at your option) any later version.
				10	*
				11	* Meant to be mostly used for locally generated traffic :
				12	* Fast classification depends on skb->sk being set before reaching us.
				13	* If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
				14	* All packets belonging to a socket are considered as a 'flow'.
				15	*
				16	* Flows are dynamically allocated and stored in a hash table of RB trees
				17	* They are also part of one Round Robin 'queues' (new or old flows)
				18	*
				19	* Burst avoidance (aka pacing) capability :
				20	*
				21	* Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
				22	* bunch of packets, and this packet scheduler adds delay between
				23	* packets to respect rate limitation.
				24	*
				25	* enqueue() :
				26	* - lookup one RB tree (out of 1024 or more) to find the flow.
				27	* If non existent flow, create it, add it to the tree.
				28	* Add skb to the per flow list of skb (fifo).
				29	* - Use a special fifo for high prio packets
				30	*
				31	* dequeue() : serves flows in Round Robin
				32	* Note : When a flow becomes empty, we do not immediately remove it from
				33	* rb trees, for performance reasons (it's expected to send additional packets,
				34	* or SLAB cache will reuse socket for another flow)
				35	*/
				36
				37	#include <linux/module.h>
				38	#include <linux/types.h>
				39	#include <linux/kernel.h>
				40	#include <linux/jiffies.h>
				41	#include <linux/string.h>
				42	#include <linux/in.h>
				43	#include <linux/errno.h>
				44	#include <linux/init.h>
				45	#include <linux/skbuff.h>
				46	#include <linux/slab.h>
				47	#include <linux/rbtree.h>
				48	#include <linux/hash.h>
Eric Dumazet	08f89b9	2013-08-30 09:46:43 -0700	[diff] [blame]	49	#include <linux/prefetch.h>
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	50	#include <net/netlink.h>
				51	#include <net/pkt_sched.h>
				52	#include <net/sock.h>
				53	#include <net/tcp_states.h>
				54
				55	/*
				56	* Per flow structure, dynamically allocated
				57	*/
				58	struct fq_flow {
				59	struct sk_buff head; / list of skbs for this flow : first skb */
				60	union {
				61	struct sk_buff tail; / last skb in the list */
				62	unsigned long age; /* jiffies when flow was emptied, for gc */
				63	};
				64	struct rb_node fq_node; /* anchor in fq_root[] trees */
				65	struct sock *sk;
				66	int qlen; /* number of packets in flow queue */
				67	int credit;
				68	u32 socket_hash; /* sk_hash */
				69	struct fq_flow next; / next pointer in RR lists, or &detached */
				70
				71	struct rb_node rate_node; /* anchor in q->delayed tree */
				72	u64 time_next_packet;
				73	};
				74
				75	struct fq_flow_head {
				76	struct fq_flow *first;
				77	struct fq_flow *last;
				78	};
				79
				80	struct fq_sched_data {
				81	struct fq_flow_head new_flows;
				82
				83	struct fq_flow_head old_flows;
				84
				85	struct rb_root delayed; /* for rate limited flows */
				86	u64 time_next_delayed_flow;
				87
				88	struct fq_flow internal; /* for non classified or high prio packets */
				89	u32 quantum;
				90	u32 initial_quantum;
				91	u32 flow_default_rate;/* rate per flow : bytes per second */
				92	u32 flow_max_rate; /* optional max rate per flow */
				93	u32 flow_plimit; /* max packets per flow */
				94	struct rb_root *fq_root;
				95	u8 rate_enable;
				96	u8 fq_trees_log;
				97
				98	u32 flows;
				99	u32 inactive_flows;
				100	u32 throttled_flows;
				101
				102	u64 stat_gc_flows;
				103	u64 stat_internal_packets;
				104	u64 stat_tcp_retrans;
				105	u64 stat_throttled;
				106	u64 stat_flows_plimit;
				107	u64 stat_pkts_too_long;
				108	u64 stat_allocation_errors;
				109	struct qdisc_watchdog watchdog;
				110	};
				111
				112	/* special value to mark a detached flow (not on old/new list) */
				113	static struct fq_flow detached, throttled;
				114
				115	static void fq_flow_set_detached(struct fq_flow *f)
				116	{
				117	f->next = &detached;
				118	}
				119
				120	static bool fq_flow_is_detached(const struct fq_flow *f)
				121	{
				122	return f->next == &detached;
				123	}
				124
				125	static void fq_flow_set_throttled(struct fq_sched_data q, struct fq_flow f)
				126	{
				127	struct rb_node *p = &q->delayed.rb_node, parent = NULL;
				128
				129	while (*p) {
				130	struct fq_flow *aux;
				131
				132	parent = *p;
				133	aux = container_of(parent, struct fq_flow, rate_node);
				134	if (f->time_next_packet >= aux->time_next_packet)
				135	p = &parent->rb_right;
				136	else
				137	p = &parent->rb_left;
				138	}
				139	rb_link_node(&f->rate_node, parent, p);
				140	rb_insert_color(&f->rate_node, &q->delayed);
				141	q->throttled_flows++;
				142	q->stat_throttled++;
				143
				144	f->next = &throttled;
				145	if (q->time_next_delayed_flow > f->time_next_packet)
				146	q->time_next_delayed_flow = f->time_next_packet;
				147	}
				148
				149
				150	static struct kmem_cache *fq_flow_cachep __read_mostly;
				151
				152	static void fq_flow_add_tail(struct fq_flow_head head, struct fq_flow flow)
				153	{
				154	if (head->first)
				155	head->last->next = flow;
				156	else
				157	head->first = flow;
				158	head->last = flow;
				159	flow->next = NULL;
				160	}
				161
				162	/* limit number of collected flows per round */
				163	#define FQ_GC_MAX 8
				164	#define FQ_GC_AGE (3*HZ)
				165
				166	static bool fq_gc_candidate(const struct fq_flow *f)
				167	{
				168	return fq_flow_is_detached(f) &&
				169	time_after(jiffies, f->age + FQ_GC_AGE);
				170	}
				171
				172	static void fq_gc(struct fq_sched_data *q,
				173	struct rb_root *root,
				174	struct sock *sk)
				175	{
				176	struct fq_flow f, tofree[FQ_GC_MAX];
				177	struct rb_node *p, parent;
				178	int fcnt = 0;
				179
				180	p = &root->rb_node;
				181	parent = NULL;
				182	while (*p) {
				183	parent = *p;
				184
				185	f = container_of(parent, struct fq_flow, fq_node);
				186	if (f->sk == sk)
				187	break;
				188
				189	if (fq_gc_candidate(f)) {
				190	tofree[fcnt++] = f;
				191	if (fcnt == FQ_GC_MAX)
				192	break;
				193	}
				194
				195	if (f->sk > sk)
				196	p = &parent->rb_right;
				197	else
				198	p = &parent->rb_left;
				199	}
				200
				201	q->flows -= fcnt;
				202	q->inactive_flows -= fcnt;
				203	q->stat_gc_flows += fcnt;
				204	while (fcnt) {
				205	struct fq_flow *f = tofree[--fcnt];
				206
				207	rb_erase(&f->fq_node, root);
				208	kmem_cache_free(fq_flow_cachep, f);
				209	}
				210	}
				211
				212	static const u8 prio2band[TC_PRIO_MAX + 1] = {
				213	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
				214	};
				215
				216	static struct fq_flow fq_classify(struct sk_buff skb, struct fq_sched_data *q)
				217	{
				218	struct rb_node *p, parent;
				219	struct sock *sk = skb->sk;
				220	struct rb_root *root;
				221	struct fq_flow *f;
				222	int band;
				223
				224	/* warning: no starvation prevention... */
				225	band = prio2band[skb->priority & TC_PRIO_MAX];
				226	if (unlikely(band == 0))
				227	return &q->internal;
				228
				229	if (unlikely(!sk)) {
				230	/* By forcing low order bit to 1, we make sure to not
				231	* collide with a local flow (socket pointers are word aligned)
				232	*/
				233	sk = (struct sock *)(skb_get_rxhash(skb) \| 1L);
				234	}
				235
				236	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
				237
				238	if (q->flows >= (2U << q->fq_trees_log) &&
				239	q->inactive_flows > q->flows/2)
				240	fq_gc(q, root, sk);
				241
				242	p = &root->rb_node;
				243	parent = NULL;
				244	while (*p) {
				245	parent = *p;
				246
				247	f = container_of(parent, struct fq_flow, fq_node);
				248	if (f->sk == sk) {
				249	/* socket might have been reallocated, so check
				250	* if its sk_hash is the same.
				251	* It not, we need to refill credit with
				252	* initial quantum
				253	*/
				254	if (unlikely(skb->sk &&
				255	f->socket_hash != sk->sk_hash)) {
				256	f->credit = q->initial_quantum;
				257	f->socket_hash = sk->sk_hash;
				258	}
				259	return f;
				260	}
				261	if (f->sk > sk)
				262	p = &parent->rb_right;
				263	else
				264	p = &parent->rb_left;
				265	}
				266
				267	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC \| __GFP_NOWARN);
				268	if (unlikely(!f)) {
				269	q->stat_allocation_errors++;
				270	return &q->internal;
				271	}
				272	fq_flow_set_detached(f);
				273	f->sk = sk;
				274	if (skb->sk)
				275	f->socket_hash = sk->sk_hash;
				276	f->credit = q->initial_quantum;
				277
				278	rb_link_node(&f->fq_node, parent, p);
				279	rb_insert_color(&f->fq_node, root);
				280
				281	q->flows++;
				282	q->inactive_flows++;
				283	return f;
				284	}
				285
				286
				287	/* remove one skb from head of flow queue */
				288	static struct sk_buff fq_dequeue_head(struct fq_flow flow)
				289	{
				290	struct sk_buff *skb = flow->head;
				291
				292	if (skb) {
				293	flow->head = skb->next;
				294	skb->next = NULL;
				295	flow->qlen--;
				296	}
				297	return skb;
				298	}
				299
				300	/* We might add in the future detection of retransmits
				301	* For the time being, just return false
				302	*/
				303	static bool skb_is_retransmit(struct sk_buff *skb)
				304	{
				305	return false;
				306	}
				307
				308	/* add skb to flow queue
				309	* flow queue is a linked list, kind of FIFO, except for TCP retransmits
				310	* We special case tcp retransmits to be transmitted before other packets.
				311	* We rely on fact that TCP retransmits are unlikely, so we do not waste
				312	* a separate queue or a pointer.
				313	* head-> [retrans pkt 1]
				314	* [retrans pkt 2]
				315	* [ normal pkt 1]
				316	* [ normal pkt 2]
				317	* [ normal pkt 3]
				318	* tail-> [ normal pkt 4]
				319	*/
				320	static void flow_queue_add(struct fq_flow flow, struct sk_buff skb)
				321	{
				322	struct sk_buff prev, head = flow->head;
				323
				324	skb->next = NULL;
				325	if (!head) {
				326	flow->head = skb;
				327	flow->tail = skb;
				328	return;
				329	}
				330	if (likely(!skb_is_retransmit(skb))) {
				331	flow->tail->next = skb;
				332	flow->tail = skb;
				333	return;
				334	}
				335
				336	/* This skb is a tcp retransmit,
				337	* find the last retrans packet in the queue
				338	*/
				339	prev = NULL;
				340	while (skb_is_retransmit(head)) {
				341	prev = head;
				342	head = head->next;
				343	if (!head)
				344	break;
				345	}
				346	if (!prev) { /* no rtx packet in queue, become the new head */
				347	skb->next = flow->head;
				348	flow->head = skb;
				349	} else {
				350	if (prev == flow->tail)
				351	flow->tail = skb;
				352	else
				353	skb->next = prev->next;
				354	prev->next = skb;
				355	}
				356	}
				357
				358	static int fq_enqueue(struct sk_buff skb, struct Qdisc sch)
				359	{
				360	struct fq_sched_data *q = qdisc_priv(sch);
				361	struct fq_flow *f;
				362
				363	if (unlikely(sch->q.qlen >= sch->limit))
				364	return qdisc_drop(skb, sch);
				365
				366	f = fq_classify(skb, q);
				367	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
				368	q->stat_flows_plimit++;
				369	return qdisc_drop(skb, sch);
				370	}
				371
				372	f->qlen++;
				373	flow_queue_add(f, skb);
				374	if (skb_is_retransmit(skb))
				375	q->stat_tcp_retrans++;
				376	sch->qstats.backlog += qdisc_pkt_len(skb);
				377	if (fq_flow_is_detached(f)) {
				378	fq_flow_add_tail(&q->new_flows, f);
				379	if (q->quantum > f->credit)
				380	f->credit = q->quantum;
				381	q->inactive_flows--;
				382	qdisc_unthrottled(sch);
				383	}
				384	if (unlikely(f == &q->internal)) {
				385	q->stat_internal_packets++;
				386	qdisc_unthrottled(sch);
				387	}
				388	sch->q.qlen++;
				389
				390	return NET_XMIT_SUCCESS;
				391	}
				392
				393	static void fq_check_throttled(struct fq_sched_data *q, u64 now)
				394	{
				395	struct rb_node *p;
				396
				397	if (q->time_next_delayed_flow > now)
				398	return;
				399
				400	q->time_next_delayed_flow = ~0ULL;
				401	while ((p = rb_first(&q->delayed)) != NULL) {
				402	struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
				403
				404	if (f->time_next_packet > now) {
				405	q->time_next_delayed_flow = f->time_next_packet;
				406	break;
				407	}
				408	rb_erase(p, &q->delayed);
				409	q->throttled_flows--;
				410	fq_flow_add_tail(&q->old_flows, f);
				411	}
				412	}
				413
				414	static struct sk_buff fq_dequeue(struct Qdisc sch)
				415	{
				416	struct fq_sched_data *q = qdisc_priv(sch);
				417	u64 now = ktime_to_ns(ktime_get());
				418	struct fq_flow_head *head;
				419	struct sk_buff *skb;
				420	struct fq_flow *f;
				421
				422	skb = fq_dequeue_head(&q->internal);
				423	if (skb)
				424	goto out;
				425	fq_check_throttled(q, now);
				426	begin:
				427	head = &q->new_flows;
				428	if (!head->first) {
				429	head = &q->old_flows;
				430	if (!head->first) {
				431	if (q->time_next_delayed_flow != ~0ULL)
				432	qdisc_watchdog_schedule_ns(&q->watchdog,
				433	q->time_next_delayed_flow);
				434	return NULL;
				435	}
				436	}
				437	f = head->first;
				438
				439	if (f->credit <= 0) {
				440	f->credit += q->quantum;
				441	head->first = f->next;
				442	fq_flow_add_tail(&q->old_flows, f);
				443	goto begin;
				444	}
				445
				446	if (unlikely(f->head && now < f->time_next_packet)) {
				447	head->first = f->next;
				448	fq_flow_set_throttled(q, f);
				449	goto begin;
				450	}
				451
				452	skb = fq_dequeue_head(f);
				453	if (!skb) {
				454	head->first = f->next;
				455	/* force a pass through old_flows to prevent starvation */
				456	if ((head == &q->new_flows) && q->old_flows.first) {
				457	fq_flow_add_tail(&q->old_flows, f);
				458	} else {
				459	fq_flow_set_detached(f);
				460	f->age = jiffies;
				461	q->inactive_flows++;
				462	}
				463	goto begin;
				464	}
Eric Dumazet	08f89b9	2013-08-30 09:46:43 -0700	[diff] [blame]	465	prefetch(&skb->end);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	466	f->time_next_packet = now;
				467	f->credit -= qdisc_pkt_len(skb);
				468
				469	if (f->credit <= 0 &&
				470	q->rate_enable &&
				471	skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
				472	u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
				473
				474	rate = min(rate, q->flow_max_rate);
				475	if (rate) {
				476	u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC;
				477
				478	do_div(len, rate);
				479	/* Since socket rate can change later,
				480	* clamp the delay to 125 ms.
				481	* TODO: maybe segment the too big skb, as in commit
				482	* e43ac79a4bc ("sch_tbf: segment too big GSO packets")
				483	*/
				484	if (unlikely(len > 125 * NSEC_PER_MSEC)) {
				485	len = 125 * NSEC_PER_MSEC;
				486	q->stat_pkts_too_long++;
				487	}
				488
				489	f->time_next_packet = now + len;
				490	}
				491	}
				492	out:
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	493	sch->qstats.backlog -= qdisc_pkt_len(skb);
				494	qdisc_bstats_update(sch, skb);
				495	sch->q.qlen--;
				496	qdisc_unthrottled(sch);
				497	return skb;
				498	}
				499
				500	static void fq_reset(struct Qdisc *sch)
				501	{
				502	struct sk_buff *skb;
				503
				504	while ((skb = fq_dequeue(sch)) != NULL)
				505	kfree_skb(skb);
				506	}
				507
				508	static void fq_rehash(struct fq_sched_data *q,
				509	struct rb_root *old_array, u32 old_log,
				510	struct rb_root *new_array, u32 new_log)
				511	{
				512	struct rb_node op, np, parent;
				513	struct rb_root oroot, nroot;
				514	struct fq_flow of, nf;
				515	int fcnt = 0;
				516	u32 idx;
				517
				518	for (idx = 0; idx < (1U << old_log); idx++) {
				519	oroot = &old_array[idx];
				520	while ((op = rb_first(oroot)) != NULL) {
				521	rb_erase(op, oroot);
				522	of = container_of(op, struct fq_flow, fq_node);
				523	if (fq_gc_candidate(of)) {
				524	fcnt++;
				525	kmem_cache_free(fq_flow_cachep, of);
				526	continue;
				527	}
				528	nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
				529
				530	np = &nroot->rb_node;
				531	parent = NULL;
				532	while (*np) {
				533	parent = *np;
				534
				535	nf = container_of(parent, struct fq_flow, fq_node);
				536	BUG_ON(nf->sk == of->sk);
				537
				538	if (nf->sk > of->sk)
				539	np = &parent->rb_right;
				540	else
				541	np = &parent->rb_left;
				542	}
				543
				544	rb_link_node(&of->fq_node, parent, np);
				545	rb_insert_color(&of->fq_node, nroot);
				546	}
				547	}
				548	q->flows -= fcnt;
				549	q->inactive_flows -= fcnt;
				550	q->stat_gc_flows += fcnt;
				551	}
				552
				553	static int fq_resize(struct fq_sched_data *q, u32 log)
				554	{
				555	struct rb_root *array;
				556	u32 idx;
				557
				558	if (q->fq_root && log == q->fq_trees_log)
				559	return 0;
				560
				561	array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
				562	if (!array)
				563	return -ENOMEM;
				564
				565	for (idx = 0; idx < (1U << log); idx++)
				566	array[idx] = RB_ROOT;
				567
				568	if (q->fq_root) {
				569	fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
				570	kfree(q->fq_root);
				571	}
				572	q->fq_root = array;
				573	q->fq_trees_log = log;
				574
				575	return 0;
				576	}
				577
				578	static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
				579	[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
				580	[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
				581	[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
				582	[TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
				583	[TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
				584	[TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
				585	[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
				586	[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
				587	};
				588
				589	static int fq_change(struct Qdisc sch, struct nlattr opt)
				590	{
				591	struct fq_sched_data *q = qdisc_priv(sch);
				592	struct nlattr *tb[TCA_FQ_MAX + 1];
				593	int err, drop_count = 0;
				594	u32 fq_log;
				595
				596	if (!opt)
				597	return -EINVAL;
				598
				599	err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
				600	if (err < 0)
				601	return err;
				602
				603	sch_tree_lock(sch);
				604
				605	fq_log = q->fq_trees_log;
				606
				607	if (tb[TCA_FQ_BUCKETS_LOG]) {
				608	u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
				609
				610	if (nval >= 1 && nval <= ilog2(256*1024))
				611	fq_log = nval;
				612	else
				613	err = -EINVAL;
				614	}
				615	if (tb[TCA_FQ_PLIMIT])
				616	sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
				617
				618	if (tb[TCA_FQ_FLOW_PLIMIT])
				619	q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
				620
				621	if (tb[TCA_FQ_QUANTUM])
				622	q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
				623
				624	if (tb[TCA_FQ_INITIAL_QUANTUM])
				625	q->quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
				626
				627	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
				628	q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);
				629
				630	if (tb[TCA_FQ_FLOW_MAX_RATE])
				631	q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
				632
				633	if (tb[TCA_FQ_RATE_ENABLE]) {
				634	u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
				635
				636	if (enable <= 1)
				637	q->rate_enable = enable;
				638	else
				639	err = -EINVAL;
				640	}
				641
				642	if (!err)
				643	err = fq_resize(q, fq_log);
				644
				645	while (sch->q.qlen > sch->limit) {
				646	struct sk_buff *skb = fq_dequeue(sch);
				647
				648	kfree_skb(skb);
				649	drop_count++;
				650	}
				651	qdisc_tree_decrease_qlen(sch, drop_count);
				652
				653	sch_tree_unlock(sch);
				654	return err;
				655	}
				656
				657	static void fq_destroy(struct Qdisc *sch)
				658	{
				659	struct fq_sched_data *q = qdisc_priv(sch);
				660	struct rb_root *root;
				661	struct rb_node *p;
				662	unsigned int idx;
				663
				664	if (q->fq_root) {
				665	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
				666	root = &q->fq_root[idx];
				667	while ((p = rb_first(root)) != NULL) {
				668	rb_erase(p, root);
				669	kmem_cache_free(fq_flow_cachep,
				670	container_of(p, struct fq_flow, fq_node));
				671	}
				672	}
				673	kfree(q->fq_root);
				674	}
				675	qdisc_watchdog_cancel(&q->watchdog);
				676	}
				677
				678	static int fq_init(struct Qdisc sch, struct nlattr opt)
				679	{
				680	struct fq_sched_data *q = qdisc_priv(sch);
				681	int err;
				682
				683	sch->limit = 10000;
				684	q->flow_plimit = 100;
				685	q->quantum = 2 * psched_mtu(qdisc_dev(sch));
				686	q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
				687	q->flow_default_rate = 0;
				688	q->flow_max_rate = ~0U;
				689	q->rate_enable = 1;
				690	q->new_flows.first = NULL;
				691	q->old_flows.first = NULL;
				692	q->delayed = RB_ROOT;
				693	q->fq_root = NULL;
				694	q->fq_trees_log = ilog2(1024);
				695	qdisc_watchdog_init(&q->watchdog, sch);
				696
				697	if (opt)
				698	err = fq_change(sch, opt);
				699	else
				700	err = fq_resize(q, q->fq_trees_log);
				701
				702	return err;
				703	}
				704
				705	static int fq_dump(struct Qdisc sch, struct sk_buff skb)
				706	{
				707	struct fq_sched_data *q = qdisc_priv(sch);
				708	struct nlattr *opts;
				709
				710	opts = nla_nest_start(skb, TCA_OPTIONS);
				711	if (opts == NULL)
				712	goto nla_put_failure;
				713
				714	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) \|\|
				715	nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) \|\|
				716	nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) \|\|
				717	nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) \|\|
				718	nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) \|\|
				719	nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) \|\|
				720	nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) \|\|
				721	nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
				722	goto nla_put_failure;
				723
				724	nla_nest_end(skb, opts);
				725	return skb->len;
				726
				727	nla_put_failure:
				728	return -1;
				729	}
				730
				731	static int fq_dump_stats(struct Qdisc sch, struct gnet_dump d)
				732	{
				733	struct fq_sched_data *q = qdisc_priv(sch);
				734	u64 now = ktime_to_ns(ktime_get());
				735	struct tc_fq_qd_stats st = {
				736	.gc_flows = q->stat_gc_flows,
				737	.highprio_packets = q->stat_internal_packets,
				738	.tcp_retrans = q->stat_tcp_retrans,
				739	.throttled = q->stat_throttled,
				740	.flows_plimit = q->stat_flows_plimit,
				741	.pkts_too_long = q->stat_pkts_too_long,
				742	.allocation_errors = q->stat_allocation_errors,
				743	.flows = q->flows,
				744	.inactive_flows = q->inactive_flows,
				745	.throttled_flows = q->throttled_flows,
				746	.time_next_delayed_flow = q->time_next_delayed_flow - now,
				747	};
				748
				749	return gnet_stats_copy_app(d, &st, sizeof(st));
				750	}
				751
				752	static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
				753	.id = "fq",
				754	.priv_size = sizeof(struct fq_sched_data),
				755
				756	.enqueue = fq_enqueue,
				757	.dequeue = fq_dequeue,
				758	.peek = qdisc_peek_dequeued,
				759	.init = fq_init,
				760	.reset = fq_reset,
				761	.destroy = fq_destroy,
				762	.change = fq_change,
				763	.dump = fq_dump,
				764	.dump_stats = fq_dump_stats,
				765	.owner = THIS_MODULE,
				766	};
				767
				768	static int __init fq_module_init(void)
				769	{
				770	int ret;
				771
				772	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
				773	sizeof(struct fq_flow),
				774	0, 0, NULL);
				775	if (!fq_flow_cachep)
				776	return -ENOMEM;
				777
				778	ret = register_qdisc(&fq_qdisc_ops);
				779	if (ret)
				780	kmem_cache_destroy(fq_flow_cachep);
				781	return ret;
				782	}
				783
				784	static void __exit fq_module_exit(void)
				785	{
				786	unregister_qdisc(&fq_qdisc_ops);
				787	kmem_cache_destroy(fq_flow_cachep);
				788	}
				789
				790	module_init(fq_module_init)
				791	module_exit(fq_module_exit)
				792	MODULE_AUTHOR("Eric Dumazet");
				793	MODULE_LICENSE("GPL");