/*
 * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
 *
 * Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Meant to be used mostly for locally generated traffic :
 * Fast classification depends on skb->sk being set before reaching us.
 * If not (router workload), we use rxhash as fallback, with a 32 bit wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees.
 * They are also part of one of the Round Robin 'queues' (new or old flows).
 *
 * Burst avoidance (aka pacing) capability :
 *
 * Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect rate limitation.
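 *
 * For instance (illustrative numbers only) : at a pacing rate of 1 MByte/sec,
 * a 1500 byte packet leaves roughly a 1.5 ms gap before the next packet of
 * the same flow is allowed to leave the qdisc.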
 *
 * enqueue() :
 *   - lookup one RB tree (out of 1024 or more) to find the flow.
 *     If the flow does not exist, create it and add it to the tree.
 *     Add the skb to the per flow list of skbs (fifo).
 *   - Use a special fifo for high prio packets
 *
 * dequeue() : serves flows in Round Robin
 * Note : When a flow becomes empty, we do not immediately remove it from
 * rb trees, for performance reasons (it's expected to send additional packets,
 * or the SLAB cache will reuse the socket for another flow)
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>

/*
 * Per flow structure, dynamically allocated
 */
struct fq_flow {
	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
		unsigned long  age;	/* jiffies when flow was emptied, for gc */
	};
	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
	struct sock	*sk;
	int		qlen;		/* number of packets in flow queue */
	int		credit;
	u32		socket_hash;	/* sk_hash */
	struct fq_flow *next;		/* next pointer in RR lists, or &detached */

	struct rb_node	rate_node;	/* anchor in q->delayed tree */
	u64		time_next_packet;
};

struct fq_flow_head {
	struct fq_flow *first;
	struct fq_flow *last;
};

struct fq_sched_data {
	struct fq_flow_head new_flows;

	struct fq_flow_head old_flows;

	struct rb_root	delayed;	/* for rate limited flows */
	u64		time_next_delayed_flow;

	struct fq_flow	internal;	/* for non classified or high prio packets */
	u32		quantum;
	u32		initial_quantum;
	u32		flow_default_rate;	/* rate per flow : bytes per second */
	u32		flow_max_rate;		/* optional max rate per flow */
	u32		flow_plimit;		/* max packets per flow */
	struct rb_root	*fq_root;
	u8		rate_enable;
	u8		fq_trees_log;

	u32		flows;
	u32		inactive_flows;
	u32		throttled_flows;

	u64		stat_gc_flows;
	u64		stat_internal_packets;
	u64		stat_tcp_retrans;
	u64		stat_throttled;
	u64		stat_flows_plimit;
	u64		stat_pkts_too_long;
	u64		stat_allocation_errors;
	struct qdisc_watchdog watchdog;
};
/* Special values used in f->next to mark a detached flow (not on old/new
 * lists) or a throttled one (anchored in the q->delayed rbtree).
 */
static struct fq_flow detached, throttled;

static void fq_flow_set_detached(struct fq_flow *f)
{
	f->next = &detached;
}

static bool fq_flow_is_detached(const struct fq_flow *f)
{
	return f->next == &detached;
}

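/* Park a flow in the q->delayed rbtree, ordered by time_next_packet
 * (earliest deadline leftmost), and keep track of the soonest wakeup time.
 */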
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;

	while (*p) {
		struct fq_flow *aux;

		parent = *p;
		aux = container_of(parent, struct fq_flow, rate_node);
		if (f->time_next_packet >= aux->time_next_packet)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&f->rate_node, parent, p);
	rb_insert_color(&f->rate_node, &q->delayed);
	q->throttled_flows++;
	q->stat_throttled++;

	f->next = &throttled;
	if (q->time_next_delayed_flow > f->time_next_packet)
		q->time_next_delayed_flow = f->time_next_packet;
}


static struct kmem_cache *fq_flow_cachep __read_mostly;

static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
{
	if (head->first)
		head->last->next = flow;
	else
		head->first = flow;
	head->last = flow;
	flow->next = NULL;
}

/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)

static bool fq_gc_candidate(const struct fq_flow *f)
{
	return fq_flow_is_detached(f) &&
	       time_after(jiffies, f->age + FQ_GC_AGE);
}

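/* Garbage collect : while walking the hash bucket toward sk, free up to
 * FQ_GC_MAX flows that have been detached for more than FQ_GC_AGE, so the
 * number of allocated flows stays bounded.
 */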
static void fq_gc(struct fq_sched_data *q,
		  struct rb_root *root,
		  struct sock *sk)
{
	struct fq_flow *f, *tofree[FQ_GC_MAX];
	struct rb_node **p, *parent;
	int fcnt = 0;

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = container_of(parent, struct fq_flow, fq_node);
		if (f->sk == sk)
			break;

		if (fq_gc_candidate(f)) {
			tofree[fcnt++] = f;
			if (fcnt == FQ_GC_MAX)
				break;
		}

		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
	while (fcnt) {
		struct fq_flow *f = tofree[--fcnt];

		rb_erase(&f->fq_node, root);
		kmem_cache_free(fq_flow_cachep, f);
	}
}

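/* Same priority to band mapping as pfifo_fast. Only band 0 matters here :
 * those packets bypass per-flow queueing and pacing via q->internal.
 */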
static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};

static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
{
	struct rb_node **p, *parent;
	struct sock *sk = skb->sk;
	struct rb_root *root;
	struct fq_flow *f;
	int band;

	/* warning: no starvation prevention... */
	band = prio2band[skb->priority & TC_PRIO_MAX];
	if (unlikely(band == 0))
		return &q->internal;

	if (unlikely(!sk)) {
		/* By forcing low order bit to 1, we make sure to not
		 * collide with a local flow (socket pointers are word aligned)
		 */
		sk = (struct sock *)(skb_get_rxhash(skb) | 1L);
	}

	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];

	if (q->flows >= (2U << q->fq_trees_log) &&
	    q->inactive_flows > q->flows/2)
		fq_gc(q, root, sk);

	p = &root->rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;

		f = container_of(parent, struct fq_flow, fq_node);
		if (f->sk == sk) {
			/* socket might have been reallocated, so check
			 * if its sk_hash is the same.
			 * If not, we need to refill credit with
			 * initial quantum
			 */
			if (unlikely(skb->sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
			}
			return f;
		}
		if (f->sk > sk)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}

	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!f)) {
		q->stat_allocation_errors++;
		return &q->internal;
	}
	fq_flow_set_detached(f);
	f->sk = sk;
	if (skb->sk)
		f->socket_hash = sk->sk_hash;
	f->credit = q->initial_quantum;

	rb_link_node(&f->fq_node, parent, p);
	rb_insert_color(&f->fq_node, root);

	q->flows++;
	q->inactive_flows++;
	return f;
}


/* remove one skb from head of flow queue */
static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
{
	struct sk_buff *skb = flow->head;

	if (skb) {
		flow->head = skb->next;
		skb->next = NULL;
		flow->qlen--;
	}
	return skb;
}

/* We might add detection of retransmits in the future.
 * For the time being, just return false.
 */
static bool skb_is_retransmit(struct sk_buff *skb)
{
	return false;
}

/* add skb to flow queue
 * flow queue is a linked list, kind of FIFO, except for TCP retransmits
 * We special case tcp retransmits to be transmitted before other packets.
 * We rely on the fact that TCP retransmits are unlikely, so we do not waste
 * a separate queue or a pointer.
 * head->  [retrans pkt 1]
 *         [retrans pkt 2]
 *         [ normal pkt 1]
 *         [ normal pkt 2]
 *         [ normal pkt 3]
 * tail->  [ normal pkt 4]
 */
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
	struct sk_buff *prev, *head = flow->head;

	skb->next = NULL;
	if (!head) {
		flow->head = skb;
		flow->tail = skb;
		return;
	}
	if (likely(!skb_is_retransmit(skb))) {
		flow->tail->next = skb;
		flow->tail = skb;
		return;
	}

	/* This skb is a tcp retransmit,
	 * find the last retrans packet in the queue
	 */
	prev = NULL;
	while (skb_is_retransmit(head)) {
		prev = head;
		head = head->next;
		if (!head)
			break;
	}
	if (!prev) { /* no rtx packet in queue, become the new head */
		skb->next = flow->head;
		flow->head = skb;
	} else {
		if (prev == flow->tail)
			flow->tail = skb;
		else
			skb->next = prev->next;
		prev->next = skb;
	}
}

static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct fq_flow *f;

	if (unlikely(sch->q.qlen >= sch->limit))
		return qdisc_drop(skb, sch);

	f = fq_classify(skb, q);
	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
		q->stat_flows_plimit++;
		return qdisc_drop(skb, sch);
	}

	f->qlen++;
	flow_queue_add(f, skb);
	if (skb_is_retransmit(skb))
		q->stat_tcp_retrans++;
	sch->qstats.backlog += qdisc_pkt_len(skb);
	if (fq_flow_is_detached(f)) {
		fq_flow_add_tail(&q->new_flows, f);
		if (q->quantum > f->credit)
			f->credit = q->quantum;
		q->inactive_flows--;
		qdisc_unthrottled(sch);
	}
	if (unlikely(f == &q->internal)) {
		q->stat_internal_packets++;
		qdisc_unthrottled(sch);
	}
	sch->q.qlen++;

	return NET_XMIT_SUCCESS;
}

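/* Move flows whose pacing deadline has passed from the delayed rbtree back
 * to the old_flows list, and remember the next deadline so dequeue() can arm
 * the qdisc watchdog.
 */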
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
	struct rb_node *p;

	if (q->time_next_delayed_flow > now)
		return;

	q->time_next_delayed_flow = ~0ULL;
	while ((p = rb_first(&q->delayed)) != NULL) {
		struct fq_flow *f = container_of(p, struct fq_flow, rate_node);

		if (f->time_next_packet > now) {
			q->time_next_delayed_flow = f->time_next_packet;
			break;
		}
		rb_erase(p, &q->delayed);
		q->throttled_flows--;
		fq_flow_add_tail(&q->old_flows, f);
	}
}

static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	u64 now = ktime_to_ns(ktime_get());
	struct fq_flow_head *head;
	struct sk_buff *skb;
	struct fq_flow *f;

	skb = fq_dequeue_head(&q->internal);
	if (skb)
		goto out;
	fq_check_throttled(q, now);
begin:
	head = &q->new_flows;
	if (!head->first) {
		head = &q->old_flows;
		if (!head->first) {
			if (q->time_next_delayed_flow != ~0ULL)
				qdisc_watchdog_schedule_ns(&q->watchdog,
							   q->time_next_delayed_flow);
			return NULL;
		}
	}
	f = head->first;

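	/* Deficit Round Robin : a flow that has exhausted its credit is
	 * refilled with one quantum and rotated to the tail of old_flows.
	 */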
	if (f->credit <= 0) {
		f->credit += q->quantum;
		head->first = f->next;
		fq_flow_add_tail(&q->old_flows, f);
		goto begin;
	}

	if (unlikely(f->head && now < f->time_next_packet)) {
		head->first = f->next;
		fq_flow_set_throttled(q, f);
		goto begin;
	}

	skb = fq_dequeue_head(f);
	if (!skb) {
		head->first = f->next;
		/* force a pass through old_flows to prevent starvation */
		if ((head == &q->new_flows) && q->old_flows.first) {
			fq_flow_add_tail(&q->old_flows, f);
		} else {
			fq_flow_set_detached(f);
			f->age = jiffies;
			q->inactive_flows++;
		}
		goto begin;
	}
	f->time_next_packet = now;
	f->credit -= qdisc_pkt_len(skb);

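	/* Pacing : once the credit is consumed, compute when this flow may
	 * send again :
	 * delay (ns) = packet length (bytes) * NSEC_PER_SEC / rate (bytes/sec)
	 */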
	if (f->credit <= 0 &&
	    q->rate_enable &&
	    skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
		u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;

		rate = min(rate, q->flow_max_rate);
		if (rate) {
			u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC;

			do_div(len, rate);
			/* Since socket rate can change later,
			 * clamp the delay to 125 ms.
			 * TODO: maybe segment the too big skb, as in commit
			 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
			 */
			if (unlikely(len > 125 * NSEC_PER_MSEC)) {
				len = 125 * NSEC_PER_MSEC;
				q->stat_pkts_too_long++;
			}

			f->time_next_packet = now + len;
		}
	}
out:
	prefetch(&skb->end);
	sch->qstats.backlog -= qdisc_pkt_len(skb);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_unthrottled(sch);
	return skb;
}

static void fq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = fq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

static void fq_rehash(struct fq_sched_data *q,
		      struct rb_root *old_array, u32 old_log,
		      struct rb_root *new_array, u32 new_log)
{
	struct rb_node *op, **np, *parent;
	struct rb_root *oroot, *nroot;
	struct fq_flow *of, *nf;
	int fcnt = 0;
	u32 idx;

	for (idx = 0; idx < (1U << old_log); idx++) {
		oroot = &old_array[idx];
		while ((op = rb_first(oroot)) != NULL) {
			rb_erase(op, oroot);
			of = container_of(op, struct fq_flow, fq_node);
			if (fq_gc_candidate(of)) {
				fcnt++;
				kmem_cache_free(fq_flow_cachep, of);
				continue;
			}
			nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];

			np = &nroot->rb_node;
			parent = NULL;
			while (*np) {
				parent = *np;

				nf = container_of(parent, struct fq_flow, fq_node);
				BUG_ON(nf->sk == of->sk);

				if (nf->sk > of->sk)
					np = &parent->rb_right;
				else
					np = &parent->rb_left;
			}

			rb_link_node(&of->fq_node, parent, np);
			rb_insert_color(&of->fq_node, nroot);
		}
	}
	q->flows -= fcnt;
	q->inactive_flows -= fcnt;
	q->stat_gc_flows += fcnt;
}

static int fq_resize(struct fq_sched_data *q, u32 log)
{
	struct rb_root *array;
	u32 idx;

	if (q->fq_root && log == q->fq_trees_log)
		return 0;

	array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
	if (!array)
		return -ENOMEM;

	for (idx = 0; idx < (1U << log); idx++)
		array[idx] = RB_ROOT;

	if (q->fq_root) {
		fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
		kfree(q->fq_root);
	}
	q->fq_root = array;
	q->fq_trees_log = log;

	return 0;
}

static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
	[TCA_FQ_INITIAL_QUANTUM]	= { .type = NLA_U32 },
	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
};
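
/* These attributes are what iproute2's tc fills in from the command line.
 * A typical setup (option names per tc-fq; verify against your iproute2
 * version) might look like :
 *
 *   tc qdisc add dev eth0 root fq limit 10000 flow_limit 100 \
 *      quantum 3028 initial_quantum 15140 maxrate 10mbit
 */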

static int fq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_FQ_MAX + 1];
	int err, drop_count = 0;
	u32 fq_log;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	fq_log = q->fq_trees_log;

	if (tb[TCA_FQ_BUCKETS_LOG]) {
		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);

		if (nval >= 1 && nval <= ilog2(256*1024))
			fq_log = nval;
		else
			err = -EINVAL;
	}
	if (tb[TCA_FQ_PLIMIT])
		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);

	if (tb[TCA_FQ_FLOW_PLIMIT])
		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);

	if (tb[TCA_FQ_QUANTUM])
		q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);

	if (tb[TCA_FQ_INITIAL_QUANTUM])
		q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);

	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
		q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);

	if (tb[TCA_FQ_FLOW_MAX_RATE])
		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);

	if (tb[TCA_FQ_RATE_ENABLE]) {
		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);

		if (enable <= 1)
			q->rate_enable = enable;
		else
			err = -EINVAL;
	}

	if (!err)
		err = fq_resize(q, fq_log);

	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = fq_dequeue(sch);

		kfree_skb(skb);
		drop_count++;
	}
	qdisc_tree_decrease_qlen(sch, drop_count);

	sch_tree_unlock(sch);
	return err;
}

static void fq_destroy(struct Qdisc *sch)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct rb_root *root;
	struct rb_node *p;
	unsigned int idx;

	if (q->fq_root) {
		for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
			root = &q->fq_root[idx];
			while ((p = rb_first(root)) != NULL) {
				rb_erase(p, root);
				kmem_cache_free(fq_flow_cachep,
						container_of(p, struct fq_flow, fq_node));
			}
		}
		kfree(q->fq_root);
	}
	qdisc_watchdog_cancel(&q->watchdog);
}

static int fq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	int err;

	sch->limit		= 10000;
	q->flow_plimit		= 100;
	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
	q->flow_default_rate	= 0;
	q->flow_max_rate	= ~0U;
	q->rate_enable		= 1;
	q->new_flows.first	= NULL;
	q->old_flows.first	= NULL;
	q->delayed		= RB_ROOT;
	q->fq_root		= NULL;
	q->fq_trees_log		= ilog2(1024);
	qdisc_watchdog_init(&q->watchdog, sch);

	if (opt)
		err = fq_change(sch, opt);
	else
		err = fq_resize(q, q->fq_trees_log);

	return err;
}

static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) ||
	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
		goto nla_put_failure;

	nla_nest_end(skb, opts);
	return skb->len;

nla_put_failure:
	return -1;
}

static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct fq_sched_data *q = qdisc_priv(sch);
	u64 now = ktime_to_ns(ktime_get());
	struct tc_fq_qd_stats st = {
		.gc_flows		= q->stat_gc_flows,
		.highprio_packets	= q->stat_internal_packets,
		.tcp_retrans		= q->stat_tcp_retrans,
		.throttled		= q->stat_throttled,
		.flows_plimit		= q->stat_flows_plimit,
		.pkts_too_long		= q->stat_pkts_too_long,
		.allocation_errors	= q->stat_allocation_errors,
		.flows			= q->flows,
		.inactive_flows		= q->inactive_flows,
		.throttled_flows	= q->throttled_flows,
		.time_next_delayed_flow	= q->time_next_delayed_flow - now,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
	.id		= "fq",
	.priv_size	= sizeof(struct fq_sched_data),

	.enqueue	= fq_enqueue,
	.dequeue	= fq_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= fq_init,
	.reset		= fq_reset,
	.destroy	= fq_destroy,
	.change		= fq_change,
	.dump		= fq_dump,
	.dump_stats	= fq_dump_stats,
	.owner		= THIS_MODULE,
};

static int __init fq_module_init(void)
{
	int ret;

	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
					   sizeof(struct fq_flow),
					   0, 0, NULL);
	if (!fq_flow_cachep)
		return -ENOMEM;

	ret = register_qdisc(&fq_qdisc_ops);
	if (ret)
		kmem_cache_destroy(fq_flow_cachep);
	return ret;
}

static void __exit fq_module_exit(void)
{
	unregister_qdisc(&fq_qdisc_ops);
	kmem_cache_destroy(fq_flow_cachep);
}

module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");