Blame - net/sched/sch_fq.c - kernel/msm-4.19

blob: fdc041c5785360731154521fb3492dba825776ff [file] [log] [blame]

Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	1	/*
				2	* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
				3	*
				4	* Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version
				9	* 2 of the License, or (at your option) any later version.
				10	*
				11	* Meant to be mostly used for locally generated traffic :
				12	* Fast classification depends on skb->sk being set before reaching us.
				13	* If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
				14	* All packets belonging to a socket are considered as a 'flow'.
				15	*
				16	* Flows are dynamically allocated and stored in a hash table of RB trees
				17	* They are also part of one Round Robin 'queues' (new or old flows)
				18	*
				19	* Burst avoidance (aka pacing) capability :
				20	*
				21	* Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
				22	* bunch of packets, and this packet scheduler adds delay between
				23	* packets to respect rate limitation.
				24	*
				25	* enqueue() :
				26	* - lookup one RB tree (out of 1024 or more) to find the flow.
				27	* If non existent flow, create it, add it to the tree.
				28	* Add skb to the per flow list of skb (fifo).
				29	* - Use a special fifo for high prio packets
				30	*
				31	* dequeue() : serves flows in Round Robin
				32	* Note : When a flow becomes empty, we do not immediately remove it from
				33	* rb trees, for performance reasons (it's expected to send additional packets,
				34	* or SLAB cache will reuse socket for another flow)
				35	*/
				36
				37	#include <linux/module.h>
				38	#include <linux/types.h>
				39	#include <linux/kernel.h>
				40	#include <linux/jiffies.h>
				41	#include <linux/string.h>
				42	#include <linux/in.h>
				43	#include <linux/errno.h>
				44	#include <linux/init.h>
				45	#include <linux/skbuff.h>
				46	#include <linux/slab.h>
				47	#include <linux/rbtree.h>
				48	#include <linux/hash.h>
Eric Dumazet	08f89b9	2013-08-30 09:46:43 -0700	[diff] [blame]	49	#include <linux/prefetch.h>
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	50	#include <net/netlink.h>
				51	#include <net/pkt_sched.h>
				52	#include <net/sock.h>
				53	#include <net/tcp_states.h>
				54
				55	/*
				56	* Per flow structure, dynamically allocated
				57	*/
				58	struct fq_flow {
				59	struct sk_buff head; / list of skbs for this flow : first skb */
				60	union {
				61	struct sk_buff tail; / last skb in the list */
				62	unsigned long age; /* jiffies when flow was emptied, for gc */
				63	};
				64	struct rb_node fq_node; /* anchor in fq_root[] trees */
				65	struct sock *sk;
				66	int qlen; /* number of packets in flow queue */
				67	int credit;
				68	u32 socket_hash; /* sk_hash */
				69	struct fq_flow next; / next pointer in RR lists, or &detached */
				70
				71	struct rb_node rate_node; /* anchor in q->delayed tree */
				72	u64 time_next_packet;
				73	};
				74
				75	struct fq_flow_head {
				76	struct fq_flow *first;
				77	struct fq_flow *last;
				78	};
				79
				80	struct fq_sched_data {
				81	struct fq_flow_head new_flows;
				82
				83	struct fq_flow_head old_flows;
				84
				85	struct rb_root delayed; /* for rate limited flows */
				86	u64 time_next_delayed_flow;
				87
				88	struct fq_flow internal; /* for non classified or high prio packets */
				89	u32 quantum;
				90	u32 initial_quantum;
				91	u32 flow_default_rate;/* rate per flow : bytes per second */
				92	u32 flow_max_rate; /* optional max rate per flow */
				93	u32 flow_plimit; /* max packets per flow */
				94	struct rb_root *fq_root;
				95	u8 rate_enable;
				96	u8 fq_trees_log;
				97
				98	u32 flows;
				99	u32 inactive_flows;
				100	u32 throttled_flows;
				101
				102	u64 stat_gc_flows;
				103	u64 stat_internal_packets;
				104	u64 stat_tcp_retrans;
				105	u64 stat_throttled;
				106	u64 stat_flows_plimit;
				107	u64 stat_pkts_too_long;
				108	u64 stat_allocation_errors;
				109	struct qdisc_watchdog watchdog;
				110	};
				111
				112	/* special value to mark a detached flow (not on old/new list) */
				113	static struct fq_flow detached, throttled;
				114
				115	static void fq_flow_set_detached(struct fq_flow *f)
				116	{
				117	f->next = &detached;
				118	}
				119
				120	static bool fq_flow_is_detached(const struct fq_flow *f)
				121	{
				122	return f->next == &detached;
				123	}
				124
				125	static void fq_flow_set_throttled(struct fq_sched_data q, struct fq_flow f)
				126	{
				127	struct rb_node *p = &q->delayed.rb_node, parent = NULL;
				128
				129	while (*p) {
				130	struct fq_flow *aux;
				131
				132	parent = *p;
				133	aux = container_of(parent, struct fq_flow, rate_node);
				134	if (f->time_next_packet >= aux->time_next_packet)
				135	p = &parent->rb_right;
				136	else
				137	p = &parent->rb_left;
				138	}
				139	rb_link_node(&f->rate_node, parent, p);
				140	rb_insert_color(&f->rate_node, &q->delayed);
				141	q->throttled_flows++;
				142	q->stat_throttled++;
				143
				144	f->next = &throttled;
				145	if (q->time_next_delayed_flow > f->time_next_packet)
				146	q->time_next_delayed_flow = f->time_next_packet;
				147	}
				148
				149
				150	static struct kmem_cache *fq_flow_cachep __read_mostly;
				151
				152	static void fq_flow_add_tail(struct fq_flow_head head, struct fq_flow flow)
				153	{
				154	if (head->first)
				155	head->last->next = flow;
				156	else
				157	head->first = flow;
				158	head->last = flow;
				159	flow->next = NULL;
				160	}
				161
				162	/* limit number of collected flows per round */
				163	#define FQ_GC_MAX 8
				164	#define FQ_GC_AGE (3*HZ)
				165
				166	static bool fq_gc_candidate(const struct fq_flow *f)
				167	{
				168	return fq_flow_is_detached(f) &&
				169	time_after(jiffies, f->age + FQ_GC_AGE);
				170	}
				171
				172	static void fq_gc(struct fq_sched_data *q,
				173	struct rb_root *root,
				174	struct sock *sk)
				175	{
				176	struct fq_flow f, tofree[FQ_GC_MAX];
				177	struct rb_node *p, parent;
				178	int fcnt = 0;
				179
				180	p = &root->rb_node;
				181	parent = NULL;
				182	while (*p) {
				183	parent = *p;
				184
				185	f = container_of(parent, struct fq_flow, fq_node);
				186	if (f->sk == sk)
				187	break;
				188
				189	if (fq_gc_candidate(f)) {
				190	tofree[fcnt++] = f;
				191	if (fcnt == FQ_GC_MAX)
				192	break;
				193	}
				194
				195	if (f->sk > sk)
				196	p = &parent->rb_right;
				197	else
				198	p = &parent->rb_left;
				199	}
				200
				201	q->flows -= fcnt;
				202	q->inactive_flows -= fcnt;
				203	q->stat_gc_flows += fcnt;
				204	while (fcnt) {
				205	struct fq_flow *f = tofree[--fcnt];
				206
				207	rb_erase(&f->fq_node, root);
				208	kmem_cache_free(fq_flow_cachep, f);
				209	}
				210	}
				211
				212	static const u8 prio2band[TC_PRIO_MAX + 1] = {
				213	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
				214	};
				215
				216	static struct fq_flow fq_classify(struct sk_buff skb, struct fq_sched_data *q)
				217	{
				218	struct rb_node *p, parent;
				219	struct sock *sk = skb->sk;
				220	struct rb_root *root;
				221	struct fq_flow *f;
				222	int band;
				223
				224	/* warning: no starvation prevention... */
				225	band = prio2band[skb->priority & TC_PRIO_MAX];
				226	if (unlikely(band == 0))
				227	return &q->internal;
				228
				229	if (unlikely(!sk)) {
				230	/* By forcing low order bit to 1, we make sure to not
				231	* collide with a local flow (socket pointers are word aligned)
				232	*/
				233	sk = (struct sock *)(skb_get_rxhash(skb) \| 1L);
				234	}
				235
				236	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
				237
				238	if (q->flows >= (2U << q->fq_trees_log) &&
				239	q->inactive_flows > q->flows/2)
				240	fq_gc(q, root, sk);
				241
				242	p = &root->rb_node;
				243	parent = NULL;
				244	while (*p) {
				245	parent = *p;
				246
				247	f = container_of(parent, struct fq_flow, fq_node);
				248	if (f->sk == sk) {
				249	/* socket might have been reallocated, so check
				250	* if its sk_hash is the same.
				251	* It not, we need to refill credit with
				252	* initial quantum
				253	*/
				254	if (unlikely(skb->sk &&
				255	f->socket_hash != sk->sk_hash)) {
				256	f->credit = q->initial_quantum;
				257	f->socket_hash = sk->sk_hash;
Eric Dumazet	fc59d5b	2013-10-27 16:26:43 -0700	[diff] [blame^]	258	f->time_next_packet = 0ULL;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	259	}
				260	return f;
				261	}
				262	if (f->sk > sk)
				263	p = &parent->rb_right;
				264	else
				265	p = &parent->rb_left;
				266	}
				267
				268	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC \| __GFP_NOWARN);
				269	if (unlikely(!f)) {
				270	q->stat_allocation_errors++;
				271	return &q->internal;
				272	}
				273	fq_flow_set_detached(f);
				274	f->sk = sk;
				275	if (skb->sk)
				276	f->socket_hash = sk->sk_hash;
				277	f->credit = q->initial_quantum;
				278
				279	rb_link_node(&f->fq_node, parent, p);
				280	rb_insert_color(&f->fq_node, root);
				281
				282	q->flows++;
				283	q->inactive_flows++;
				284	return f;
				285	}
				286
				287
				288	/* remove one skb from head of flow queue */
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	289	static struct sk_buff fq_dequeue_head(struct Qdisc sch, struct fq_flow *flow)
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	290	{
				291	struct sk_buff *skb = flow->head;
				292
				293	if (skb) {
				294	flow->head = skb->next;
				295	skb->next = NULL;
				296	flow->qlen--;
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	297	sch->qstats.backlog -= qdisc_pkt_len(skb);
				298	sch->q.qlen--;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	299	}
				300	return skb;
				301	}
				302
				303	/* We might add in the future detection of retransmits
				304	* For the time being, just return false
				305	*/
				306	static bool skb_is_retransmit(struct sk_buff *skb)
				307	{
				308	return false;
				309	}
				310
				311	/* add skb to flow queue
				312	* flow queue is a linked list, kind of FIFO, except for TCP retransmits
				313	* We special case tcp retransmits to be transmitted before other packets.
				314	* We rely on fact that TCP retransmits are unlikely, so we do not waste
				315	* a separate queue or a pointer.
				316	* head-> [retrans pkt 1]
				317	* [retrans pkt 2]
				318	* [ normal pkt 1]
				319	* [ normal pkt 2]
				320	* [ normal pkt 3]
				321	* tail-> [ normal pkt 4]
				322	*/
				323	static void flow_queue_add(struct fq_flow flow, struct sk_buff skb)
				324	{
				325	struct sk_buff prev, head = flow->head;
				326
				327	skb->next = NULL;
				328	if (!head) {
				329	flow->head = skb;
				330	flow->tail = skb;
				331	return;
				332	}
				333	if (likely(!skb_is_retransmit(skb))) {
				334	flow->tail->next = skb;
				335	flow->tail = skb;
				336	return;
				337	}
				338
				339	/* This skb is a tcp retransmit,
				340	* find the last retrans packet in the queue
				341	*/
				342	prev = NULL;
				343	while (skb_is_retransmit(head)) {
				344	prev = head;
				345	head = head->next;
				346	if (!head)
				347	break;
				348	}
				349	if (!prev) { /* no rtx packet in queue, become the new head */
				350	skb->next = flow->head;
				351	flow->head = skb;
				352	} else {
				353	if (prev == flow->tail)
				354	flow->tail = skb;
				355	else
				356	skb->next = prev->next;
				357	prev->next = skb;
				358	}
				359	}
				360
				361	static int fq_enqueue(struct sk_buff skb, struct Qdisc sch)
				362	{
				363	struct fq_sched_data *q = qdisc_priv(sch);
				364	struct fq_flow *f;
				365
				366	if (unlikely(sch->q.qlen >= sch->limit))
				367	return qdisc_drop(skb, sch);
				368
				369	f = fq_classify(skb, q);
				370	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
				371	q->stat_flows_plimit++;
				372	return qdisc_drop(skb, sch);
				373	}
				374
				375	f->qlen++;
				376	flow_queue_add(f, skb);
				377	if (skb_is_retransmit(skb))
				378	q->stat_tcp_retrans++;
				379	sch->qstats.backlog += qdisc_pkt_len(skb);
				380	if (fq_flow_is_detached(f)) {
				381	fq_flow_add_tail(&q->new_flows, f);
				382	if (q->quantum > f->credit)
				383	f->credit = q->quantum;
				384	q->inactive_flows--;
				385	qdisc_unthrottled(sch);
				386	}
				387	if (unlikely(f == &q->internal)) {
				388	q->stat_internal_packets++;
				389	qdisc_unthrottled(sch);
				390	}
				391	sch->q.qlen++;
				392
				393	return NET_XMIT_SUCCESS;
				394	}
				395
				396	static void fq_check_throttled(struct fq_sched_data *q, u64 now)
				397	{
				398	struct rb_node *p;
				399
				400	if (q->time_next_delayed_flow > now)
				401	return;
				402
				403	q->time_next_delayed_flow = ~0ULL;
				404	while ((p = rb_first(&q->delayed)) != NULL) {
				405	struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
				406
				407	if (f->time_next_packet > now) {
				408	q->time_next_delayed_flow = f->time_next_packet;
				409	break;
				410	}
				411	rb_erase(p, &q->delayed);
				412	q->throttled_flows--;
				413	fq_flow_add_tail(&q->old_flows, f);
				414	}
				415	}
				416
				417	static struct sk_buff fq_dequeue(struct Qdisc sch)
				418	{
				419	struct fq_sched_data *q = qdisc_priv(sch);
				420	u64 now = ktime_to_ns(ktime_get());
				421	struct fq_flow_head *head;
				422	struct sk_buff *skb;
				423	struct fq_flow *f;
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	424	u32 rate;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	425
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	426	skb = fq_dequeue_head(sch, &q->internal);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	427	if (skb)
				428	goto out;
				429	fq_check_throttled(q, now);
				430	begin:
				431	head = &q->new_flows;
				432	if (!head->first) {
				433	head = &q->old_flows;
				434	if (!head->first) {
				435	if (q->time_next_delayed_flow != ~0ULL)
				436	qdisc_watchdog_schedule_ns(&q->watchdog,
				437	q->time_next_delayed_flow);
				438	return NULL;
				439	}
				440	}
				441	f = head->first;
				442
				443	if (f->credit <= 0) {
				444	f->credit += q->quantum;
				445	head->first = f->next;
				446	fq_flow_add_tail(&q->old_flows, f);
				447	goto begin;
				448	}
				449
				450	if (unlikely(f->head && now < f->time_next_packet)) {
				451	head->first = f->next;
				452	fq_flow_set_throttled(q, f);
				453	goto begin;
				454	}
				455
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	456	skb = fq_dequeue_head(sch, f);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	457	if (!skb) {
				458	head->first = f->next;
				459	/* force a pass through old_flows to prevent starvation */
				460	if ((head == &q->new_flows) && q->old_flows.first) {
				461	fq_flow_add_tail(&q->old_flows, f);
				462	} else {
				463	fq_flow_set_detached(f);
				464	f->age = jiffies;
				465	q->inactive_flows++;
				466	}
				467	goto begin;
				468	}
Eric Dumazet	08f89b9	2013-08-30 09:46:43 -0700	[diff] [blame]	469	prefetch(&skb->end);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	470	f->time_next_packet = now;
				471	f->credit -= qdisc_pkt_len(skb);
				472
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	473	if (f->credit > 0 \|\| !q->rate_enable)
				474	goto out;
				475
Eric Dumazet	7eec417	2013-10-08 15:16:00 -0700	[diff] [blame]	476	rate = q->flow_max_rate;
				477	if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT)
				478	rate = min(skb->sk->sk_pacing_rate, rate);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	479
Eric Dumazet	7eec417	2013-10-08 15:16:00 -0700	[diff] [blame]	480	if (rate != ~0U) {
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	481	u32 plen = max(qdisc_pkt_len(skb), q->quantum);
				482	u64 len = (u64)plen * NSEC_PER_SEC;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	483
Eric Dumazet	7eec417	2013-10-08 15:16:00 -0700	[diff] [blame]	484	if (likely(rate))
				485	do_div(len, rate);
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	486	/* Since socket rate can change later,
				487	* clamp the delay to 125 ms.
				488	* TODO: maybe segment the too big skb, as in commit
				489	* e43ac79a4bc ("sch_tbf: segment too big GSO packets")
				490	*/
				491	if (unlikely(len > 125 * NSEC_PER_MSEC)) {
				492	len = 125 * NSEC_PER_MSEC;
				493	q->stat_pkts_too_long++;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	494	}
Eric Dumazet	0eab5eb	2013-10-01 09:10:16 -0700	[diff] [blame]	495
				496	f->time_next_packet = now + len;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	497	}
				498	out:
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	499	qdisc_bstats_update(sch, skb);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	500	qdisc_unthrottled(sch);
				501	return skb;
				502	}
				503
				504	static void fq_reset(struct Qdisc *sch)
				505	{
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	506	struct fq_sched_data *q = qdisc_priv(sch);
				507	struct rb_root *root;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	508	struct sk_buff *skb;
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	509	struct rb_node *p;
				510	struct fq_flow *f;
				511	unsigned int idx;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	512
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	513	while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	514	kfree_skb(skb);
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	515
				516	if (!q->fq_root)
				517	return;
				518
				519	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
				520	root = &q->fq_root[idx];
				521	while ((p = rb_first(root)) != NULL) {
				522	f = container_of(p, struct fq_flow, fq_node);
				523	rb_erase(p, root);
				524
				525	while ((skb = fq_dequeue_head(sch, f)) != NULL)
				526	kfree_skb(skb);
				527
				528	kmem_cache_free(fq_flow_cachep, f);
				529	}
				530	}
				531	q->new_flows.first = NULL;
				532	q->old_flows.first = NULL;
				533	q->delayed = RB_ROOT;
				534	q->flows = 0;
				535	q->inactive_flows = 0;
				536	q->throttled_flows = 0;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	537	}
				538
				539	static void fq_rehash(struct fq_sched_data *q,
				540	struct rb_root *old_array, u32 old_log,
				541	struct rb_root *new_array, u32 new_log)
				542	{
				543	struct rb_node op, np, parent;
				544	struct rb_root oroot, nroot;
				545	struct fq_flow of, nf;
				546	int fcnt = 0;
				547	u32 idx;
				548
				549	for (idx = 0; idx < (1U << old_log); idx++) {
				550	oroot = &old_array[idx];
				551	while ((op = rb_first(oroot)) != NULL) {
				552	rb_erase(op, oroot);
				553	of = container_of(op, struct fq_flow, fq_node);
				554	if (fq_gc_candidate(of)) {
				555	fcnt++;
				556	kmem_cache_free(fq_flow_cachep, of);
				557	continue;
				558	}
				559	nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
				560
				561	np = &nroot->rb_node;
				562	parent = NULL;
				563	while (*np) {
				564	parent = *np;
				565
				566	nf = container_of(parent, struct fq_flow, fq_node);
				567	BUG_ON(nf->sk == of->sk);
				568
				569	if (nf->sk > of->sk)
				570	np = &parent->rb_right;
				571	else
				572	np = &parent->rb_left;
				573	}
				574
				575	rb_link_node(&of->fq_node, parent, np);
				576	rb_insert_color(&of->fq_node, nroot);
				577	}
				578	}
				579	q->flows -= fcnt;
				580	q->inactive_flows -= fcnt;
				581	q->stat_gc_flows += fcnt;
				582	}
				583
				584	static int fq_resize(struct fq_sched_data *q, u32 log)
				585	{
				586	struct rb_root *array;
				587	u32 idx;
				588
				589	if (q->fq_root && log == q->fq_trees_log)
				590	return 0;
				591
				592	array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
				593	if (!array)
				594	return -ENOMEM;
				595
				596	for (idx = 0; idx < (1U << log); idx++)
				597	array[idx] = RB_ROOT;
				598
				599	if (q->fq_root) {
				600	fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
				601	kfree(q->fq_root);
				602	}
				603	q->fq_root = array;
				604	q->fq_trees_log = log;
				605
				606	return 0;
				607	}
				608
				609	static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
				610	[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
				611	[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
				612	[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
				613	[TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
				614	[TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
				615	[TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
				616	[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
				617	[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
				618	};
				619
				620	static int fq_change(struct Qdisc sch, struct nlattr opt)
				621	{
				622	struct fq_sched_data *q = qdisc_priv(sch);
				623	struct nlattr *tb[TCA_FQ_MAX + 1];
				624	int err, drop_count = 0;
				625	u32 fq_log;
				626
				627	if (!opt)
				628	return -EINVAL;
				629
				630	err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
				631	if (err < 0)
				632	return err;
				633
				634	sch_tree_lock(sch);
				635
				636	fq_log = q->fq_trees_log;
				637
				638	if (tb[TCA_FQ_BUCKETS_LOG]) {
				639	u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
				640
				641	if (nval >= 1 && nval <= ilog2(256*1024))
				642	fq_log = nval;
				643	else
				644	err = -EINVAL;
				645	}
				646	if (tb[TCA_FQ_PLIMIT])
				647	sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
				648
				649	if (tb[TCA_FQ_FLOW_PLIMIT])
				650	q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
				651
				652	if (tb[TCA_FQ_QUANTUM])
				653	q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
				654
				655	if (tb[TCA_FQ_INITIAL_QUANTUM])
Eric Dumazet	ede869c	2013-10-07 12:50:18 -0700	[diff] [blame]	656	q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	657
				658	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
				659	q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);
				660
				661	if (tb[TCA_FQ_FLOW_MAX_RATE])
				662	q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
				663
				664	if (tb[TCA_FQ_RATE_ENABLE]) {
				665	u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
				666
				667	if (enable <= 1)
				668	q->rate_enable = enable;
				669	else
				670	err = -EINVAL;
				671	}
				672
				673	if (!err)
				674	err = fq_resize(q, fq_log);
				675
				676	while (sch->q.qlen > sch->limit) {
				677	struct sk_buff *skb = fq_dequeue(sch);
				678
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	679	if (!skb)
				680	break;
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	681	kfree_skb(skb);
				682	drop_count++;
				683	}
				684	qdisc_tree_decrease_qlen(sch, drop_count);
				685
				686	sch_tree_unlock(sch);
				687	return err;
				688	}
				689
				690	static void fq_destroy(struct Qdisc *sch)
				691	{
				692	struct fq_sched_data *q = qdisc_priv(sch);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	693
Eric Dumazet	8d34ce1	2013-09-27 14:20:01 -0700	[diff] [blame]	694	fq_reset(sch);
				695	kfree(q->fq_root);
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	696	qdisc_watchdog_cancel(&q->watchdog);
				697	}
				698
				699	static int fq_init(struct Qdisc sch, struct nlattr opt)
				700	{
				701	struct fq_sched_data *q = qdisc_priv(sch);
				702	int err;
				703
				704	sch->limit = 10000;
				705	q->flow_plimit = 100;
				706	q->quantum = 2 * psched_mtu(qdisc_dev(sch));
				707	q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
				708	q->flow_default_rate = 0;
				709	q->flow_max_rate = ~0U;
				710	q->rate_enable = 1;
				711	q->new_flows.first = NULL;
				712	q->old_flows.first = NULL;
				713	q->delayed = RB_ROOT;
				714	q->fq_root = NULL;
				715	q->fq_trees_log = ilog2(1024);
				716	qdisc_watchdog_init(&q->watchdog, sch);
				717
				718	if (opt)
				719	err = fq_change(sch, opt);
				720	else
				721	err = fq_resize(q, q->fq_trees_log);
				722
				723	return err;
				724	}
				725
				726	static int fq_dump(struct Qdisc sch, struct sk_buff skb)
				727	{
				728	struct fq_sched_data *q = qdisc_priv(sch);
				729	struct nlattr *opts;
				730
				731	opts = nla_nest_start(skb, TCA_OPTIONS);
				732	if (opts == NULL)
				733	goto nla_put_failure;
				734
Eric Dumazet	7eec417	2013-10-08 15:16:00 -0700	[diff] [blame]	735	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore,
				736	* do not bother giving its value
				737	*/
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	738	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) \|\|
				739	nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) \|\|
				740	nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) \|\|
				741	nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) \|\|
				742	nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) \|\|
Eric Dumazet	afe4fd0	2013-08-29 15:49:55 -0700	[diff] [blame]	743	nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) \|\|
				744	nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
				745	goto nla_put_failure;
				746
				747	nla_nest_end(skb, opts);
				748	return skb->len;
				749
				750	nla_put_failure:
				751	return -1;
				752	}
				753
				754	static int fq_dump_stats(struct Qdisc sch, struct gnet_dump d)
				755	{
				756	struct fq_sched_data *q = qdisc_priv(sch);
				757	u64 now = ktime_to_ns(ktime_get());
				758	struct tc_fq_qd_stats st = {
				759	.gc_flows = q->stat_gc_flows,
				760	.highprio_packets = q->stat_internal_packets,
				761	.tcp_retrans = q->stat_tcp_retrans,
				762	.throttled = q->stat_throttled,
				763	.flows_plimit = q->stat_flows_plimit,
				764	.pkts_too_long = q->stat_pkts_too_long,
				765	.allocation_errors = q->stat_allocation_errors,
				766	.flows = q->flows,
				767	.inactive_flows = q->inactive_flows,
				768	.throttled_flows = q->throttled_flows,
				769	.time_next_delayed_flow = q->time_next_delayed_flow - now,
				770	};
				771
				772	return gnet_stats_copy_app(d, &st, sizeof(st));
				773	}
				774
				775	static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
				776	.id = "fq",
				777	.priv_size = sizeof(struct fq_sched_data),
				778
				779	.enqueue = fq_enqueue,
				780	.dequeue = fq_dequeue,
				781	.peek = qdisc_peek_dequeued,
				782	.init = fq_init,
				783	.reset = fq_reset,
				784	.destroy = fq_destroy,
				785	.change = fq_change,
				786	.dump = fq_dump,
				787	.dump_stats = fq_dump_stats,
				788	.owner = THIS_MODULE,
				789	};
				790
				791	static int __init fq_module_init(void)
				792	{
				793	int ret;
				794
				795	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
				796	sizeof(struct fq_flow),
				797	0, 0, NULL);
				798	if (!fq_flow_cachep)
				799	return -ENOMEM;
				800
				801	ret = register_qdisc(&fq_qdisc_ops);
				802	if (ret)
				803	kmem_cache_destroy(fq_flow_cachep);
				804	return ret;
				805	}
				806
				807	static void __exit fq_module_exit(void)
				808	{
				809	unregister_qdisc(&fq_qdisc_ops);
				810	kmem_cache_destroy(fq_flow_cachep);
				811	}
				812
				813	module_init(fq_module_init)
				814	module_exit(fq_module_exit)
				815	MODULE_AUTHOR("Eric Dumazet");
				816	MODULE_LICENSE("GPL");