blob: 57abe8266be198dd483c71ce07cf3ce0757ced0e [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * net/sched/gen_estimator.c Simple rate estimator.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Changes:
12 * Jamal Hadi Salim - moved it to net/core and reshulfed
13 * names to make it usable in general net subsystem.
14 */
15
16#include <asm/uaccess.h>
17#include <asm/system.h>
Jiri Slaby1977f032007-10-18 23:40:25 -070018#include <linux/bitops.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070019#include <linux/module.h>
20#include <linux/types.h>
21#include <linux/kernel.h>
22#include <linux/jiffies.h>
23#include <linux/string.h>
24#include <linux/mm.h>
25#include <linux/socket.h>
26#include <linux/sockios.h>
27#include <linux/in.h>
28#include <linux/errno.h>
29#include <linux/interrupt.h>
30#include <linux/netdevice.h>
31#include <linux/skbuff.h>
32#include <linux/rtnetlink.h>
33#include <linux/init.h>
34#include <net/sock.h>
35#include <net/gen_stats.h>
36
37/*
38 This code is NOT intended to be used for statistics collection,
39 its purpose is to provide a base for statistical multiplexing
40 for controlled load service.
41 If you need only statistics, run a user level daemon which
42 periodically reads byte counters.
43
44 Unfortunately, rate estimation is not a very easy task.
45 F.e. I did not find a simple way to estimate the current peak rate
46 and even failed to formulate the problem 8)8)
47
48 So I preferred not to built an estimator into the scheduler,
49 but run this task separately.
50 Ideally, it should be kernel thread(s), but for now it runs
51 from timers, which puts apparent top bounds on the number of rated
52 flows, has minimal overhead on small, but is enough
53 to handle controlled load service, sets of aggregates.
54
55 We measure rate over A=(1<<interval) seconds and evaluate EWMA:
56
57 avrate = avrate*(1-W) + rate*W
58
59 where W is chosen as negative power of 2: W = 2^(-ewma_log)
60
61 The resulting time constant is:
62
63 T = A/(-ln(1-W))
64
65
66 NOTES.
67
68 * The stored value for avbps is scaled by 2^5, so that maximal
69 rate is ~1Gbit, avpps is scaled by 2^10.
70
71 * Minimal interval is HZ/4=250msec (it is the greatest common divisor
72 for HZ=100 and HZ=1024 8)), maximal interval
73 is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
74 are too expensive, longer ones can be implemented
75 at user level painlessly.
76 */
77
78#define EST_MAX_INTERVAL 5
79
80struct gen_estimator
81{
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -070082 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -070083 struct gnet_stats_basic *bstats;
84 struct gnet_stats_rate_est *rate_est;
85 spinlock_t *stats_lock;
Linus Torvalds1da177e2005-04-16 15:20:36 -070086 int ewma_log;
87 u64 last_bytes;
88 u32 last_packets;
89 u32 avpps;
90 u32 avbps;
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -070091 struct rcu_head e_rcu;
Linus Torvalds1da177e2005-04-16 15:20:36 -070092};
93
94struct gen_estimator_head
95{
96 struct timer_list timer;
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -070097 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -070098};
99
100static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
101
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700102/* Protects against NULL dereference */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700103static DEFINE_RWLOCK(est_lock);
104
105static void est_timer(unsigned long arg)
106{
107 int idx = (int)arg;
108 struct gen_estimator *e;
109
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700110 rcu_read_lock();
111 list_for_each_entry_rcu(e, &elist[idx].list, list) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700112 u64 nbytes;
113 u32 npackets;
114 u32 rate;
115
116 spin_lock(e->stats_lock);
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700117 read_lock(&est_lock);
118 if (e->bstats == NULL)
119 goto skip;
120
Linus Torvalds1da177e2005-04-16 15:20:36 -0700121 nbytes = e->bstats->bytes;
122 npackets = e->bstats->packets;
123 rate = (nbytes - e->last_bytes)<<(7 - idx);
124 e->last_bytes = nbytes;
125 e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
126 e->rate_est->bps = (e->avbps+0xF)>>5;
127
128 rate = (npackets - e->last_packets)<<(12 - idx);
129 e->last_packets = npackets;
130 e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
131 e->rate_est->pps = (e->avpps+0x1FF)>>10;
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700132skip:
133 read_unlock(&est_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700134 spin_unlock(e->stats_lock);
135 }
136
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700137 if (!list_empty(&elist[idx].list))
Eric Dumazet789675e2008-01-03 20:40:01 -0800138 mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700139 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700140}
141
142/**
143 * gen_new_estimator - create a new rate estimator
144 * @bstats: basic statistics
145 * @rate_est: rate estimator statistics
146 * @stats_lock: statistics lock
147 * @opt: rate estimator configuration TLV
148 *
149 * Creates a new rate estimator with &bstats as source and &rate_est
150 * as destination. A new timer with the interval specified in the
151 * configuration TLV is created. Upon each interval, the latest statistics
152 * will be read from &bstats and the estimated rate will be stored in
153 * &rate_est with the statistics lock grabed during this period.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900154 *
Linus Torvalds1da177e2005-04-16 15:20:36 -0700155 * Returns 0 on success or a negative error code.
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700156 *
157 * NOTE: Called under rtnl_mutex
Linus Torvalds1da177e2005-04-16 15:20:36 -0700158 */
159int gen_new_estimator(struct gnet_stats_basic *bstats,
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700160 struct gnet_stats_rate_est *rate_est,
161 spinlock_t *stats_lock,
Patrick McHardy1e904742008-01-22 22:11:17 -0800162 struct nlattr *opt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700163{
164 struct gen_estimator *est;
Patrick McHardy1e904742008-01-22 22:11:17 -0800165 struct gnet_estimator *parm = nla_data(opt);
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700166 int idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700167
Patrick McHardy1e904742008-01-22 22:11:17 -0800168 if (nla_len(opt) < sizeof(*parm))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700169 return -EINVAL;
170
171 if (parm->interval < -2 || parm->interval > 3)
172 return -EINVAL;
173
Andrew Morton77d04bd2006-04-07 14:52:59 -0700174 est = kzalloc(sizeof(*est), GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700175 if (est == NULL)
176 return -ENOBUFS;
177
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700178 idx = parm->interval + 2;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700179 est->bstats = bstats;
180 est->rate_est = rate_est;
181 est->stats_lock = stats_lock;
182 est->ewma_log = parm->ewma_log;
183 est->last_bytes = bstats->bytes;
184 est->avbps = rate_est->bps<<5;
185 est->last_packets = bstats->packets;
186 est->avpps = rate_est->pps<<10;
187
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700188 if (!elist[idx].timer.function) {
189 INIT_LIST_HEAD(&elist[idx].list);
190 setup_timer(&elist[idx].timer, est_timer, idx);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700191 }
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700192
193 if (list_empty(&elist[idx].list))
Eric Dumazet789675e2008-01-03 20:40:01 -0800194 mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700195
196 list_add_rcu(&est->list, &elist[idx].list);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700197 return 0;
198}
199
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700200static void __gen_kill_estimator(struct rcu_head *head)
201{
202 struct gen_estimator *e = container_of(head,
203 struct gen_estimator, e_rcu);
204 kfree(e);
205}
206
Linus Torvalds1da177e2005-04-16 15:20:36 -0700207/**
208 * gen_kill_estimator - remove a rate estimator
209 * @bstats: basic statistics
210 * @rate_est: rate estimator statistics
211 *
212 * Removes the rate estimator specified by &bstats and &rate_est
213 * and deletes the timer.
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700214 *
215 * NOTE: Called under rtnl_mutex
Linus Torvalds1da177e2005-04-16 15:20:36 -0700216 */
217void gen_kill_estimator(struct gnet_stats_basic *bstats,
218 struct gnet_stats_rate_est *rate_est)
219{
220 int idx;
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700221 struct gen_estimator *e, *n;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700222
223 for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700224
225 /* Skip non initialized indexes */
226 if (!elist[idx].timer.function)
227 continue;
228
229 list_for_each_entry_safe(e, n, &elist[idx].list, list) {
230 if (e->rate_est != rate_est || e->bstats != bstats)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700231 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700232
233 write_lock_bh(&est_lock);
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700234 e->bstats = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700235 write_unlock_bh(&est_lock);
236
Ranko Zivojnovic0929c2d2007-07-16 18:28:32 -0700237 list_del_rcu(&e->list);
238 call_rcu(&e->e_rcu, __gen_kill_estimator);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700239 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700240 }
241}
242
243/**
Jarek Poplawski96750162008-01-21 02:36:02 -0800244 * gen_replace_estimator - replace rate estimator configuration
Linus Torvalds1da177e2005-04-16 15:20:36 -0700245 * @bstats: basic statistics
246 * @rate_est: rate estimator statistics
247 * @stats_lock: statistics lock
248 * @opt: rate estimator configuration TLV
249 *
250 * Replaces the configuration of a rate estimator by calling
251 * gen_kill_estimator() and gen_new_estimator().
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900252 *
Linus Torvalds1da177e2005-04-16 15:20:36 -0700253 * Returns 0 on success or a negative error code.
254 */
Jarek Poplawski96750162008-01-21 02:36:02 -0800255int gen_replace_estimator(struct gnet_stats_basic *bstats,
256 struct gnet_stats_rate_est *rate_est,
Patrick McHardy1e904742008-01-22 22:11:17 -0800257 spinlock_t *stats_lock, struct nlattr *opt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258{
Jarek Poplawski96750162008-01-21 02:36:02 -0800259 gen_kill_estimator(bstats, rate_est);
260 return gen_new_estimator(bstats, rate_est, stats_lock, opt);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700261}
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900262
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263
264EXPORT_SYMBOL(gen_kill_estimator);
265EXPORT_SYMBOL(gen_new_estimator);
266EXPORT_SYMBOL(gen_replace_estimator);