blob: bb5dc4bfb6b65c2164f5307e43c9ab1f7f61bcee [file] [log] [blame]
Stephen Hemmingerdf3271f2005-12-13 23:13:28 -08001/*
2 * TCP CUBIC: Binary Increase Congestion control for TCP v2.0
3 *
4 * This is from the implementation of CUBIC TCP in
5 * Injong Rhee, Lisong Xu.
6 * "CUBIC: A New TCP-Friendly High-Speed TCP Variant
7 * in PFLDnet 2005
8 * Available from:
9 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
10 *
11 * Unless CUBIC is enabled and congestion window is large
12 * this behaves the same as the original Reno.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <net/tcp.h>
19
20
21#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
22 * max_cwnd = snd_cwnd * beta
23 */
24#define BICTCP_B 4 /*
25 * In binary search,
26 * go to point (max+min)/N
27 */
28#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
29
30static int fast_convergence = 1;
31static int max_increment = 16;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int initial_ssthresh = 100;
34static int bic_scale = 41;
35static int tcp_friendliness = 1;
36
37module_param(fast_convergence, int, 0644);
38MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
39module_param(max_increment, int, 0644);
40MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
41module_param(beta, int, 0644);
42MODULE_PARM_DESC(beta, "beta for multiplicative increase");
43module_param(initial_ssthresh, int, 0644);
44MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
45module_param(bic_scale, int, 0644);
46MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
47module_param(tcp_friendliness, int, 0644);
48MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
49
50
51/* BIC TCP Parameters */
52struct bictcp {
53 u32 cnt; /* increase cwnd by 1 after ACKs */
54 u32 last_max_cwnd; /* last maximum snd_cwnd */
55 u32 loss_cwnd; /* congestion window at last loss */
56 u32 last_cwnd; /* the last snd_cwnd */
57 u32 last_time; /* time when updated last_cwnd */
58 u32 bic_origin_point;/* origin point of bic function */
59 u32 bic_K; /* time to origin point from the beginning of the current epoch */
60 u32 delay_min; /* min delay */
61 u32 epoch_start; /* beginning of an epoch */
62 u32 ack_cnt; /* number of acks */
63 u32 tcp_cwnd; /* estimated tcp cwnd */
64#define ACK_RATIO_SHIFT 4
65 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
66};
67
68static inline void bictcp_reset(struct bictcp *ca)
69{
70 ca->cnt = 0;
71 ca->last_max_cwnd = 0;
72 ca->loss_cwnd = 0;
73 ca->last_cwnd = 0;
74 ca->last_time = 0;
75 ca->bic_origin_point = 0;
76 ca->bic_K = 0;
77 ca->delay_min = 0;
78 ca->epoch_start = 0;
79 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
80 ca->ack_cnt = 0;
81 ca->tcp_cwnd = 0;
82}
83
84static void bictcp_init(struct sock *sk)
85{
86 bictcp_reset(inet_csk_ca(sk));
87 if (initial_ssthresh)
88 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
89}
90
91/* 65536 times the cubic root */
92static const u64 cubic_table[8]
93 = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367};
94
95/*
96 * calculate the cubic root of x
97 * the basic idea is that x can be expressed as i*8^j
98 * so cubic_root(x) = cubic_root(i)*2^j
99 * in the following code, x is i, and y is 2^j
100 * because of integer calculation, there are errors in calculation
101 * so finally use binary search to find out the exact solution
102 */
103static u32 cubic_root(u64 x)
104{
105 u64 y, app, target, start, end, mid, start_diff, end_diff;
106
107 if (x == 0)
108 return 0;
109
110 target = x;
111
112 /* first estimate lower and upper bound */
113 y = 1;
114 while (x >= 8){
115 x = (x >> 3);
116 y = (y << 1);
117 }
118 start = (y*cubic_table[x])>>16;
119 if (x==7)
120 end = (y<<1);
121 else
122 end = (y*cubic_table[x+1]+65535)>>16;
123
124 /* binary search for more accurate one */
125 while (start < end-1) {
126 mid = (start+end) >> 1;
127 app = mid*mid*mid;
128 if (app < target)
129 start = mid;
130 else if (app > target)
131 end = mid;
132 else
133 return mid;
134 }
135
136 /* find the most accurate one from start and end */
137 app = start*start*start;
138 if (app < target)
139 start_diff = target - app;
140 else
141 start_diff = app - target;
142 app = end*end*end;
143 if (app < target)
144 end_diff = target - app;
145 else
146 end_diff = app - target;
147
148 if (start_diff < end_diff)
149 return (u32)start;
150 else
151 return (u32)end;
152}
153
154static inline u32 bictcp_K(u32 dist, u32 srtt)
155{
156 u64 d64;
157 u32 d32;
158 u32 count;
159 u32 result;
160
161 /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
162 so K = cubic_root( (wmax-cwnd)*rtt/c )
163 the unit of K is bictcp_HZ=2^10, not HZ
164
165 c = bic_scale >> 10
166 rtt = (tp->srtt >> 3 ) / HZ
167
168 the following code has been designed and tested for
169 cwnd < 1 million packets
170 RTT < 100 seconds
171 HZ < 1,000,00 (corresponding to 10 nano-second)
172
173 */
174
175 /* 1/c * 2^2*bictcp_HZ */
176 d32 = (1 << (10+2*BICTCP_HZ)) / bic_scale;
177 d64 = (__u64)d32;
178
179 /* srtt * 2^count / HZ
180 1) to get a better accuracy of the following d32,
181 the larger the "count", the better the accuracy
182 2) and avoid overflow of the following d64
183 the larger the "count", the high possibility of overflow
184 3) so find a "count" between bictcp_hz-3 and bictcp_hz
185 "count" may be less than bictcp_HZ,
186 then d64 becomes 0. that is OK
187 */
188 d32 = srtt;
189 count = 0;
190 while (((d32 & 0x80000000)==0) && (count < BICTCP_HZ)){
191 d32 = d32 << 1;
192 count++;
193 }
194 d32 = d32 / HZ;
195
196 /* (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) */
197 d64 = (d64 * dist * d32) >> (count+3-BICTCP_HZ);
198
199 /* cubic root */
200 d64 = cubic_root(d64);
201
202 result = (u32)d64;
203 return result;
204}
205
206/*
207 * Compute congestion window to use.
208 */
209static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
210{
211 u64 d64;
212 u32 d32, t, srtt, bic_target, min_cnt, max_cnt;
213
214 ca->ack_cnt++; /* count the number of ACKs */
215
216 if (ca->last_cwnd == cwnd &&
217 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
218 return;
219
220 ca->last_cwnd = cwnd;
221 ca->last_time = tcp_time_stamp;
222
223 srtt = (HZ << 3)/10; /* use real time-based growth function */
224
225 if (ca->epoch_start == 0) {
226 ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */
227 ca->ack_cnt = 1; /* start counting */
228 ca->tcp_cwnd = cwnd; /* syn with cubic */
229
230 if (ca->last_max_cwnd <= cwnd) {
231 ca->bic_K = 0;
232 ca->bic_origin_point = cwnd;
233 } else {
234 ca->bic_K = bictcp_K(ca->last_max_cwnd-cwnd, srtt);
235 ca->bic_origin_point = ca->last_max_cwnd;
236 }
237 }
238
239 /* cubic function - calc*/
240 /* calculate c * time^3 / rtt,
241 * while considering overflow in calculation of time^3
242 * (so time^3 is done by using d64)
243 * and without the support of division of 64bit numbers
244 * (so all divisions are done by using d32)
245 * also NOTE the unit of those veriables
246 * time = (t - K) / 2^bictcp_HZ
247 * c = bic_scale >> 10
248 * rtt = (srtt >> 3) / HZ
249 * !!! The following code does not have overflow problems,
250 * if the cwnd < 1 million packets !!!
251 */
252
253 /* change the unit from HZ to bictcp_HZ */
254 t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start)
255 << BICTCP_HZ) / HZ;
256
257 if (t < ca->bic_K) /* t - K */
258 d32 = ca->bic_K - t;
259 else
260 d32 = t - ca->bic_K;
261
262 d64 = (u64)d32;
263 d32 = (bic_scale << 3) * HZ / srtt; /* 1024*c/rtt */
264 d64 = (d32 * d64 * d64 * d64) >> (10+3*BICTCP_HZ); /* c/rtt * (t-K)^3 */
265 d32 = (u32)d64;
266 if (t < ca->bic_K) /* below origin*/
267 bic_target = ca->bic_origin_point - d32;
268 else /* above origin*/
269 bic_target = ca->bic_origin_point + d32;
270
271 /* cubic function - calc bictcp_cnt*/
272 if (bic_target > cwnd) {
273 ca->cnt = cwnd / (bic_target - cwnd);
274 } else {
275 ca->cnt = 100 * cwnd; /* very small increment*/
276 }
277
278 if (ca->delay_min > 0) {
279 /* max increment = Smax * rtt / 0.1 */
280 min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min);
281 if (ca->cnt < min_cnt)
282 ca->cnt = min_cnt;
283 }
284
285 /* slow start and low utilization */
286 if (ca->loss_cwnd == 0) /* could be aggressive in slow start */
287 ca->cnt = 50;
288
289 /* TCP Friendly */
290 if (tcp_friendliness) {
291 u32 scale = 8*(BICTCP_BETA_SCALE+beta)/3/(BICTCP_BETA_SCALE-beta);
292 d32 = (cwnd * scale) >> 3;
293 while (ca->ack_cnt > d32) { /* update tcp cwnd */
294 ca->ack_cnt -= d32;
295 ca->tcp_cwnd++;
296 }
297
298 if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */
299 d32 = ca->tcp_cwnd - cwnd;
300 max_cnt = cwnd / d32;
301 if (ca->cnt > max_cnt)
302 ca->cnt = max_cnt;
303 }
304 }
305
306 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
307 if (ca->cnt == 0) /* cannot be zero */
308 ca->cnt = 1;
309}
310
311
312/* Keep track of minimum rtt */
313static inline void measure_delay(struct sock *sk)
314{
315 const struct tcp_sock *tp = tcp_sk(sk);
316 struct bictcp *ca = inet_csk_ca(sk);
317 u32 delay;
318
319 /* No time stamp */
320 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
321 /* Discard delay samples right after fast recovery */
322 (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
323 return;
324
325 delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
326 if (delay == 0)
327 delay = 1;
328
329 /* first time call or link delay decreases */
330 if (ca->delay_min == 0 || ca->delay_min > delay)
331 ca->delay_min = delay;
332}
333
334static void bictcp_cong_avoid(struct sock *sk, u32 ack,
335 u32 seq_rtt, u32 in_flight, int data_acked)
336{
337 struct tcp_sock *tp = tcp_sk(sk);
338 struct bictcp *ca = inet_csk_ca(sk);
339
340 if (data_acked)
341 measure_delay(sk);
342
343 if (!tcp_is_cwnd_limited(sk, in_flight))
344 return;
345
346 if (tp->snd_cwnd <= tp->snd_ssthresh)
347 tcp_slow_start(tp);
348 else {
349 bictcp_update(ca, tp->snd_cwnd);
350
351 /* In dangerous area, increase slowly.
352 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
353 */
354 if (tp->snd_cwnd_cnt >= ca->cnt) {
355 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
356 tp->snd_cwnd++;
357 tp->snd_cwnd_cnt = 0;
358 } else
359 tp->snd_cwnd_cnt++;
360 }
361
362}
363
364static u32 bictcp_recalc_ssthresh(struct sock *sk)
365{
366 const struct tcp_sock *tp = tcp_sk(sk);
367 struct bictcp *ca = inet_csk_ca(sk);
368
369 ca->epoch_start = 0; /* end of epoch */
370
371 /* Wmax and fast convergence */
372 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
373 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
374 / (2 * BICTCP_BETA_SCALE);
375 else
376 ca->last_max_cwnd = tp->snd_cwnd;
377
378 ca->loss_cwnd = tp->snd_cwnd;
379
380 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
381}
382
383static u32 bictcp_undo_cwnd(struct sock *sk)
384{
385 struct bictcp *ca = inet_csk_ca(sk);
386
387 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
388}
389
390static u32 bictcp_min_cwnd(struct sock *sk)
391{
392 return tcp_sk(sk)->snd_ssthresh;
393}
394
395static void bictcp_state(struct sock *sk, u8 new_state)
396{
397 if (new_state == TCP_CA_Loss)
398 bictcp_reset(inet_csk_ca(sk));
399}
400
401/* Track delayed acknowledgment ratio using sliding window
402 * ratio = (15*ratio + sample) / 16
403 */
404static void bictcp_acked(struct sock *sk, u32 cnt)
405{
406 const struct inet_connection_sock *icsk = inet_csk(sk);
407
408 if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) {
409 struct bictcp *ca = inet_csk_ca(sk);
410 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
411 ca->delayed_ack += cnt;
412 }
413}
414
415
416static struct tcp_congestion_ops cubictcp = {
417 .init = bictcp_init,
418 .ssthresh = bictcp_recalc_ssthresh,
419 .cong_avoid = bictcp_cong_avoid,
420 .set_state = bictcp_state,
421 .undo_cwnd = bictcp_undo_cwnd,
422 .min_cwnd = bictcp_min_cwnd,
423 .pkts_acked = bictcp_acked,
424 .owner = THIS_MODULE,
425 .name = "cubic",
426};
427
428static int __init cubictcp_register(void)
429{
430 BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
431 return tcp_register_congestion_control(&cubictcp);
432}
433
434static void __exit cubictcp_unregister(void)
435{
436 tcp_unregister_congestion_control(&cubictcp);
437}
438
439module_init(cubictcp_register);
440module_exit(cubictcp_unregister);
441
442MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
443MODULE_LICENSE("GPL");
444MODULE_DESCRIPTION("CUBIC TCP");
445MODULE_VERSION("2.0");