blob: e4e37edbad60f6fd9a83d7098f98ca0bf2e6582f [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Herbert Xu352e5122007-11-13 21:34:06 -080093#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020094#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070095#include <net/protocol.h>
96#include <net/ip.h>
97#include <net/route.h>
98#include <net/inetpeer.h>
99#include <net/sock.h>
100#include <net/ip_fib.h>
101#include <net/arp.h>
102#include <net/tcp.h>
103#include <net/icmp.h>
104#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700105#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700106#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110
111#define RT_FL_TOS(oldflp) \
112 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113
114#define IP_MAX_MTU 0xFFF0
115
116#define RT_GC_TIMEOUT (300*HZ)
117
Linus Torvalds1da177e2005-04-16 15:20:36 -0700118static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700119static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
120static int ip_rt_gc_interval __read_mostly = 60 * HZ;
121static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
122static int ip_rt_redirect_number __read_mostly = 9;
123static int ip_rt_redirect_load __read_mostly = HZ / 50;
124static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125static int ip_rt_error_cost __read_mostly = HZ;
126static int ip_rt_error_burst __read_mostly = 5 * HZ;
127static int ip_rt_gc_elasticity __read_mostly = 8;
128static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130static int ip_rt_min_advmss __read_mostly = 256;
131static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700132
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800133static void rt_worker_func(struct work_struct *work);
134static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135
136/*
137 * Interface to generic destination cache.
138 */
139
140static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141static void ipv4_dst_destroy(struct dst_entry *dst);
142static void ipv4_dst_ifdown(struct dst_entry *dst,
143 struct net_device *dev, int how);
144static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145static void ipv4_link_failure(struct sk_buff *skb);
146static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800147static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148
149
/* Generic destination-cache operations for IPv4 routes. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entry_size =		sizeof(struct rtable),
	.entries =		ATOMIC_INIT(0),	/* live cache entry count */
};
164
/* Expand an ECN_OR_COST class name to its TC priority constant. */
#define ECN_OR_COST(class)	TC_PRIO_##class

/*
 * Map a 4-bit TOS value to a traffic-control priority band.
 * NOTE(review): the exact index derivation ((tos >> 1) & 0xF or similar)
 * is done by callers outside this view — confirm before relying on it.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
185
186
187/*
188 * Route cache.
189 */
190
191/* The locking scheme is rather straight forward:
192 *
193 * 1) Read-Copy Update protects the buckets of the central route hash.
194 * 2) Only writers remove entries, and they hold the lock
195 * as they look at rtable reference counts.
196 * 3) Only readers acquire references to rtable entries,
197 * they do so with atomic increments and with the
198 * lock held.
199 */
200
/*
 * Head of one route-cache hash chain.  Readers traverse it under RCU;
 * writers serialize on rt_hash_lock_addr(slot) (one spinlock covers
 * several buckets).
 */
struct rt_hash_bucket {
	struct rtable	*chain;
};
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700204#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
205 defined(CONFIG_PROVE_LOCKING)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700206/*
207 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
208 * The size of this table is a power of two and depends on the number of CPUS.
Ingo Molnar62051202006-07-03 00:24:59 -0700209 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700210 */
Ingo Molnar62051202006-07-03 00:24:59 -0700211#ifdef CONFIG_LOCKDEP
212# define RT_HASH_LOCK_SZ 256
Eric Dumazet22c047c2005-07-05 14:55:24 -0700213#else
Ingo Molnar62051202006-07-03 00:24:59 -0700214# if NR_CPUS >= 32
215# define RT_HASH_LOCK_SZ 4096
216# elif NR_CPUS >= 16
217# define RT_HASH_LOCK_SZ 2048
218# elif NR_CPUS >= 8
219# define RT_HASH_LOCK_SZ 1024
220# elif NR_CPUS >= 4
221# define RT_HASH_LOCK_SZ 512
222# else
223# define RT_HASH_LOCK_SZ 256
224# endif
Eric Dumazet22c047c2005-07-05 14:55:24 -0700225#endif
226
227static spinlock_t *rt_hash_locks;
228# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800229
/*
 * Allocate and initialize the table of spinlocks protecting the route
 * hash chains.  RT_HASH_LOCK_SZ is a power of two chosen above from
 * NR_CPUS (kept small under lockdep).
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700242#else
243# define rt_hash_lock_addr(slot) NULL
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800244
/* UP/non-debug build: rt_hash_lock_addr() is NULL, nothing to set up. */
static inline void rt_hash_lock_init(void)
{
}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700248#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700249
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700250static struct rt_hash_bucket *rt_hash_table __read_mostly;
251static unsigned rt_hash_mask __read_mostly;
252static unsigned int rt_hash_log __read_mostly;
253static atomic_t rt_genid __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700254
Eric Dumazet2f970d82006-01-17 02:54:36 -0800255static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Andrew Mortondbd29152006-01-17 21:58:01 -0800256#define RT_CACHE_STAT_INC(field) \
Paul Mackerrasbfe5d832006-06-25 05:47:14 -0700257 (__raw_get_cpu_var(rt_cache_stat).field++)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700259static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
260 int genid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700261{
Stephen Hemminger1294fc42008-04-10 01:54:01 -0700262 return jhash_3words((__force u32)(__be32)(daddr),
263 (__force u32)(__be32)(saddr),
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700264 idx, genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800265 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700266}
267
268#ifdef CONFIG_PROC_FS
/* Per-open iterator state for /proc/net/rt_cache walks. */
struct rt_cache_iter_state {
	struct seq_net_private p;	/* netns of the opener (seq_file_net) */
	int bucket;			/* current hash bucket, counts down */
	int genid;			/* rt_genid latched at seq_start */
};
274
/*
 * Walk the hash table from the highest bucket downwards and return the
 * first entry visible to this reader (same netns, current generation).
 * On success rcu_read_lock_bh() for that bucket is left held; it is
 * released by the iteration helpers or by rt_cache_seq_stop().
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rcu_dereference(rt_hash_table[st->bucket].chain);
		while (r) {
			/* skip foreign-netns and stale-generation entries */
			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference(r->u.dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;	/* NULL: table exhausted, no lock held */
}
293
/*
 * Advance to the entry after @r, moving to lower-numbered buckets as
 * chains run out.  Drops the finished bucket's RCU BH lock and takes
 * it for the next one; returns NULL (no lock held) when the table is
 * exhausted.  No netns/generation filtering is done here.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}
308
/*
 * Filtered successor: like __rt_cache_get_next() but skips entries
 * from other namespaces or old cache generations.
 */
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
321
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900322static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700323{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900324 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700325
326 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900327 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700328 --pos;
329 return pos ? NULL : r;
330}
331
/*
 * seq_file start: position 0 yields the header token and latches the
 * current cache generation so the whole walk shows a consistent view;
 * otherwise seek to the (pos-1)'th entry.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = atomic_read(&rt_genid);
	return SEQ_START_TOKEN;
}
340
341static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
342{
Eric Dumazet29e75252008-01-31 17:05:09 -0800343 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700344
345 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900346 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700347 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900348 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700349 ++*pos;
350 return r;
351}
352
/*
 * seq_file stop: release the RCU BH lock still held by the get_first/
 * get_next helpers, unless the walk never reached a real entry.
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
358
/*
 * Emit one /proc/net/rt_cache row, or the column header for the start
 * token.  Every record is padded to 127 characters plus '\n' so that
 * userspace can treat the file as fixed-size records.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;	/* chars written so far, via %n */

		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			/* advertised MSS + 40 approximates the MTU column */
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		/* pad the record out to the fixed 127-char width */
		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
391
/* seq_file iterator for /proc/net/rt_cache */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
398
/* open(): per-netns seq_file carrying a rt_cache_iter_state. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}
404
/* file_operations for /proc/net/rt_cache */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
412
413
/*
 * Iterate the per-CPU rt_cache_stat structures.  *pos encodes
 * (cpu index + 1); position 0 yields the header token.  Impossible
 * CPU slots are skipped.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
429
/*
 * Advance to the next possible CPU's stats.  Same scan as
 * rt_cpu_seq_start() but begins at *pos rather than *pos - 1.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}
443
/* Nothing to release: the per-CPU stats are read without locking. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
448
/*
 * Emit one /proc/net/stat/rt_cache line: the global entry count
 * followed by this CPU's counters, all in hex.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	/* 'entries' is global (ipv4_dst_ops), repeated on every CPU line */
	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
482
/* seq_file iterator for /proc/net/stat/rt_cache */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
489
490
/* open(): plain seq_file, no private iterator state needed. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
495
/* file_operations for /proc/net/stat/rt_cache */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
503
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800504#ifdef CONFIG_NET_CLS_ROUTE
/*
 * /proc read handler for routing-classifier accounting (rt_acct).
 * Sums the per-CPU ip_rt_acct counters of all possible CPUs into the
 * caller's buffer.  The table is 256 entries of struct ip_rt_acct;
 * offset and length must be 4-byte aligned.
 */
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	/* clamp the read to the end of the table */
	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);	/* byte offset -> u32 index */

	if (length > 0) {
		u32 *dst = (u32 *) buffer;

		*start = buffer;
		memset(dst, 0, length);

		/* accumulate every CPU's counters word by word */
		for_each_possible_cpu(i) {
			unsigned int j;
			u32 *src;

			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
542#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800543
/*
 * Create this namespace's routing /proc entries:
 *   /proc/net/rt_cache, /proc/net/stat/rt_cache and, when
 *   CONFIG_NET_CLS_ROUTE is set, /proc/net/rt_acct.
 * On failure, everything created so far is rolled back and -ENOMEM
 * is returned.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
			ip_rt_acct_read, NULL);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800575
576static void __net_exit ip_rt_do_proc_exit(struct net *net)
577{
578 remove_proc_entry("rt_cache", net->proc_net_stat);
579 remove_proc_entry("rt_cache", net->proc_net);
580 remove_proc_entry("rt_acct", net->proc_net);
581}
582
/* Per-network-namespace setup/teardown of the routing /proc files. */
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
587
/* Register the routing /proc entries for all network namespaces. */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
592
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800593#else
/* CONFIG_PROC_FS disabled: nothing to set up. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700598#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900599
/* Schedule RCU-deferred destruction of a route cache entry. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
604
/* Drop our reference, then schedule RCU-deferred destruction. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
610
Stephen Hemminger5969f712008-04-10 01:52:09 -0700611static inline int rt_fast_clean(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700612{
613 /* Kill broadcast/multicast entries very aggresively, if they
614 collide in hash table with more useful entries */
615 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800616 rth->fl.iif && rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700617}
618
/*
 * An entry is worth keeping if it was created by a redirect, wants
 * notification, or carries an explicit expiration timestamp.
 */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}
624
625static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
626{
627 unsigned long age;
628 int ret = 0;
629
630 if (atomic_read(&rth->u.dst.__refcnt))
631 goto out;
632
633 ret = 1;
634 if (rth->u.dst.expires &&
635 time_after_eq(jiffies, rth->u.dst.expires))
636 goto out;
637
638 age = jiffies - rth->u.dst.lastuse;
639 ret = 0;
640 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
641 (age <= tmo2 && rt_valuable(rth)))
642 goto out;
643 ret = 1;
644out: return ret;
645}
646
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	/* base: inverted age, so recently-used entries score higher;
	 * the top two bits are cleared here and used as flags below */
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	/* bit 31: redirected/notify entries or ones with an expiry */
	if (rt_valuable(rt))
		score |= (1<<31);

	/* bit 30: output routes, or non-bcast/mcast/local input routes */
	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
667
/*
 * Branch-free comparison of two flow keys for route cache lookup.
 * All fields are XORed and the differences ORed together, so the
 * accumulated value is non-zero iff any field differs ("== 0" means
 * equal).  The 16-bit load at &tos compares tos together with the
 * byte that follows it in the ip4 flow union (layout-dependent —
 * NOTE(review): verify against struct flowi if that layout changes).
 */
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
678
/* Cache entries only match if both belong to the same network namespace. */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
}
683
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		/* lock-free peek: skip empty buckets cheaply */
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		/* detach the whole chain under the bucket lock ... */
		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		/* ... then free it without holding the lock */
		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}
712
/*
 * Scan a fraction of the hash table per invocation and evict entries
 * that are stale (old generation), hard-expired, or aged out per
 * rt_may_expire().  The static 'rover' remembers where the last pass
 * stopped so successive calls eventually cover the whole table.
 */
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	u64 mult;

	/* goal = buckets per pass, scaled so the full table is visited
	 * roughly once every ip_rt_gc_timeout jiffies */
	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		/* lock-free peek to skip empty buckets */
		if (*rthp == NULL)
			continue;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			/* entries from an old generation are always freed */
			if (rth->rt_genid != atomic_read(&rt_genid)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
					/* halving tmo makes entries deeper in
					 * the chain expire more aggressively */
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
	}
	rover = i;
}
765
/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table, then
 * re-arm ourselves ip_rt_gc_interval jiffies later.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
775
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without reusing a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &rt_genid);
	/* NOTE(review): 'net' is unused here — invalidation is global,
	 * not per-namespace; confirm that is intended. */
}
789
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800790/*
Eric Dumazet29e75252008-01-31 17:05:09 -0800791 * delay < 0 : invalidate cache (fast : entries will be deleted later)
792 * delay >= 0 : invalidate & flush cache (can be long)
793 */
/*
 * delay < 0  : invalidate only (entries are reaped lazily later)
 * delay >= 0 : invalidate and flush synchronously (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay < 0)
		return;
	rt_do_flush(!in_softirq());
}
800
801/*
802 * We change rt_genid and let gc do the cleanup
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800803 */
/*
 * Timer callback: invalidate the cache for this namespace (by bumping
 * rt_genid) and re-arm the per-netns secret timer.  The timer argument
 * is the struct net pointer smuggled through an unsigned long.
 */
static void rt_secret_rebuild(unsigned long __net)
{
	struct net *net = (struct net *)__net;
	rt_cache_invalidate(net);
	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}
810
/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it shrinks to limit the cache size.
 */
823
/*
 * dst_ops->gc callback.  Walks the hash table evicting entries until the
 * cache shrinks back toward "equilibrium" (see the comment block above).
 * Keeps adaptive state in function-static variables:
 *   expire      - current expiration strength (halved on each failed pass)
 *   last_gc     - jiffies of the last real collection
 *   rover       - bucket where the previous scan stopped
 *   equilibrium - target number of cached entries
 * Returns 0 on success (or when skipped), 1 on unrecoverable overflow.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	/* Ran recently and the table is not overfull: nothing to do. */
	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		/* Under the elasticity limit: aim at the (possibly raised)
		 * equilibrium instead. */
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* Resume the scan at the bucket after where we stopped
		 * last time (rover). */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				/* Keep current-generation entries that are
				 * not yet expirable; halving tmo makes us
				 * progressively stricter down the chain. */
				if (rth->rt_genid == atomic_read(&rt_genid) &&
				    !rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Goal met: relax the expiration strength again for next time. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}
945
/*
 * Insert @rt into hash bucket @hash, or reuse an identical cached entry.
 * On success returns 0 and stores the live entry (either the reused one
 * or @rt itself) in *rp; @rt is dropped when an existing entry wins.
 * Along the way the chain is purged of stale-generation entries, and the
 * lowest-scoring unreferenced entry may be evicted to bound chain length.
 * Runs under the per-bucket spinlock; readers are lockfree (RCU), hence
 * the careful rcu_assign_pointer ordering below.
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable *rth, **rthp;
	unsigned long now;
	struct rtable *cand, **candp;
	u32 min_score;
	int chain_length;
	int attempts = !in_softirq();	/* one GC retry allowed in process context */

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		/* Entry from an older generation: free it in passing. */
		if (rth->rt_genid != atomic_read(&rt_genid)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Matching entry already cached: drop ours. */
			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		/* Track the lowest-scoring unreferenced entry as an
		 * eviction candidate. */
		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/* Force one maximally aggressive GC pass,
				 * then restore the tunables and retry. */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	/* Link the new entry at the head of the chain. */
	rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}
1074
/*
 * Attach an inet_peer for rt->rt_dst to @rt.  The (possibly allocating,
 * when @create is set) lookup is done outside the lock; the lock only
 * arbitrates the race to install it.  The loser's reference is released.
 */
void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;	/* consumed: do not drop below */
	}
	spin_unlock_bh(&rt_peer_lock);
	/* Someone else installed a peer first (or lookup failed with
	 * peer == NULL): release our redundant reference. */
	if (peer)
		inet_putpeer(peer);
}
1091
1092/*
1093 * Peer allocation may fail only in serious out-of-memory conditions. However
1094 * we still can generate some output.
1095 * Random ID selection looks a bit dangerous because we have no chances to
1096 * select ID being unique in a reasonable period of time.
1097 * But broken packet identifier may be better than no packet at all.
1098 */
/*
 * Fallback IP ID selection when no inet_peer is available (see the
 * comment above).  Chains secure_ip_id() outputs through the static
 * ip_fallback_id, mixed with the destination address; the lock protects
 * the read-modify-write of that shared state.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1111
1112void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1113{
1114 struct rtable *rt = (struct rtable *) dst;
1115
1116 if (rt) {
1117 if (rt->peer == NULL)
1118 rt_bind_peer(rt, 1);
1119
1120 /* If peer is attached to destination, it is never detached,
1121 so that we need not to grab a lock to dereference it.
1122 */
1123 if (rt->peer) {
1124 iph->id = htons(inet_getid(rt->peer, more));
1125 return;
1126 }
1127 } else
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001128 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001129 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001130
1131 ip_select_fb_ident(iph);
1132}
1133
/*
 * Unlink @rt from hash bucket @hash and free it.  While walking the
 * chain under the bucket lock we also reap any entries whose generation
 * no longer matches rt_genid.  The caller's reference is dropped here.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		/* Remove the target entry and, opportunistically, any
		 * stale-generation entry encountered on the way. */
		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
			*rthp = aux->u.dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->u.dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1151
/*
 * Process an ICMP redirect: for every cached route through @old_gw on
 * @dev matching (daddr, saddr) -- also with wildcarded source/ifindex --
 * clone the entry with rt_gateway set to @new_gw and replace the
 * original in the hash table.  Redirects failing the sanity checks
 * (bogus gateway, policy, off-link target) are logged and ignored.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Reject obviously invalid new gateways. */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
	    || ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* Non-shared media: new gateway must be on the same
		 * subnet as the old one. */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	/* Try both exact and wildcard (0) source address / ifindex keys. */
	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						atomic_read(&rt_genid));

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				/* Skip chain entries for other flows,
				 * other generations, other namespaces. */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rth->rt_genid != atomic_read(&rt_genid) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				/* Pin the old entry, then leave the RCU
				 * section before allocating. */
				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use = 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child = NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete = 0;
				rt->u.dst.lastuse = jiffies;
				rt->u.dst.path = &rt->u.dst;
				rt->u.dst.neighbour = NULL;
				rt->u.dst.hh = NULL;
				rt->u.dst.xfrm = NULL;
				rt->rt_genid = atomic_read(&rt_genid);
				rt->rt_flags |= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				/* Drop the clone if we cannot resolve the
				 * new gateway's neighbour entry. */
				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				/* Swap the old entry for the clone. */
				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
			NIPQUAD_FMT " ignored.\n"
			"  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}
1287
/*
 * dst_ops->negative_advice callback: the caller reports that this route
 * is performing badly.  Obsolete entries just lose our reference;
 * redirected or expiring entries are removed from the cache outright.
 * Returns the dst to keep using, or NULL when it was discarded.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						atomic_read(&rt_genid));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
					  NIPQUAD_FMT "/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
1313
1314/*
1315 * Algorithm:
1316 * 1. The first ip_rt_redirect_number redirects are sent
1317 * with exponential backoff, then we stop sending them at all,
1318 * assuming that the host ignores our redirects.
1319 * 2. If we did not see packets requiring redirects
1320 * during ip_rt_redirect_silence, we assume that the host
1321 * forgot redirected route and start to send redirects again.
1322 *
1323 * This algorithm is much cheaper and more intelligent than dumb load limiting
1324 * in icmp.c.
1325 *
1326 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1327 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1328 */
1329
/*
 * Send an ICMP redirect for @skb, rate limited per the algorithm
 * described above: exponential backoff via u.dst.rate_tokens (the count
 * of redirects already sent), giving up after ip_rt_redirect_number and
 * resetting after ip_rt_redirect_silence of quiet.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb->rtable;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		/* Log exactly once, when we hit the give-up threshold. */
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
				"redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}
1378
1379static int ip_error(struct sk_buff *skb)
1380{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001381 struct rtable *rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001382 unsigned long now;
1383 int code;
1384
1385 switch (rt->u.dst.error) {
1386 case EINVAL:
1387 default:
1388 goto out;
1389 case EHOSTUNREACH:
1390 code = ICMP_HOST_UNREACH;
1391 break;
1392 case ENETUNREACH:
1393 code = ICMP_NET_UNREACH;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08001394 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001395 break;
1396 case EACCES:
1397 code = ICMP_PKT_FILTERED;
1398 break;
1399 }
1400
1401 now = jiffies;
1402 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1403 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1404 rt->u.dst.rate_tokens = ip_rt_error_burst;
1405 rt->u.dst.rate_last = now;
1406 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1407 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1408 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1409 }
1410
1411out: kfree_skb(skb);
1412 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001413}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001414
1415/*
1416 * The last two values are not from the RFC but
1417 * are needed for AMPRnet AX.25 paths.
1418 */
1419
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest PMTU plateau strictly below @old_mtu (RFC 1191
 * style search), falling back to the IPv4 minimum of 68 when @old_mtu
 * is at or below the smallest plateau.
 *
 * Uses a size_t index so the comparison against the (unsigned)
 * array-length expression is not a signed/unsigned mismatch.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	size_t i;

	for (i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
1432
/*
 * Handle an incoming ICMP fragmentation-needed for the packet headed
 * by @iph: lower the cached path MTU of every matching route (exact and
 * wildcarded source / ifindex keys) to @new_mtu, guessing a plateau
 * value when the advertised MTU is absent or implausible.  Returns the
 * MTU actually recorded, or @new_mtu if no cache entry matched, or 0
 * when PMTU discovery is disabled.
 */
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int ikeys[2] = { dev->ifindex, 0 };
	__be32 skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						atomic_read(&rt_genid));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->u.dst.rt_next)) {
				unsigned short mtu = new_mtu;

				/* Only touch matching, current-generation,
				 * same-namespace entries whose MTU metric
				 * is not locked. */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->u.dst.dev), net) ||
				    rth->rt_genid != atomic_read(&rt_genid))
					continue;

				/* Advertised MTU missing or not an actual
				 * reduction: fall back to plateau search. */
				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
					if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
						dst_confirm(&rth->u.dst);
						/* Clamp to the floor and lock
						 * the metric when we do. */
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}
1499
1500static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1501{
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001502 if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001503 !(dst_metric_locked(dst, RTAX_MTU))) {
1504 if (mtu < ip_rt_min_pmtu) {
1505 mtu = ip_rt_min_pmtu;
1506 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1507 }
1508 dst->metrics[RTAX_MTU-1] = mtu;
1509 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001510 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511 }
1512}
1513
/*
 * dst_ops->check hook.  Always returns NULL, i.e. a cached IPv4 route
 * is never revalidated in place by cookie; staleness is handled via the
 * rt_genid comparisons in the lookup and insertion paths instead.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
1518
1519static void ipv4_dst_destroy(struct dst_entry *dst)
1520{
1521 struct rtable *rt = (struct rtable *) dst;
1522 struct inet_peer *peer = rt->peer;
1523 struct in_device *idev = rt->idev;
1524
1525 if (peer) {
1526 rt->peer = NULL;
1527 inet_putpeer(peer);
1528 }
1529
1530 if (idev) {
1531 rt->idev = NULL;
1532 in_dev_put(idev);
1533 }
1534}
1535
1536static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1537 int how)
1538{
1539 struct rtable *rt = (struct rtable *) dst;
1540 struct in_device *idev = rt->idev;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001541 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
Denis V. Lunev5a3e55d2007-12-07 00:38:10 -08001542 struct in_device *loopback_idev =
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001543 in_dev_get(dev_net(dev)->loopback_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001544 if (loopback_idev) {
1545 rt->idev = loopback_idev;
1546 in_dev_put(idev);
1547 }
1548 }
1549}
1550
1551static void ipv4_link_failure(struct sk_buff *skb)
1552{
1553 struct rtable *rt;
1554
1555 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1556
Eric Dumazetee6b9672008-03-05 18:30:47 -08001557 rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001558 if (rt)
1559 dst_set_expires(&rt->u.dst, 0);
1560}
1561
/*
 * Catch-all output handler installed on routes that must never transmit
 * (e.g. it is set as u.dst.output in ip_route_input_mc): log the
 * offending source/destination and drop the packet.
 */
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
1570
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1579
1580void ip_rt_get_source(u8 *addr, struct rtable *rt)
1581{
Al Viroa61ced52006-09-26 21:27:54 -07001582 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001583 struct fib_result res;
1584
1585 if (rt->fl.iif == 0)
1586 src = rt->rt_src;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001587 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001588 src = FIB_RES_PREFSRC(res);
1589 fib_res_put(&res);
1590 } else
1591 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1592 RT_SCOPE_UNIVERSE);
1593 memcpy(addr, &src, 4);
1594}
1595
1596#ifdef CONFIG_NET_CLS_ROUTE
1597static void set_class_tag(struct rtable *rt, u32 tag)
1598{
1599 if (!(rt->u.dst.tclassid & 0xFFFF))
1600 rt->u.dst.tclassid |= tag & 0xFFFF;
1601 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1602 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1603}
1604#endif
1605
/*
 * Fill in a freshly built rtable from the FIB lookup result @res:
 * gateway, metrics (with sane defaults and clamping), classification
 * tags and route type.  @itag is the input classification tag merged in
 * under CONFIG_NET_CLS_ROUTE.
 */
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		/* Use the FIB gateway only when it is directly reachable. */
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			/* No configured MTU: inherit the device MTU... */
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			/* ...but cap a locked metric at 576 for gatewayed
			 * routes over larger-MTU devices. */
			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	/* Apply defaults and clamps to the remaining metrics. */
	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
1647
/*
 * Input-path route resolution for multicast destinations.
 * Validates the source address, builds a multicast rtable entry
 * (delivered locally if @our, and/or handed to ip_mr_input when
 * multicast forwarding is enabled), and inserts it into the cache.
 * Returns 0 on success or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Source must be a plausible unicast address in an IP frame. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 sources are tolerated only for link-local
		 * multicast groups (e.g. senders that have no address yet).
		 */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	/* Output on an input-only multicast route is a bug. */
	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	/* Stamp with the current generation so cache flushes (which bump
	 * rt_genid) invalidate this entry.
	 */
	rth->rt_genid	= atomic_read(&rt_genid);
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		rth->u.dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups go through the multicast router. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex, atomic_read(&rt_genid));
	return rt_intern_hash(hash, rth, &skb->rtable);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}
1727
1728
/*
 * Account and (when route verbosity is on) log a packet whose source
 * address failed validation ("martian").  Only the link-layer header
 * can be trusted for diagnosis, so it is dumped in hex.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
			NIPQUAD_FMT", on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			/* Dump the link-layer header as colon-separated hex. */
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
1759
/*
 * Build (but do not cache) a forwarding rtable entry for an input
 * packet that resolved to a unicast FIB result.  On success *result
 * holds the new entry with refcount 1.  Returns 0 or negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{

	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		/* The FIB gave us a nexthop device with no inet config;
		 * should not happen for a unicast result.
		 */
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	/* Verify the source via reverse lookup; also yields spec_dst
	 * and the classid tag.  err > 0 means src is reachable but not
	 * via the best route (direct source).
	 */
	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Forwarding back out the input device suggests the sender could
	 * reach the nexthop directly: consider sending an ICMP redirect.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	/* Provisional; rt_set_nexthop() below may replace with FIB gateway. */
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;
	rth->rt_genid = atomic_read(&rt_genid);

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001855
/*
 * Create a forwarding cache entry for an input packet: pick the
 * nexthop (multipath-aware), build the rtable via __mkroute_input(),
 * and insert it into the route cache hashed on (daddr, saddr, iif).
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* Multiple nexthops and no forced oif: let the FIB pick one. */
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif, atomic_read(&rt_genid));
	return rt_intern_hash(hash, rth, &skb->rtable);
}
1880
Linus Torvalds1da177e2005-04-16 15:20:36 -07001881/*
1882 * NOTE. We drop all the packets that has local source
1883 * addresses, because every properly looped back packet
1884 * must have correct destination already attached by output routine.
1885 *
1886 * Such approach solves two big problems:
1887 * 1. Not simplex devices are handled properly.
1888 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1889 */
1890
/*
 * Slow-path input route resolution (cache miss).  Screens out martian
 * source/destination addresses, performs the FIB lookup, and builds
 * either a local/broadcast delivery entry (inline, at local_input:)
 * or a forwarding entry (via ip_mkroute_input()).  The new entry is
 * interned into the cache and attached to skb->rtable.
 * Returns 0 or a negative errno.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	/* Limited broadcast, or all-zero src+dst (e.g. DHCP discover). */
	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
	    ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(net, &fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     net->loopback_dev->ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
	/* Common exit: drop the in_dev and FIB result references. */
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Build a local-delivery cache entry (also used for the
	 * RTN_UNREACHABLE entry created at no_route:).
	 */
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;
	rth->rt_genid = atomic_read(&rt_genid);

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= net->loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		/* Cache the failure: deliver to ip_error with the errno. */
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif, atomic_read(&rt_genid));
	err = rt_intern_hash(hash, rth, &skb->rtable);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
			NIPQUAD_FMT ", dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
2073
/*
 * Input route lookup entry point.  First probes the route cache under
 * RCU; on a hit, attaches the cached entry to the skb.  On a miss,
 * multicast destinations go to ip_route_input_mc() (when the packet is
 * for us or multicast forwarding applies) and everything else falls
 * through to the slow path.  Returns 0 or a negative errno.
 */
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;

	net = dev_net(dev);
	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, atomic_read(&rt_genid));

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		/* Branch-free key compare: XOR/OR all fields, match iff 0.
		 * rth->fl.oif must itself be 0 for an input route.
		 */
		if (((rth->fl.fl4_dst ^ daddr) |
		     (rth->fl.fl4_src ^ saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    rth->rt_genid == atomic_read(&rt_genid)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->rtable = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
				ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!ipv4_is_local_multicast(daddr) &&
				IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
2141
/*
 * Build (but do not cache) an output rtable entry from a resolved FIB
 * result and flow.  Classifies the destination (broadcast/multicast),
 * sets the RTCF_* flags and input/output handlers accordingly, and
 * fills in nexthop data via rt_set_nexthop().  On success *result
 * holds the new entry with refcount 1.  May clear res->fi (dropping
 * its reference) for broadcast and short-prefix multicast results.
 */
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	/* A loopback source is only valid out a loopback device. */
	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get work reference to inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		/* Only deliver locally if we are a member of the group. */
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		   default one, but do not gateway in this case.
		   Yes, it is hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	/* Cache key is the caller's original flow (oldflp); the resolved
	 * flow (fl) supplies the actual addresses used.
	 */
	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark    = oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be hold by the routing
	   cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->u.dst.output=ip_output;
	rth->rt_genid = atomic_read(&rt_genid);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release work reference to inet device */
	in_dev_put(in_dev);

	return err;
}
2261
/*
 * Build an output cache entry via __mkroute_output() and, on success,
 * intern it into the route cache keyed on the caller's original flow
 * (daddr, saddr, oif).  *rp receives the cached entry.
 */
static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       atomic_read(&rt_genid));
		err = rt_intern_hash(hash, rth, rp);
	}

	return err;
}
2280
Linus Torvalds1da177e2005-04-16 15:20:36 -07002281/*
2282 * Major route resolver routine.
2283 */
2284
/*
 * Slow-path output route resolution (cache miss).  Validates any
 * caller-supplied source address, resolves the output device and
 * source address from the flow (handling the multicast/limited-
 * broadcast special cases, forced oif, and empty destination), does
 * the FIB lookup, applies multipath / default-route selection, and
 * finally builds and caches the entry via ip_mkroute_output().
 * Returns 0 or a negative errno in *rp / return value.
 */
static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;


	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		/* Source must be a unicast address. */
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(net, oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		/* Pick a source address appropriate for the destination. */
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		/* No destination: route to ourselves over loopback. */
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	/* Use the device of the selected FIB nexthop. */
	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}
2479
/*
 * Fast-path output route lookup: probe the route cache before falling
 * back to the full FIB-based slow path.
 *
 * @net: network namespace the route must belong to
 * @rp:  on success, *rp is set to a cache entry whose refcount/usage has
 *       been bumped via dst_use(); caller owns the reference
 * @flp: flow key (dst, src, oif, mark, tos) to match
 *
 * Returns 0 on success, otherwise the error from ip_route_output_slow().
 */
int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	/* rt_genid is folded into the hash so a cache flush (genid bump)
	 * implicitly invalidates all previously hashed entries. */
	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif,
		       atomic_read(&rt_genid));

	/* Chains are modified under BH; _bh flavour keeps us safe here. */
	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		/* iif == 0 restricts the match to output routes. */
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    /* TOS must agree in the routing-relevant bits plus the
		     * RTO_ONLINK flag; other tos bits are ignored. */
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    rth->rt_genid == atomic_read(&rt_genid)) {
			/* Take a reference and refresh last-use stamp
			 * before leaving the RCU section. */
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	/* Cache miss: do the real FIB lookup and (normally) insert. */
	return ip_route_output_slow(net, rp, flp);
}
2513
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002514EXPORT_SYMBOL_GPL(__ip_route_output_key);
2515
/*
 * PMTU update hook for blackhole dst entries: intentionally a no-op,
 * since a blackhole route discards traffic and carries no usable MTU
 * state worth updating.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2519
/*
 * dst_ops for "blackhole" routes created by ipv4_dst_blackhole().
 * Reuses the regular IPv4 destroy/check callbacks but stubs out PMTU
 * updates.  .kmem_cachep is filled in at boot by ip_rt_init().
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	__constant_htons(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_dst_check,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.entry_size		=	sizeof(struct rtable),
	.entries		=	ATOMIC_INIT(0),
};
2529
2530
/*
 * Replace *rp with a "blackhole" clone of itself: a route that looks
 * like the original (same flow, metrics, addresses, peer) but whose
 * input/output handlers silently discard packets.  Used when an xfrm
 * lookup returns -EREMOTE (e.g. larval IPsec state) so callers hold a
 * valid-but-inert dst instead of an error.
 *
 * Always releases the caller's reference on the original *rp.
 * Returns 0 on success, -ENOMEM if the clone could not be allocated
 * (in which case *rp is set to NULL).
 */
static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		/* Both directions discard: that is the "blackhole". */
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		/* Take our own references on everything we copy. */
		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = atomic_read(&rt_genid);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		/* Not hashed into the route cache; dst_free() arranges for
		 * destruction once the refcount taken above is dropped. */
		dst_free(new);
	}

	/* Drop the caller's reference on the original route. */
	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}
2574
/*
 * Output route lookup with xfrm (IPsec) policy resolution.
 *
 * @net:   network namespace
 * @rp:    result route; on success caller owns a reference
 * @flp:   flow key; fl4_src/fl4_dst may be filled in from the routing
 *         result as a side effect when the protocol is set
 * @sk:    socket doing the lookup (may be NULL), passed to xfrm
 * @flags: non-zero means the xfrm lookup may sleep waiting for SAs
 *
 * Returns 0 on success or a negative errno.
 */
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	/* Only consult xfrm when a transport protocol is specified. */
	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		/* -EREMOTE: SA acquisition in progress; hand back a
		 * blackhole route instead of failing the caller. */
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(rp, flp);

		return err;
	}

	return 0;
}
2598
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002599EXPORT_SYMBOL_GPL(ip_route_output_flow);
2600
/* Convenience wrapper: output route lookup without a socket and with
 * non-blocking xfrm resolution. */
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
2605
/*
 * Serialize the route attached to @skb (skb->rtable) into a netlink
 * RTM_* message appended to that same skb.
 *
 * @pid/@seq/@event/@flags: netlink header fields for nlmsg_put()
 * @nowait: passed through to ipmr_get_route(); non-zero forbids sleeping
 *
 * Returns the length added on success, 0 if an ipmr queue-full case was
 * handled, or -EMSGSIZE when the skb ran out of tailroom.  Note the
 * NLA_PUT_* macros jump to nla_put_failure on lack of space.
 */
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb->rtable;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	/* Fixed rtmsg header: cache entries are always host routes
	 * (/32 dst) cloned from the main table. */
	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	/* Preferred source: for input routes report the specific dst,
	 * otherwise only report rt_src when it differs from the key. */
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	/* A gateway equal to the destination means a direct route. */
	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		/* Multicast forwarding: resolution may be deferred to
		 * ipmr; its return value drives the error handling. */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2699
/*
 * RTM_GETROUTE handler: resolve a route for the (src, dst, tos, iif/oif)
 * given in the netlink request and unicast the result back.
 *
 * With RTA_IIF set, the lookup is done as if a packet arrived on that
 * device (ip_route_input on a dummy skb); otherwise a normal output
 * lookup is performed.  Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	/* This skb doubles as the fake input packet and, later, as the
	 * reply message buffer. */
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		/* ip_route_input expects BH context (route cache). */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		/* On success the resolved route is attached to the skb;
		 * a dst-level error is reported as a positive u.dst.error. */
		rt = skb->rtable;
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	/* Hand the route reference to the skb; rt_fill_info reads it
	 * from there. */
	skb->rtable = rt;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	/* rtnl_unicast consumes the skb regardless of outcome. */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
2790
/*
 * Netlink dump callback for the route cache: walk every hash bucket and
 * emit one RTM_NEWROUTE message per live cache entry in this namespace.
 *
 * Resumability: cb->args[0]/args[1] persist the (bucket, index) cursor
 * across invocations, so a partially filled skb can be continued later.
 * Returns skb->len as required by the netlink dump protocol.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++) {
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			/* Skip entries from other namespaces and entries
			 * already emitted in a previous pass. */
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			/* Stale generation: flushed, not yet reclaimed. */
			if (rt->rt_genid != atomic_read(&rt_genid))
				continue;
			/* Pin the entry while rt_fill_info looks at it. */
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				/* skb full: save cursor and stop. */
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
		/* Only the first resumed bucket honours s_idx. */
		s_idx = 0;
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
2831
/* Multicast list changed on @in_dev: flush this namespace's route cache
 * immediately (delay 0) so cached multicast decisions are re-evaluated. */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
2836
2837#ifdef CONFIG_SYSCTL
/*
 * proc handler for net.ipv4.route.flush (write-only).
 *
 * Writing an integer N flushes the route cache with delay N.  The table
 * entry has no permanent .data, so the handler temporarily points
 * ctl->data at an on-stack int and lets proc_dointvec() do the parsing;
 * the static mutex serializes concurrent writers around that swap.
 * ctl->extra1 carries the struct net this table instance belongs to
 * (set up in sysctl_route_net_init()).  Reads return -EINVAL.
 */
static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
					struct file *filp, void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		struct net *net;
		static DEFINE_MUTEX(flush_mutex);

		mutex_lock(&flush_mutex);
		ctl->data = &flush_delay;
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		ctl->data = NULL;
		mutex_unlock(&flush_mutex);

		net = (struct net *)ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
2860
2861static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2862 int __user *name,
2863 int nlen,
2864 void __user *oldval,
2865 size_t __user *oldlenp,
2866 void __user *newval,
Alexey Dobriyan1f29bcd2006-12-10 02:19:10 -08002867 size_t newlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002868{
2869 int delay;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07002870 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002871 if (newlen != sizeof(int))
2872 return -EINVAL;
2873 if (get_user(delay, (int __user *)newval))
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002874 return -EFAULT;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07002875 net = (struct net *)table->extra1;
2876 rt_cache_flush(net, delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002877 return 0;
2878}
2879
/*
 * Global (not per-namespace) sysctl tunables under net.ipv4.route.
 * All entries point at file-scope ip_rt_* variables; jiffies-valued
 * knobs use the jiffies/ms_jiffies handlers so userspace sees seconds
 * or milliseconds.  The per-namespace "flush" entry lives in
 * ipv4_route_flush_table, not here.
 */
ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }	/* sentinel */
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003019
/* Registration path for the per-namespace flush table:
 * /proc/sys/net/ipv4/route (sentinel-terminated). */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", .ctl_name = CTL_NET, },
	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
	{ },
};
3026
3027
/*
 * Template for the per-namespace "flush" sysctl (write-only, 0200).
 * sysctl_route_net_init() duplicates this table for non-init
 * namespaces and stashes the owning struct net in .extra1.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{ .ctl_name = 0 },
};
3039
/*
 * Per-namespace setup: register the route "flush" sysctl for @net.
 * The init namespace uses the static template directly; other
 * namespaces get a kmemdup'd copy so each table can carry its own
 * struct net in extra1.  Returns 0 or -ENOMEM (goto-based unwind).
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (net != &init_net) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	/* Let the flush handler find the owning namespace. */
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	/* Only free the copy, never the static template. */
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
3064
3065static __net_exit void sysctl_route_net_exit(struct net *net)
3066{
3067 struct ctl_table *tbl;
3068
3069 tbl = net->ipv4.route_hdr->ctl_table_arg;
3070 unregister_net_sysctl_table(net->ipv4.route_hdr);
3071 BUG_ON(tbl == ipv4_route_flush_table);
3072 kfree(tbl);
3073}
3074
/* pernet hooks wiring the per-namespace flush sysctl to netns
 * creation/destruction. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003079#endif
3080
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003081
/*
 * Per-namespace setup of the periodic secret-rebuild timer, which
 * re-keys the route-cache hash (rt_secret_rebuild).  The first expiry
 * is randomized within [interval, 2*interval) so namespaces don't all
 * fire in lockstep.  Deferrable: may be delayed on an idle CPU.
 */
static __net_init int rt_secret_timer_init(struct net *net)
{
	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
	net->ipv4.rt_secret_timer.data = (unsigned long)net;
	init_timer_deferrable(&net->ipv4.rt_secret_timer);

	net->ipv4.rt_secret_timer.expires =
		jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&net->ipv4.rt_secret_timer);
	return 0;
}
3094
/* Per-namespace teardown: stop the secret timer, waiting for a running
 * handler to finish before the namespace goes away. */
static __net_exit void rt_secret_timer_exit(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
}
3099
/* pernet hooks for the per-namespace secret-rebuild timer. */
static __net_initdata struct pernet_operations rt_secret_timer_ops = {
	.init = rt_secret_timer_init,
	.exit = rt_secret_timer_exit,
};
3104
3105
Linus Torvalds1da177e2005-04-16 15:20:36 -07003106#ifdef CONFIG_NET_CLS_ROUTE
/* Per-cpu route classifier accounting array; allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003108#endif /* CONFIG_NET_CLS_ROUTE */
3109
/* Boot-time override for the route-cache hash size, e.g.
 * "rhash_entries=65536" on the kernel command line; consumed by
 * alloc_large_system_hash() in ip_rt_init(). */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
3119
/*
 * Boot-time initialization of the IPv4 routing subsystem: seed the
 * cache generation counter, create the dst slab cache and the route
 * hash table, size the GC parameters from the table size, bring up
 * devinet/FIB, start the periodic GC and secret-rebuild machinery, and
 * register the netlink/proc/sysctl interfaces.  Ordering matters:
 * allocations come first, interface registration last.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

	/* Seed rt_genid so cache hash placement isn't predictable
	 * across boots. */
	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7))));

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole routes share the same slab as regular ones. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	/* Hash size: rhash_entries boot param if given, else scaled
	 * from physical memory (2^15 or 2^17 buckets). */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC threshold and max cache size derive from table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers, started at system startup tend
	   to synchronize. Perturb it a bit.
	 */
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* Failures below are reported but not fatal: routing still
	 * works without the secret timer or the proc files. */
	if (register_pernet_subsys(&rt_secret_timer_ops))
		printk(KERN_ERR "Unable to setup rt_secret_timer\n");

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	return rc;
}
3180
3181EXPORT_SYMBOL(__ip_select_ident);
3182EXPORT_SYMBOL(ip_route_input);
3183EXPORT_SYMBOL(ip_route_output_key);