/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
15
Linus Torvalds1da177e2005-04-16 15:20:36 -070016#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/jiffies.h>
22#include <linux/mm.h>
23#include <linux/string.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/errno.h>
27#include <linux/in.h>
28#include <linux/inet.h>
Arnaldo Carvalho de Melo14c85022005-12-27 02:43:12 -020029#include <linux/inetdevice.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070030#include <linux/netdevice.h>
31#include <linux/if_arp.h>
32#include <linux/proc_fs.h>
33#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include <linux/init.h>
35
Arnaldo Carvalho de Melo14c85022005-12-27 02:43:12 -020036#include <net/arp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070037#include <net/ip.h>
38#include <net/protocol.h>
39#include <net/route.h>
40#include <net/tcp.h>
41#include <net/sock.h>
42#include <net/ip_fib.h>
Thomas Graff21c7bc2006-08-15 00:34:17 -070043#include <net/netlink.h>
Thomas Graf4e902c52006-08-17 18:14:52 -070044#include <net/nexthop.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070045
46#include "fib_lookup.h"
47
Stephen Hemminger832b4c52006-08-29 16:48:09 -070048static DEFINE_SPINLOCK(fib_info_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -070049static struct hlist_head *fib_info_hash;
50static struct hlist_head *fib_info_laddrhash;
51static unsigned int fib_hash_size;
52static unsigned int fib_info_cnt;
53
54#define DEVINDEX_HASHBITS 8
55#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
56static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
57
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) / change_nexthops(fi) open a brace, declare the loop
 * variables "nhsel" (index) and "nh" (current nexthop, const for
 * for_nexthops, writable for change_nexthops) and start a for loop.
 * The statement or block that follows is the loop body, and
 * endfor_nexthops(fi) supplies the closing brace.  "nhsel" and "nh" are
 * therefore visible until endfor_nexthops().
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nh;					\
	for (nhsel = 0, nh = (struct fib_nh *)((fi)->fib_nh);		\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath only the first nexthop is visited: the loop runs
 * exactly once and "nh" is never advanced.
 * Hope, that gcc will optimize it to get rid of dummy loop.
 */

#define for_nexthops(fi) {						\
	int nhsel = 0; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel = 0;							\
	struct fib_nh *nh = (struct fib_nh *)((fi)->fib_nh);		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
81
82
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090083static const struct
Linus Torvalds1da177e2005-04-16 15:20:36 -070084{
85 int error;
86 u8 scope;
Thomas Grafa0ee18b2007-03-24 20:32:54 -070087} fib_props[RTN_MAX + 1] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090088 {
Linus Torvalds1da177e2005-04-16 15:20:36 -070089 .error = 0,
90 .scope = RT_SCOPE_NOWHERE,
91 }, /* RTN_UNSPEC */
92 {
93 .error = 0,
94 .scope = RT_SCOPE_UNIVERSE,
95 }, /* RTN_UNICAST */
96 {
97 .error = 0,
98 .scope = RT_SCOPE_HOST,
99 }, /* RTN_LOCAL */
100 {
101 .error = 0,
102 .scope = RT_SCOPE_LINK,
103 }, /* RTN_BROADCAST */
104 {
105 .error = 0,
106 .scope = RT_SCOPE_LINK,
107 }, /* RTN_ANYCAST */
108 {
109 .error = 0,
110 .scope = RT_SCOPE_UNIVERSE,
111 }, /* RTN_MULTICAST */
112 {
113 .error = -EINVAL,
114 .scope = RT_SCOPE_UNIVERSE,
115 }, /* RTN_BLACKHOLE */
116 {
117 .error = -EHOSTUNREACH,
118 .scope = RT_SCOPE_UNIVERSE,
119 }, /* RTN_UNREACHABLE */
120 {
121 .error = -EACCES,
122 .scope = RT_SCOPE_UNIVERSE,
123 }, /* RTN_PROHIBIT */
124 {
125 .error = -EAGAIN,
126 .scope = RT_SCOPE_UNIVERSE,
127 }, /* RTN_THROW */
128 {
129 .error = -EINVAL,
130 .scope = RT_SCOPE_NOWHERE,
131 }, /* RTN_NAT */
132 {
133 .error = -EINVAL,
134 .scope = RT_SCOPE_NOWHERE,
135 }, /* RTN_XRESOLVE */
136};
137
138
139/* Release a nexthop info record */
140
141void free_fib_info(struct fib_info *fi)
142{
143 if (fi->fib_dead == 0) {
Stephen Hemmingera6db9012008-01-12 20:58:35 -0800144 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700145 return;
146 }
147 change_nexthops(fi) {
148 if (nh->nh_dev)
149 dev_put(nh->nh_dev);
150 nh->nh_dev = NULL;
151 } endfor_nexthops(fi);
152 fib_info_cnt--;
Denis V. Lunev57d7a602008-04-16 02:00:50 -0700153 release_net(fi->fib_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154 kfree(fi);
155}
156
157void fib_release_info(struct fib_info *fi)
158{
Stephen Hemminger832b4c52006-08-29 16:48:09 -0700159 spin_lock_bh(&fib_info_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700160 if (fi && --fi->fib_treeref == 0) {
161 hlist_del(&fi->fib_hash);
162 if (fi->fib_prefsrc)
163 hlist_del(&fi->fib_lhash);
164 change_nexthops(fi) {
165 if (!nh->nh_dev)
166 continue;
167 hlist_del(&nh->nh_hash);
168 } endfor_nexthops(fi)
169 fi->fib_dead = 1;
170 fib_info_put(fi);
171 }
Stephen Hemminger832b4c52006-08-29 16:48:09 -0700172 spin_unlock_bh(&fib_info_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700173}
174
175static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
176{
177 const struct fib_nh *onh = ofi->fib_nh;
178
179 for_nexthops(fi) {
180 if (nh->nh_oif != onh->nh_oif ||
181 nh->nh_gw != onh->nh_gw ||
182 nh->nh_scope != onh->nh_scope ||
183#ifdef CONFIG_IP_ROUTE_MULTIPATH
184 nh->nh_weight != onh->nh_weight ||
185#endif
186#ifdef CONFIG_NET_CLS_ROUTE
187 nh->nh_tclassid != onh->nh_tclassid ||
188#endif
189 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
190 return -1;
191 onh++;
192 } endfor_nexthops(fi);
193 return 0;
194}
195
David S. Miller88ebc722008-01-12 21:49:01 -0800196static inline unsigned int fib_devindex_hashfn(unsigned int val)
197{
198 unsigned int mask = DEVINDEX_HASHSIZE - 1;
199
200 return (val ^
201 (val >> DEVINDEX_HASHBITS) ^
202 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
203}
204
Linus Torvalds1da177e2005-04-16 15:20:36 -0700205static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
206{
207 unsigned int mask = (fib_hash_size - 1);
208 unsigned int val = fi->fib_nhs;
209
210 val ^= fi->fib_protocol;
Al Viro81f7bf62006-09-27 18:40:00 -0700211 val ^= (__force u32)fi->fib_prefsrc;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700212 val ^= fi->fib_priority;
David S. Miller88ebc722008-01-12 21:49:01 -0800213 for_nexthops(fi) {
214 val ^= fib_devindex_hashfn(nh->nh_oif);
215 } endfor_nexthops(fi)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700216
217 return (val ^ (val >> 7) ^ (val >> 12)) & mask;
218}
219
220static struct fib_info *fib_find_info(const struct fib_info *nfi)
221{
222 struct hlist_head *head;
223 struct hlist_node *node;
224 struct fib_info *fi;
225 unsigned int hash;
226
227 hash = fib_info_hashfn(nfi);
228 head = &fib_info_hash[hash];
229
230 hlist_for_each_entry(fi, node, head, fib_hash) {
Denis V. Lunev4814bdb2008-01-31 18:50:07 -0800231 if (fi->fib_net != nfi->fib_net)
232 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700233 if (fi->fib_nhs != nfi->fib_nhs)
234 continue;
235 if (nfi->fib_protocol == fi->fib_protocol &&
236 nfi->fib_prefsrc == fi->fib_prefsrc &&
237 nfi->fib_priority == fi->fib_priority &&
238 memcmp(nfi->fib_metrics, fi->fib_metrics,
239 sizeof(fi->fib_metrics)) == 0 &&
240 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
241 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
242 return fi;
243 }
244
245 return NULL;
246}
247
Linus Torvalds1da177e2005-04-16 15:20:36 -0700248/* Check, that the gateway is already configured.
249 Used only by redirect accept routine.
250 */
251
Al Virod878e72e2006-09-26 22:18:13 -0700252int ip_fib_check_default(__be32 gw, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700253{
254 struct hlist_head *head;
255 struct hlist_node *node;
256 struct fib_nh *nh;
257 unsigned int hash;
258
Stephen Hemminger832b4c52006-08-29 16:48:09 -0700259 spin_lock(&fib_info_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260
261 hash = fib_devindex_hashfn(dev->ifindex);
262 head = &fib_info_devhash[hash];
263 hlist_for_each_entry(nh, node, head, nh_hash) {
264 if (nh->nh_dev == dev &&
265 nh->nh_gw == gw &&
266 !(nh->nh_flags&RTNH_F_DEAD)) {
Stephen Hemminger832b4c52006-08-29 16:48:09 -0700267 spin_unlock(&fib_info_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268 return 0;
269 }
270 }
271
Stephen Hemminger832b4c52006-08-29 16:48:09 -0700272 spin_unlock(&fib_info_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273
274 return -1;
275}
276
Thomas Graf339bf982006-11-10 14:10:15 -0800277static inline size_t fib_nlmsg_size(struct fib_info *fi)
278{
279 size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
280 + nla_total_size(4) /* RTA_TABLE */
281 + nla_total_size(4) /* RTA_DST */
282 + nla_total_size(4) /* RTA_PRIORITY */
283 + nla_total_size(4); /* RTA_PREFSRC */
284
285 /* space for nested metrics */
286 payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
287
288 if (fi->fib_nhs) {
289 /* Also handles the special case fib_nhs == 1 */
290
291 /* each nexthop is packed in an attribute */
292 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
293
294 /* may contain flow and gateway attribute */
295 nhsize += 2 * nla_total_size(4);
296
297 /* all nexthops are packed in a nested attribute */
298 payload += nla_total_size(fi->fib_nhs * nhsize);
299 }
300
301 return payload;
302}
303
Al Viro81f7bf62006-09-27 18:40:00 -0700304void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
Milan Kocianb8f55832007-05-23 14:55:06 -0700305 int dst_len, u32 tb_id, struct nl_info *info,
306 unsigned int nlm_flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700307{
308 struct sk_buff *skb;
Thomas Graf4e902c52006-08-17 18:14:52 -0700309 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
Thomas Graff21c7bc2006-08-15 00:34:17 -0700310 int err = -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700311
Thomas Graf339bf982006-11-10 14:10:15 -0800312 skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
Thomas Graff21c7bc2006-08-15 00:34:17 -0700313 if (skb == NULL)
314 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700315
Thomas Graf4e902c52006-08-17 18:14:52 -0700316 err = fib_dump_info(skb, info->pid, seq, event, tb_id,
Thomas Grafbe403ea2006-08-17 18:15:17 -0700317 fa->fa_type, fa->fa_scope, key, dst_len,
Milan Kocianb8f55832007-05-23 14:55:06 -0700318 fa->fa_tos, fa->fa_info, nlm_flags);
Patrick McHardy26932562007-01-31 23:16:40 -0800319 if (err < 0) {
320 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
321 WARN_ON(err == -EMSGSIZE);
322 kfree_skb(skb);
323 goto errout;
324 }
Denis V. Lunev4d1169c2008-01-10 03:26:13 -0800325 err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
Thomas Graf4e902c52006-08-17 18:14:52 -0700326 info->nlh, GFP_KERNEL);
Thomas Graff21c7bc2006-08-15 00:34:17 -0700327errout:
328 if (err < 0)
Denis V. Lunev4d1169c2008-01-10 03:26:13 -0800329 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700330}
331
332/* Return the first fib alias matching TOS with
333 * priority less than or equal to PRIO.
334 */
335struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
336{
337 if (fah) {
338 struct fib_alias *fa;
339 list_for_each_entry(fa, fah, fa_list) {
340 if (fa->fa_tos > tos)
341 continue;
342 if (fa->fa_info->fib_priority >= prio ||
343 fa->fa_tos < tos)
344 return fa;
345 }
346 }
347 return NULL;
348}
349
350int fib_detect_death(struct fib_info *fi, int order,
Denis V. Lunevc17860a2007-12-08 00:22:13 -0800351 struct fib_info **last_resort, int *last_idx, int dflt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700352{
353 struct neighbour *n;
354 int state = NUD_NONE;
355
356 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
357 if (n) {
358 state = n->nud_state;
359 neigh_release(n);
360 }
361 if (state==NUD_REACHABLE)
362 return 0;
Denis V. Lunevc17860a2007-12-08 00:22:13 -0800363 if ((state&NUD_VALID) && order != dflt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700364 return 0;
365 if ((state&NUD_VALID) ||
Denis V. Lunevc17860a2007-12-08 00:22:13 -0800366 (*last_idx<0 && order > dflt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700367 *last_resort = fi;
368 *last_idx = order;
369 }
370 return 1;
371}
372
373#ifdef CONFIG_IP_ROUTE_MULTIPATH
374
/*
 * Count the rtnexthop records in a multipath attribute of @remaining
 * bytes starting at @rtnh.  Returns 0 when the records do not consume the
 * buffer exactly, i.e. when the configuration is malformed.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0)
		return 0;

	return count;
}
387
Thomas Graf4e902c52006-08-17 18:14:52 -0700388static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
389 int remaining, struct fib_config *cfg)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700390{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700391 change_nexthops(fi) {
Thomas Graf4e902c52006-08-17 18:14:52 -0700392 int attrlen;
393
394 if (!rtnh_ok(rtnh, remaining))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700395 return -EINVAL;
Thomas Graf4e902c52006-08-17 18:14:52 -0700396
397 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
398 nh->nh_oif = rtnh->rtnh_ifindex;
399 nh->nh_weight = rtnh->rtnh_hops + 1;
400
401 attrlen = rtnh_attrlen(rtnh);
402 if (attrlen > 0) {
403 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
404
405 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
Al Viro17fb2c62006-09-26 22:15:25 -0700406 nh->nh_gw = nla ? nla_get_be32(nla) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700407#ifdef CONFIG_NET_CLS_ROUTE
Thomas Graf4e902c52006-08-17 18:14:52 -0700408 nla = nla_find(attrs, attrlen, RTA_FLOW);
409 nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700410#endif
411 }
Thomas Graf4e902c52006-08-17 18:14:52 -0700412
413 rtnh = rtnh_next(rtnh, &remaining);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700414 } endfor_nexthops(fi);
Thomas Graf4e902c52006-08-17 18:14:52 -0700415
Linus Torvalds1da177e2005-04-16 15:20:36 -0700416 return 0;
417}
418
419#endif
420
Thomas Graf4e902c52006-08-17 18:14:52 -0700421int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700422{
423#ifdef CONFIG_IP_ROUTE_MULTIPATH
Thomas Graf4e902c52006-08-17 18:14:52 -0700424 struct rtnexthop *rtnh;
425 int remaining;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700426#endif
427
Thomas Graf4e902c52006-08-17 18:14:52 -0700428 if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700429 return 1;
430
Thomas Graf4e902c52006-08-17 18:14:52 -0700431 if (cfg->fc_oif || cfg->fc_gw) {
432 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
433 (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700434 return 0;
435 return 1;
436 }
437
438#ifdef CONFIG_IP_ROUTE_MULTIPATH
Thomas Graf4e902c52006-08-17 18:14:52 -0700439 if (cfg->fc_mp == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700440 return 0;
Thomas Graf4e902c52006-08-17 18:14:52 -0700441
442 rtnh = cfg->fc_mp;
443 remaining = cfg->fc_mp_len;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900444
Linus Torvalds1da177e2005-04-16 15:20:36 -0700445 for_nexthops(fi) {
Thomas Graf4e902c52006-08-17 18:14:52 -0700446 int attrlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700447
Thomas Graf4e902c52006-08-17 18:14:52 -0700448 if (!rtnh_ok(rtnh, remaining))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700449 return -EINVAL;
Thomas Graf4e902c52006-08-17 18:14:52 -0700450
451 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700452 return 1;
Thomas Graf4e902c52006-08-17 18:14:52 -0700453
454 attrlen = rtnh_attrlen(rtnh);
455 if (attrlen < 0) {
456 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
457
458 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
Al Viro17fb2c62006-09-26 22:15:25 -0700459 if (nla && nla_get_be32(nla) != nh->nh_gw)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700460 return 1;
461#ifdef CONFIG_NET_CLS_ROUTE
Thomas Graf4e902c52006-08-17 18:14:52 -0700462 nla = nla_find(attrs, attrlen, RTA_FLOW);
463 if (nla && nla_get_u32(nla) != nh->nh_tclassid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700464 return 1;
465#endif
466 }
Thomas Graf4e902c52006-08-17 18:14:52 -0700467
468 rtnh = rtnh_next(rtnh, &remaining);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700469 } endfor_nexthops(fi);
470#endif
471 return 0;
472}
473
474
/*
   Picture
   -------

   The semantics of a nexthop are very messy for historical reasons.
   We have to take into account that:
   a) the gateway can actually be a local interface address,
      so that a gatewayed route is direct.
   b) the gateway must be an on-link address, possibly
      described not by an ifaddr, but also by a direct route.
   c) If both gateway and interface are specified, they should not
      contradict each other.
   d) If we use tunnel routes, the gateway may not be on-link.

   An attempt to reconcile all of these (alas, self-contradictory)
   conditions results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the size
   of the code practically does not increase, but it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or if the "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g. as a by-product it allows
   independent exterior and interior routing processes
   to co-exist in peace.

   Normally it looks as follows.

   {universe prefix}  -> (gw, oif) [scope link]
			  |
			  |-> {link prefix} -> (gw, oif) [scope local]
						|
						|-> {local prefix} (terminal node)
 */
518
Thomas Graf4e902c52006-08-17 18:14:52 -0700519static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
520 struct fib_nh *nh)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700521{
522 int err;
Denis V. Lunev86167a32008-01-21 17:34:00 -0800523 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700524
Denis V. Lunev86167a32008-01-21 17:34:00 -0800525 net = cfg->fc_nlinfo.nl_net;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700526 if (nh->nh_gw) {
527 struct fib_result res;
528
529#ifdef CONFIG_IP_ROUTE_PERVASIVE
530 if (nh->nh_flags&RTNH_F_PERVASIVE)
531 return 0;
532#endif
533 if (nh->nh_flags&RTNH_F_ONLINK) {
534 struct net_device *dev;
535
Thomas Graf4e902c52006-08-17 18:14:52 -0700536 if (cfg->fc_scope >= RT_SCOPE_LINK)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700537 return -EINVAL;
Denis V. Lunev86167a32008-01-21 17:34:00 -0800538 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700539 return -EINVAL;
Denis V. Lunev86167a32008-01-21 17:34:00 -0800540 if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700541 return -ENODEV;
542 if (!(dev->flags&IFF_UP))
543 return -ENETDOWN;
544 nh->nh_dev = dev;
545 dev_hold(dev);
546 nh->nh_scope = RT_SCOPE_LINK;
547 return 0;
548 }
549 {
Thomas Graf4e902c52006-08-17 18:14:52 -0700550 struct flowi fl = {
551 .nl_u = {
552 .ip4_u = {
553 .daddr = nh->nh_gw,
554 .scope = cfg->fc_scope + 1,
555 },
556 },
557 .oif = nh->nh_oif,
558 };
Linus Torvalds1da177e2005-04-16 15:20:36 -0700559
560 /* It is not necessary, but requires a bit of thinking */
561 if (fl.fl4_scope < RT_SCOPE_LINK)
562 fl.fl4_scope = RT_SCOPE_LINK;
Denis V. Lunev86167a32008-01-21 17:34:00 -0800563 if ((err = fib_lookup(net, &fl, &res)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700564 return err;
565 }
566 err = -EINVAL;
567 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
568 goto out;
569 nh->nh_scope = res.scope;
570 nh->nh_oif = FIB_RES_OIF(res);
571 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
572 goto out;
573 dev_hold(nh->nh_dev);
574 err = -ENETDOWN;
575 if (!(nh->nh_dev->flags & IFF_UP))
576 goto out;
577 err = 0;
578out:
579 fib_res_put(&res);
580 return err;
581 } else {
582 struct in_device *in_dev;
583
584 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
585 return -EINVAL;
586
Denis V. Lunev86167a32008-01-21 17:34:00 -0800587 in_dev = inetdev_by_index(net, nh->nh_oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700588 if (in_dev == NULL)
589 return -ENODEV;
590 if (!(in_dev->dev->flags&IFF_UP)) {
591 in_dev_put(in_dev);
592 return -ENETDOWN;
593 }
594 nh->nh_dev = in_dev->dev;
595 dev_hold(nh->nh_dev);
596 nh->nh_scope = RT_SCOPE_HOST;
597 in_dev_put(in_dev);
598 }
599 return 0;
600}
601
Al Viro81f7bf62006-09-27 18:40:00 -0700602static inline unsigned int fib_laddr_hashfn(__be32 val)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700603{
604 unsigned int mask = (fib_hash_size - 1);
605
Al Viro81f7bf62006-09-27 18:40:00 -0700606 return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700607}
608
609static struct hlist_head *fib_hash_alloc(int bytes)
610{
611 if (bytes <= PAGE_SIZE)
Joonwoo Park88f83492007-11-26 23:29:32 +0800612 return kzalloc(bytes, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700613 else
614 return (struct hlist_head *)
Joonwoo Park88f83492007-11-26 23:29:32 +0800615 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700616}
617
618static void fib_hash_free(struct hlist_head *hash, int bytes)
619{
620 if (!hash)
621 return;
622
623 if (bytes <= PAGE_SIZE)
624 kfree(hash);
625 else
626 free_pages((unsigned long) hash, get_order(bytes));
627}
628
629static void fib_hash_move(struct hlist_head *new_info_hash,
630 struct hlist_head *new_laddrhash,
631 unsigned int new_size)
632{
David S. Millerb7656e72005-08-05 04:12:48 -0700633 struct hlist_head *old_info_hash, *old_laddrhash;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700634 unsigned int old_size = fib_hash_size;
David S. Millerb7656e72005-08-05 04:12:48 -0700635 unsigned int i, bytes;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700636
Stephen Hemminger832b4c52006-08-29 16:48:09 -0700637 spin_lock_bh(&fib_info_lock);
David S. Millerb7656e72005-08-05 04:12:48 -0700638 old_info_hash = fib_info_hash;
639 old_laddrhash = fib_info_laddrhash;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700640 fib_hash_size = new_size;
641
642 for (i = 0; i < old_size; i++) {
643 struct hlist_head *head = &fib_info_hash[i];
644 struct hlist_node *node, *n;
645 struct fib_info *fi;
646
647 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
648 struct hlist_head *dest;
649 unsigned int new_hash;
650
651 hlist_del(&fi->fib_hash);
652
653 new_hash = fib_info_hashfn(fi);
654 dest = &new_info_hash[new_hash];
655 hlist_add_head(&fi->fib_hash, dest);
656 }
657 }
658 fib_info_hash = new_info_hash;
659
660 for (i = 0; i < old_size; i++) {
661 struct hlist_head *lhead = &fib_info_laddrhash[i];
662 struct hlist_node *node, *n;
663 struct fib_info *fi;
664
665 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
666 struct hlist_head *ldest;
667 unsigned int new_hash;
668
669 hlist_del(&fi->fib_lhash);
670
671 new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
672 ldest = &new_laddrhash[new_hash];
673 hlist_add_head(&fi->fib_lhash, ldest);
674 }
675 }
676 fib_info_laddrhash = new_laddrhash;
677
Stephen Hemminger832b4c52006-08-29 16:48:09 -0700678 spin_unlock_bh(&fib_info_lock);
David S. Millerb7656e72005-08-05 04:12:48 -0700679
680 bytes = old_size * sizeof(struct hlist_head *);
681 fib_hash_free(old_info_hash, bytes);
682 fib_hash_free(old_laddrhash, bytes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700683}
684
Thomas Graf4e902c52006-08-17 18:14:52 -0700685struct fib_info *fib_create_info(struct fib_config *cfg)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700686{
687 int err;
688 struct fib_info *fi = NULL;
689 struct fib_info *ofi;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700690 int nhs = 1;
Denis V. Lunev7462bd742008-01-31 18:49:32 -0800691 struct net *net = cfg->fc_nlinfo.nl_net;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700692
693 /* Fast check to catch the most weird cases */
Thomas Graf4e902c52006-08-17 18:14:52 -0700694 if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700695 goto err_inval;
696
697#ifdef CONFIG_IP_ROUTE_MULTIPATH
Thomas Graf4e902c52006-08-17 18:14:52 -0700698 if (cfg->fc_mp) {
699 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700700 if (nhs == 0)
701 goto err_inval;
702 }
703#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700704
705 err = -ENOBUFS;
706 if (fib_info_cnt >= fib_hash_size) {
707 unsigned int new_size = fib_hash_size << 1;
708 struct hlist_head *new_info_hash;
709 struct hlist_head *new_laddrhash;
710 unsigned int bytes;
711
712 if (!new_size)
713 new_size = 1;
714 bytes = new_size * sizeof(struct hlist_head *);
715 new_info_hash = fib_hash_alloc(bytes);
716 new_laddrhash = fib_hash_alloc(bytes);
717 if (!new_info_hash || !new_laddrhash) {
718 fib_hash_free(new_info_hash, bytes);
719 fib_hash_free(new_laddrhash, bytes);
Joonwoo Park88f83492007-11-26 23:29:32 +0800720 } else
Linus Torvalds1da177e2005-04-16 15:20:36 -0700721 fib_hash_move(new_info_hash, new_laddrhash, new_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700722
723 if (!fib_hash_size)
724 goto failure;
725 }
726
Panagiotis Issaris0da974f2006-07-21 14:51:30 -0700727 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700728 if (fi == NULL)
729 goto failure;
730 fib_info_cnt++;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700731
Denis V. Lunev57d7a602008-04-16 02:00:50 -0700732 fi->fib_net = hold_net(net);
Thomas Graf4e902c52006-08-17 18:14:52 -0700733 fi->fib_protocol = cfg->fc_protocol;
734 fi->fib_flags = cfg->fc_flags;
735 fi->fib_priority = cfg->fc_priority;
736 fi->fib_prefsrc = cfg->fc_prefsrc;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700737
738 fi->fib_nhs = nhs;
739 change_nexthops(fi) {
740 nh->nh_parent = fi;
741 } endfor_nexthops(fi)
742
Thomas Graf4e902c52006-08-17 18:14:52 -0700743 if (cfg->fc_mx) {
744 struct nlattr *nla;
745 int remaining;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700746
Thomas Graf4e902c52006-08-17 18:14:52 -0700747 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
Thomas Graf8f4c1f92007-09-12 14:44:36 +0200748 int type = nla_type(nla);
Thomas Graf4e902c52006-08-17 18:14:52 -0700749
750 if (type) {
751 if (type > RTAX_MAX)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700752 goto err_inval;
Thomas Graf4e902c52006-08-17 18:14:52 -0700753 fi->fib_metrics[type - 1] = nla_get_u32(nla);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700754 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700755 }
756 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700757
Thomas Graf4e902c52006-08-17 18:14:52 -0700758 if (cfg->fc_mp) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700759#ifdef CONFIG_IP_ROUTE_MULTIPATH
Thomas Graf4e902c52006-08-17 18:14:52 -0700760 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
761 if (err != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700762 goto failure;
Thomas Graf4e902c52006-08-17 18:14:52 -0700763 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700764 goto err_inval;
Thomas Graf4e902c52006-08-17 18:14:52 -0700765 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700766 goto err_inval;
767#ifdef CONFIG_NET_CLS_ROUTE
Thomas Graf4e902c52006-08-17 18:14:52 -0700768 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700769 goto err_inval;
770#endif
771#else
772 goto err_inval;
773#endif
774 } else {
775 struct fib_nh *nh = fi->fib_nh;
Thomas Graf4e902c52006-08-17 18:14:52 -0700776
777 nh->nh_oif = cfg->fc_oif;
778 nh->nh_gw = cfg->fc_gw;
779 nh->nh_flags = cfg->fc_flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700780#ifdef CONFIG_NET_CLS_ROUTE
Thomas Graf4e902c52006-08-17 18:14:52 -0700781 nh->nh_tclassid = cfg->fc_flow;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700782#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700783#ifdef CONFIG_IP_ROUTE_MULTIPATH
784 nh->nh_weight = 1;
785#endif
786 }
787
Thomas Graf4e902c52006-08-17 18:14:52 -0700788 if (fib_props[cfg->fc_type].error) {
789 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700790 goto err_inval;
791 goto link_it;
792 }
793
Thomas Graf4e902c52006-08-17 18:14:52 -0700794 if (cfg->fc_scope > RT_SCOPE_HOST)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700795 goto err_inval;
796
Thomas Graf4e902c52006-08-17 18:14:52 -0700797 if (cfg->fc_scope == RT_SCOPE_HOST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700798 struct fib_nh *nh = fi->fib_nh;
799
800 /* Local address is added. */
801 if (nhs != 1 || nh->nh_gw)
802 goto err_inval;
803 nh->nh_scope = RT_SCOPE_NOWHERE;
Denis V. Lunev7462bd742008-01-31 18:49:32 -0800804 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700805 err = -ENODEV;
806 if (nh->nh_dev == NULL)
807 goto failure;
808 } else {
809 change_nexthops(fi) {
Thomas Graf4e902c52006-08-17 18:14:52 -0700810 if ((err = fib_check_nh(cfg, fi, nh)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700811 goto failure;
812 } endfor_nexthops(fi)
813 }
814
815 if (fi->fib_prefsrc) {
Thomas Graf4e902c52006-08-17 18:14:52 -0700816 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
817 fi->fib_prefsrc != cfg->fc_dst)
Denis V. Lunev7462bd742008-01-31 18:49:32 -0800818 if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700819 goto err_inval;
820 }
821
822link_it:
823 if ((ofi = fib_find_info(fi)) != NULL) {
824 fi->fib_dead = 1;
825 free_fib_info(fi);
826 ofi->fib_treeref++;
827 return ofi;
828 }
829
830 fi->fib_treeref++;
831 atomic_inc(&fi->fib_clntref);
Stephen Hemminger832b4c52006-08-29 16:48:09 -0700832 spin_lock_bh(&fib_info_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700833 hlist_add_head(&fi->fib_hash,
834 &fib_info_hash[fib_info_hashfn(fi)]);
835 if (fi->fib_prefsrc) {
836 struct hlist_head *head;
837
838 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
839 hlist_add_head(&fi->fib_lhash, head);
840 }
841 change_nexthops(fi) {
842 struct hlist_head *head;
843 unsigned int hash;
844
845 if (!nh->nh_dev)
846 continue;
847 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
848 head = &fib_info_devhash[hash];
849 hlist_add_head(&nh->nh_hash, head);
850 } endfor_nexthops(fi)
Stephen Hemminger832b4c52006-08-29 16:48:09 -0700851 spin_unlock_bh(&fib_info_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700852 return fi;
853
854err_inval:
855 err = -EINVAL;
856
857failure:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900858 if (fi) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700859 fi->fib_dead = 1;
860 free_fib_info(fi);
861 }
Thomas Graf4e902c52006-08-17 18:14:52 -0700862
863 return ERR_PTR(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700864}
865
Robert Olssone5b43762005-08-25 13:01:03 -0700866/* Note! fib_semantic_match intentionally uses RCU list functions. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700867int fib_semantic_match(struct list_head *head, const struct flowi *flp,
Al Viro1ef1b8c2006-09-26 22:20:56 -0700868 struct fib_result *res, __be32 zone, __be32 mask,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700869 int prefixlen)
870{
871 struct fib_alias *fa;
872 int nh_sel = 0;
873
Robert Olssone5b43762005-08-25 13:01:03 -0700874 list_for_each_entry_rcu(fa, head, fa_list) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700875 int err;
876
877 if (fa->fa_tos &&
878 fa->fa_tos != flp->fl4_tos)
879 continue;
880
881 if (fa->fa_scope < flp->fl4_scope)
882 continue;
883
884 fa->fa_state |= FA_S_ACCESSED;
885
886 err = fib_props[fa->fa_type].error;
887 if (err == 0) {
888 struct fib_info *fi = fa->fa_info;
889
890 if (fi->fib_flags & RTNH_F_DEAD)
891 continue;
892
893 switch (fa->fa_type) {
894 case RTN_UNICAST:
895 case RTN_LOCAL:
896 case RTN_BROADCAST:
897 case RTN_ANYCAST:
898 case RTN_MULTICAST:
899 for_nexthops(fi) {
900 if (nh->nh_flags&RTNH_F_DEAD)
901 continue;
902 if (!flp->oif || flp->oif == nh->nh_oif)
903 break;
904 }
905#ifdef CONFIG_IP_ROUTE_MULTIPATH
906 if (nhsel < fi->fib_nhs) {
907 nh_sel = nhsel;
908 goto out_fill_res;
909 }
910#else
911 if (nhsel < 1) {
912 goto out_fill_res;
913 }
914#endif
915 endfor_nexthops(fi);
916 continue;
917
918 default:
Stephen Hemmingera6db9012008-01-12 20:58:35 -0800919 printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
920 fa->fa_type);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700921 return -EINVAL;
Stephen Hemminger3ff50b72007-04-20 17:09:22 -0700922 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700923 }
924 return err;
925 }
926 return 1;
927
928out_fill_res:
929 res->prefixlen = prefixlen;
930 res->nh_sel = nh_sel;
931 res->type = fa->fa_type;
932 res->scope = fa->fa_scope;
933 res->fi = fa->fa_info;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700934 atomic_inc(&res->fi->fib_clntref);
935 return 0;
936}
937
938/* Find appropriate source address to this destination */
939
Al Virob83738a2006-09-26 22:14:15 -0700940__be32 __fib_res_prefsrc(struct fib_result *res)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700941{
942 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
943}
944
Thomas Grafbe403ea2006-08-17 18:15:17 -0700945int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
Al Viro81f7bf62006-09-27 18:40:00 -0700946 u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
Thomas Grafbe403ea2006-08-17 18:15:17 -0700947 struct fib_info *fi, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700948{
Thomas Grafbe403ea2006-08-17 18:15:17 -0700949 struct nlmsghdr *nlh;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700950 struct rtmsg *rtm;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700951
Thomas Grafbe403ea2006-08-17 18:15:17 -0700952 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
953 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -0800954 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -0700955
956 rtm = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700957 rtm->rtm_family = AF_INET;
958 rtm->rtm_dst_len = dst_len;
959 rtm->rtm_src_len = 0;
960 rtm->rtm_tos = tos;
Krzysztof Piotr Oledzki709772e2008-06-10 15:44:49 -0700961 if (tb_id < 256)
962 rtm->rtm_table = tb_id;
963 else
964 rtm->rtm_table = RT_TABLE_COMPAT;
Thomas Grafbe403ea2006-08-17 18:15:17 -0700965 NLA_PUT_U32(skb, RTA_TABLE, tb_id);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700966 rtm->rtm_type = type;
967 rtm->rtm_flags = fi->fib_flags;
968 rtm->rtm_scope = scope;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700969 rtm->rtm_protocol = fi->fib_protocol;
Thomas Grafbe403ea2006-08-17 18:15:17 -0700970
971 if (rtm->rtm_dst_len)
Al Viro17fb2c62006-09-26 22:15:25 -0700972 NLA_PUT_BE32(skb, RTA_DST, dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -0700973
Linus Torvalds1da177e2005-04-16 15:20:36 -0700974 if (fi->fib_priority)
Thomas Grafbe403ea2006-08-17 18:15:17 -0700975 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
976
Linus Torvalds1da177e2005-04-16 15:20:36 -0700977 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -0700978 goto nla_put_failure;
979
Linus Torvalds1da177e2005-04-16 15:20:36 -0700980 if (fi->fib_prefsrc)
Al Viro17fb2c62006-09-26 22:15:25 -0700981 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
Thomas Grafbe403ea2006-08-17 18:15:17 -0700982
Linus Torvalds1da177e2005-04-16 15:20:36 -0700983 if (fi->fib_nhs == 1) {
984 if (fi->fib_nh->nh_gw)
Al Viro17fb2c62006-09-26 22:15:25 -0700985 NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
Thomas Grafbe403ea2006-08-17 18:15:17 -0700986
Linus Torvalds1da177e2005-04-16 15:20:36 -0700987 if (fi->fib_nh->nh_oif)
Thomas Grafbe403ea2006-08-17 18:15:17 -0700988 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
Patrick McHardy8265abc2006-07-21 15:09:55 -0700989#ifdef CONFIG_NET_CLS_ROUTE
990 if (fi->fib_nh[0].nh_tclassid)
Thomas Grafbe403ea2006-08-17 18:15:17 -0700991 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
Patrick McHardy8265abc2006-07-21 15:09:55 -0700992#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700993 }
994#ifdef CONFIG_IP_ROUTE_MULTIPATH
995 if (fi->fib_nhs > 1) {
Thomas Grafbe403ea2006-08-17 18:15:17 -0700996 struct rtnexthop *rtnh;
997 struct nlattr *mp;
998
999 mp = nla_nest_start(skb, RTA_MULTIPATH);
1000 if (mp == NULL)
1001 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001002
1003 for_nexthops(fi) {
Thomas Grafbe403ea2006-08-17 18:15:17 -07001004 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1005 if (rtnh == NULL)
1006 goto nla_put_failure;
1007
1008 rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1009 rtnh->rtnh_hops = nh->nh_weight - 1;
1010 rtnh->rtnh_ifindex = nh->nh_oif;
1011
Linus Torvalds1da177e2005-04-16 15:20:36 -07001012 if (nh->nh_gw)
Al Viro17fb2c62006-09-26 22:15:25 -07001013 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
Patrick McHardy8265abc2006-07-21 15:09:55 -07001014#ifdef CONFIG_NET_CLS_ROUTE
1015 if (nh->nh_tclassid)
Thomas Grafbe403ea2006-08-17 18:15:17 -07001016 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
Patrick McHardy8265abc2006-07-21 15:09:55 -07001017#endif
Thomas Grafbe403ea2006-08-17 18:15:17 -07001018 /* length of rtnetlink header + attributes */
1019 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001020 } endfor_nexthops(fi);
Thomas Grafbe403ea2006-08-17 18:15:17 -07001021
1022 nla_nest_end(skb, mp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001023 }
1024#endif
Thomas Grafbe403ea2006-08-17 18:15:17 -07001025 return nlmsg_end(skb, nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001026
Thomas Grafbe403ea2006-08-17 18:15:17 -07001027nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08001028 nlmsg_cancel(skb, nlh);
1029 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001030}
1031
Linus Torvalds1da177e2005-04-16 15:20:36 -07001032/*
1033 Update FIB if:
1034 - local address disappeared -> we must delete all the entries
1035 referring to it.
1036 - device went down -> we must shutdown all nexthops going via it.
1037 */
Denis V. Lunev4814bdb2008-01-31 18:50:07 -08001038int fib_sync_down_addr(struct net *net, __be32 local)
Denis V. Lunev85326fa2008-01-31 18:48:47 -08001039{
1040 int ret = 0;
1041 unsigned int hash = fib_laddr_hashfn(local);
1042 struct hlist_head *head = &fib_info_laddrhash[hash];
1043 struct hlist_node *node;
1044 struct fib_info *fi;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001045
Denis V. Lunev85326fa2008-01-31 18:48:47 -08001046 if (fib_info_laddrhash == NULL || local == 0)
1047 return 0;
1048
1049 hlist_for_each_entry(fi, node, head, fib_lhash) {
Denis V. Lunev4814bdb2008-01-31 18:50:07 -08001050 if (fi->fib_net != net)
1051 continue;
Denis V. Lunev85326fa2008-01-31 18:48:47 -08001052 if (fi->fib_prefsrc == local) {
1053 fi->fib_flags |= RTNH_F_DEAD;
1054 ret++;
1055 }
1056 }
1057 return ret;
1058}
1059
1060int fib_sync_down_dev(struct net_device *dev, int force)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001061{
1062 int ret = 0;
1063 int scope = RT_SCOPE_NOWHERE;
Denis V. Lunev85326fa2008-01-31 18:48:47 -08001064 struct fib_info *prev_fi = NULL;
1065 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1066 struct hlist_head *head = &fib_info_devhash[hash];
1067 struct hlist_node *node;
1068 struct fib_nh *nh;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001069
Linus Torvalds1da177e2005-04-16 15:20:36 -07001070 if (force)
1071 scope = -1;
1072
Denis V. Lunev85326fa2008-01-31 18:48:47 -08001073 hlist_for_each_entry(nh, node, head, nh_hash) {
1074 struct fib_info *fi = nh->nh_parent;
1075 int dead;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001076
Denis V. Lunev85326fa2008-01-31 18:48:47 -08001077 BUG_ON(!fi->fib_nhs);
1078 if (nh->nh_dev != dev || fi == prev_fi)
1079 continue;
1080 prev_fi = fi;
1081 dead = 0;
1082 change_nexthops(fi) {
1083 if (nh->nh_flags&RTNH_F_DEAD)
1084 dead++;
1085 else if (nh->nh_dev == dev &&
1086 nh->nh_scope != scope) {
1087 nh->nh_flags |= RTNH_F_DEAD;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001088#ifdef CONFIG_IP_ROUTE_MULTIPATH
Denis V. Lunev85326fa2008-01-31 18:48:47 -08001089 spin_lock_bh(&fib_multipath_lock);
1090 fi->fib_power -= nh->nh_power;
1091 nh->nh_power = 0;
1092 spin_unlock_bh(&fib_multipath_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001093#endif
Denis V. Lunev85326fa2008-01-31 18:48:47 -08001094 dead++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001095 }
Denis V. Lunev85326fa2008-01-31 18:48:47 -08001096#ifdef CONFIG_IP_ROUTE_MULTIPATH
1097 if (force > 1 && nh->nh_dev == dev) {
1098 dead = fi->fib_nhs;
1099 break;
1100 }
1101#endif
1102 } endfor_nexthops(fi)
1103 if (dead == fi->fib_nhs) {
1104 fi->fib_flags |= RTNH_F_DEAD;
1105 ret++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001106 }
1107 }
1108
1109 return ret;
1110}
1111
1112#ifdef CONFIG_IP_ROUTE_MULTIPATH
1113
1114/*
1115 Dead device goes up. We wake up dead nexthops.
1116 It takes sense only on multipath routes.
1117 */
1118
1119int fib_sync_up(struct net_device *dev)
1120{
1121 struct fib_info *prev_fi;
1122 unsigned int hash;
1123 struct hlist_head *head;
1124 struct hlist_node *node;
1125 struct fib_nh *nh;
1126 int ret;
1127
1128 if (!(dev->flags&IFF_UP))
1129 return 0;
1130
1131 prev_fi = NULL;
1132 hash = fib_devindex_hashfn(dev->ifindex);
1133 head = &fib_info_devhash[hash];
1134 ret = 0;
1135
1136 hlist_for_each_entry(nh, node, head, nh_hash) {
1137 struct fib_info *fi = nh->nh_parent;
1138 int alive;
1139
1140 BUG_ON(!fi->fib_nhs);
1141 if (nh->nh_dev != dev || fi == prev_fi)
1142 continue;
1143
1144 prev_fi = fi;
1145 alive = 0;
1146 change_nexthops(fi) {
1147 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1148 alive++;
1149 continue;
1150 }
1151 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1152 continue;
Herbert Xue5ed6392005-10-03 14:35:55 -07001153 if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001154 continue;
1155 alive++;
1156 spin_lock_bh(&fib_multipath_lock);
1157 nh->nh_power = 0;
1158 nh->nh_flags &= ~RTNH_F_DEAD;
1159 spin_unlock_bh(&fib_multipath_lock);
1160 } endfor_nexthops(fi)
1161
1162 if (alive > 0) {
1163 fi->fib_flags &= ~RTNH_F_DEAD;
1164 ret++;
1165 }
1166 }
1167
1168 return ret;
1169}
1170
1171/*
1172 The algorithm is suboptimal, but it provides really
1173 fair weighted route distribution.
1174 */
1175
1176void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1177{
1178 struct fib_info *fi = res->fi;
1179 int w;
1180
1181 spin_lock_bh(&fib_multipath_lock);
1182 if (fi->fib_power <= 0) {
1183 int power = 0;
1184 change_nexthops(fi) {
1185 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1186 power += nh->nh_weight;
1187 nh->nh_power = nh->nh_weight;
1188 }
1189 } endfor_nexthops(fi);
1190 fi->fib_power = power;
1191 if (power <= 0) {
1192 spin_unlock_bh(&fib_multipath_lock);
1193 /* Race condition: route has just become dead. */
1194 res->nh_sel = 0;
1195 return;
1196 }
1197 }
1198
1199
1200 /* w should be random number [0..fi->fib_power-1],
1201 it is pretty bad approximation.
1202 */
1203
1204 w = jiffies % fi->fib_power;
1205
1206 change_nexthops(fi) {
1207 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1208 if ((w -= nh->nh_power) <= 0) {
1209 nh->nh_power--;
1210 fi->fib_power--;
1211 res->nh_sel = nhsel;
1212 spin_unlock_bh(&fib_multipath_lock);
1213 return;
1214 }
1215 }
1216 } endfor_nexthops(fi);
1217
1218 /* Race condition: route has just become dead. */
1219 res->nh_sel = 0;
1220 spin_unlock_bh(&fib_multipath_lock);
1221}
1222#endif