blob: ce226c14bef5b1965fb2d254e78f29253d08d8bd [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Linux INET6 implementation
3 * Forwarding Information Database
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * $Id: ip6_fib.c,v 1.25 2001/10/31 21:55:55 davem Exp $
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16/*
17 * Changes:
18 * Yuji SEKIYA @USAGI: Support default route on router node;
19 * remove ip6_null_entry from the top of
20 * routing table.
21 */
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/errno.h>
23#include <linux/types.h>
24#include <linux/net.h>
25#include <linux/route.h>
26#include <linux/netdevice.h>
27#include <linux/in6.h>
28#include <linux/init.h>
Thomas Grafc71099a2006-08-04 23:20:06 -070029#include <linux/list.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070030
31#ifdef CONFIG_PROC_FS
32#include <linux/proc_fs.h>
33#endif
34
35#include <net/ipv6.h>
36#include <net/ndisc.h>
37#include <net/addrconf.h>
38
39#include <net/ip6_fib.h>
40#include <net/ip6_route.h>
41
/*
 * Compile-time debug level for this file.  RT6_TRACE() only produces
 * output at level 3 or above; several consistency checks further down
 * are compiled in at level 2 or above.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RT6_TRACE(x...) do { ; } while (0)
#endif

/* Route/node counters; this file updates them on insert and delete. */
struct rt6_statistics rt6_stats;

/* Slab cache backing node_alloc()/node_free(); created in fib6_init(). */
static kmem_cache_t * fib6_node_kmem __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070053
/*
 * Per-node progress of a tree walker (see fib6_walk_continue()).  The
 * numeric order matters: the walker and fib6_repair_tree() compare
 * states with < and >=.
 */
enum fib_walk_state_t
{
#ifdef CONFIG_IPV6_SUBTREES
	FWS_S,		/* next: descend into the node's subtree */
#endif
	FWS_L,		/* next: descend into the left child */
	FWS_R,		/* next: descend into the right child */
	FWS_C,		/* next: run the callback on the node's routes */
	FWS_U		/* next: go back up to the parent */
};
64
/*
 * Walker used by fib6_clean_tree().  The embedded walker must stay the
 * first member: fib6_clean_node() casts the walker pointer it receives
 * back to a struct fib6_cleaner_t pointer.
 */
struct fib6_cleaner_t
{
	struct fib6_walker_t w;
	int (*func)(struct rt6_info *, void *arg);	/* per-route callback */
	void *arg;					/* opaque argument handed to func */
};
71
/* Taken for reading in this file around every FOR_WALKERS() iteration. */
DEFINE_RWLOCK(fib6_walker_lock);


/*
 * FWS_INIT is the state a walker starts in when it enters a node.
 * SUBTREE(fn) yields the node's source subtree, or a constant NULL when
 * subtrees are not compiled in.
 */
#ifdef CONFIG_IPV6_SUBTREES
#define FWS_INIT FWS_S
#define SUBTREE(fn) ((fn)->subtree)
#else
#define FWS_INIT FWS_L
#define SUBTREE(fn) NULL
#endif

/* Defined near the end of the file but used by the add/delete paths. */
static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt);
static struct fib6_node * fib6_repair_tree(struct fib6_node *fn);
85
86/*
87 * A routing update causes an increase of the serial number on the
88 * affected subtree. This allows for cached routes to be asynchronously
89 * tested when modifications are made to the destination cache as a
90 * result of redirects, path MTU changes, etc.
91 */
92
/* Last serial number handed out by fib6_new_sernum(). */
static __u32 rt_sernum;

/*
 * Garbage-collection timer.  This file treats expires == 0 as "not
 * armed": fib6_start_gc() and fib6_force_start_gc() test for it and
 * fib6_run_gc() resets it when nothing is left to collect.
 */
static DEFINE_TIMER(ip6_fib_timer, fib6_run_gc, 0, 0);

/* Head of the circular list of active walkers; empty when it points to itself. */
struct fib6_walker_t fib6_walker_list = {
	.prev	= &fib6_walker_list,
	.next	= &fib6_walker_list,
};

/* Iterate over every walker on fib6_walker_list. */
#define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next)
103
104static __inline__ u32 fib6_new_sernum(void)
105{
106 u32 n = ++rt_sernum;
107 if ((__s32)n <= 0)
108 rt_sernum = n = 1;
109 return n;
110}
111
112/*
113 * Auxiliary address test functions for the radix tree.
114 *
115 * These assume a 32bit processor (although it will work on
116 * 64bit processors)
117 */
118
119/*
120 * test bit
121 */
122
/*
 *	Test bit number fn_bit of the address at token, counting from the
 *	most significant bit of the first 32-bit word (bit 0).
 *
 *	Returns non-zero if the bit is set and zero otherwise.  The value
 *	is the masked word itself, so callers must only test it for
 *	truth, never compare it with 1.
 */
static __inline__ int addr_bit_set(void *token, int fn_bit)
{
	__u32 *addr = token;

	/*
	 * The mask is built from an unsigned constant: when fn_bit is a
	 * multiple of 32 the shift count is 31, and shifting a signed 1
	 * into the sign bit is undefined behaviour in C.
	 */
	return htonl(1U << ((~fn_bit) & 0x1F)) & addr[fn_bit >> 5];
}
129
Linus Torvalds1da177e2005-04-16 15:20:36 -0700130static __inline__ struct fib6_node * node_alloc(void)
131{
132 struct fib6_node *fn;
133
134 if ((fn = kmem_cache_alloc(fib6_node_kmem, SLAB_ATOMIC)) != NULL)
135 memset(fn, 0, sizeof(struct fib6_node));
136
137 return fn;
138}
139
/* Return a node obtained from node_alloc() to the fib6 node cache. */
static __inline__ void node_free(struct fib6_node * fn)
{
	kmem_cache_free(fib6_node_kmem, fn);
}
144
145static __inline__ void rt6_release(struct rt6_info *rt)
146{
147 if (atomic_dec_and_test(&rt->rt6i_ref))
148 dst_free(&rt->u.dst);
149}
150
/*
 * Statically allocated main routing table.  Its root node starts out
 * holding ip6_null_entry and is flagged as a top-level root that
 * carries route information.
 */
static struct fib6_table fib6_main_tbl = {
	.tb6_id		= RT6_TABLE_MAIN,
	.tb6_lock	= RW_LOCK_UNLOCKED,
	.tb6_root	= {
		.leaf		= &ip6_null_entry,
		.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
	},
};
159
160#ifdef CONFIG_IPV6_MULTIPLE_TABLES
161
/* Statically allocated local table; set up the same way as the main table. */
static struct fib6_table fib6_local_tbl = {
	.tb6_id		= RT6_TABLE_LOCAL,
	.tb6_lock	= RW_LOCK_UNLOCKED,
	.tb6_root	= {
		.leaf		= &ip6_null_entry,
		.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
	},
};

/*
 * Tables are hashed by the low bits of their id.  The bucket index is
 * computed by masking with FIB_TABLE_HASHSZ - 1, so the size must stay
 * a power of two.
 */
#define FIB_TABLE_HASHSZ 256
static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
173
174static struct fib6_table *fib6_alloc_table(u32 id)
175{
176 struct fib6_table *table;
177
178 table = kzalloc(sizeof(*table), GFP_ATOMIC);
179 if (table != NULL) {
180 table->tb6_id = id;
181 table->tb6_lock = RW_LOCK_UNLOCKED;
182 table->tb6_root.leaf = &ip6_null_entry;
183 table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
184 }
185
186 return table;
187}
188
189static void fib6_link_table(struct fib6_table *tb)
190{
191 unsigned int h;
192
193 h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1);
194
195 /*
196 * No protection necessary, this is the only list mutatation
197 * operation, tables never disappear once they exist.
198 */
199 hlist_add_head_rcu(&tb->tb6_hlist, &fib_table_hash[h]);
200}
201
202struct fib6_table *fib6_new_table(u32 id)
203{
204 struct fib6_table *tb;
205
206 if (id == 0)
207 id = RT6_TABLE_MAIN;
208 tb = fib6_get_table(id);
209 if (tb)
210 return tb;
211
212 tb = fib6_alloc_table(id);
213 if (tb != NULL)
214 fib6_link_table(tb);
215
216 return tb;
217}
218
219struct fib6_table *fib6_get_table(u32 id)
220{
221 struct fib6_table *tb;
222 struct hlist_node *node;
223 unsigned int h;
224
225 if (id == 0)
226 id = RT6_TABLE_MAIN;
227 h = id & (FIB_TABLE_HASHSZ - 1);
228 rcu_read_lock();
229 hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb6_hlist) {
230 if (tb->tb6_id == id) {
231 rcu_read_unlock();
232 return tb;
233 }
234 }
235 rcu_read_unlock();
236
237 return NULL;
238}
239
/* Link the two statically allocated tables into the table hash. */
static void __init fib6_tables_init(void)
{
	fib6_link_table(&fib6_main_tbl);
	fib6_link_table(&fib6_local_tbl);
}
245
246#else
247
/* Single-table build: nothing is ever created, so defer to fib6_get_table(). */
struct fib6_table *fib6_new_table(u32 id)
{
	return fib6_get_table(id);
}
252
/* Single-table build: every id resolves to the main table; id is ignored. */
struct fib6_table *fib6_get_table(u32 id)
{
	return &fib6_main_tbl;
}
257
/*
 * Single-table build: run the supplied lookup function on the main
 * table and return its result cast to a dst_entry pointer.
 */
struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
				   pol_lookup_t lookup)
{
	return (struct dst_entry *) lookup(&fib6_main_tbl, fl, flags);
}
263
/* Single-table build: there is no table hash to set up. */
static void __init fib6_tables_init(void)
{
}
267
268#endif
269
Linus Torvalds1da177e2005-04-16 15:20:36 -0700270
271/*
272 * Routing Table
273 *
274 * return the appropriate node for a routing tree "add" operation
275 * by either creating and inserting or by returning an existing
276 * node.
277 */
278
/*
 *	root:    tree (or subtree) root to insert under
 *	addr:    address of the prefix being added
 *	addrlen: address length in bytes, passed on to __ipv6_addr_diff()
 *	plen:    prefix length in bits
 *	offset:  byte offset of the struct rt6key inside struct rt6_info,
 *	         used to read each visited node's key from its leaf route
 *
 *	Returns the node for addr/plen, or NULL if a node allocation
 *	failed.  Every node visited on the way down gets a fresh serial
 *	number.
 */
static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
				     int addrlen, int plen,
				     int offset)
{
	struct fib6_node *fn, *in, *ln;
	struct fib6_node *pn = NULL;
	struct rt6key *key;
	int	bit;
	int	dir = 0;
	__u32	sernum = fib6_new_sernum();

	RT6_TRACE("fib6_add_1\n");

	/* insert node in tree */

	fn = root;

	do {
		key = (struct rt6key *)((u8 *)fn->leaf + offset);

		/*
		 *	Prefix match
		 */
		if (plen < fn->fn_bit ||
		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
			goto insert_above;

		/*
		 *	Exact match ?
		 */

		if (plen == fn->fn_bit) {
			/* clean up an intermediate node */
			if ((fn->fn_flags & RTN_RTINFO) == 0) {
				/* drop the borrowed route; the caller installs a real one */
				rt6_release(fn->leaf);
				fn->leaf = NULL;
			}

			fn->fn_sernum = sernum;

			return fn;
		}

		/*
		 *	We have more bits to go
		 */

		/* Try to walk down on tree. */
		fn->fn_sernum = sernum;
		dir = addr_bit_set(addr, fn->fn_bit);
		pn = fn;
		fn = dir ? fn->right: fn->left;
	} while (fn);

	/*
	 *	We walked to the bottom of tree.
	 *	Create new leaf node without children.
	 */

	ln = node_alloc();

	if (ln == NULL)
		return NULL;
	ln->fn_bit = plen;

	ln->parent = pn;
	ln->fn_sernum = sernum;

	/* dir still holds the branch taken out of pn */
	if (dir)
		pn->right = ln;
	else
		pn->left  = ln;

	return ln;


insert_above:
	/*
	 * split since we don't have a common prefix anymore or
	 * we have a less significant route.
	 * we've to insert an intermediate node on the list
	 * this new node will point to the one we need to create
	 * and the current
	 */

	pn = fn->parent;

	/* find 1st bit in difference between the 2 addrs.

	   See comment in __ipv6_addr_diff: bit may be an invalid value,
	   but if it is >= plen, the value is ignored in any case.
	 */

	bit = __ipv6_addr_diff(addr, &key->addr, addrlen);

	/*
	 *		(intermediate)[in]
	 *	          /	   \
	 *	(new leaf node)[ln] (old node)[fn]
	 */
	if (plen > bit) {
		/* allocate both nodes up front so nothing is linked on failure */
		in = node_alloc();
		ln = node_alloc();

		if (in == NULL || ln == NULL) {
			if (in)
				node_free(in);
			if (ln)
				node_free(ln);
			return NULL;
		}

		/*
		 * new intermediate node.
		 * RTN_RTINFO will
		 * be off since that an address that chooses one of
		 * the branches would not match less specific routes
		 * in the other branch
		 */

		in->fn_bit = bit;

		in->parent = pn;
		/* the intermediate node borrows fn's route as its key holder */
		in->leaf = fn->leaf;
		atomic_inc(&in->leaf->rt6i_ref);

		in->fn_sernum = sernum;

		/* update parent pointer */
		if (dir)
			pn->right = in;
		else
			pn->left  = in;

		ln->fn_bit = plen;

		ln->parent = in;
		fn->parent = in;

		ln->fn_sernum = sernum;

		if (addr_bit_set(addr, bit)) {
			in->right = ln;
			in->left  = fn;
		} else {
			in->left  = ln;
			in->right = fn;
		}
	} else { /* plen <= bit */

		/*
		 *		(new leaf node)[ln]
		 *	          /	   \
		 *	     (old node)[fn] NULL
		 */

		ln = node_alloc();

		if (ln == NULL)
			return NULL;

		ln->fn_bit = plen;

		ln->parent = pn;

		ln->fn_sernum = sernum;

		if (dir)
			pn->right = ln;
		else
			pn->left  = ln;

		/* the old node's own key bit at position plen picks its side */
		if (addr_bit_set(&key->addr, plen))
			ln->right = fn;
		else
			ln->left  = fn;

		fn->parent = ln;
	}
	return ln;
}
460
461/*
462 * Insert routing information in a node.
463 */
464
465static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700466 struct nlmsghdr *nlh, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700467{
468 struct rt6_info *iter = NULL;
469 struct rt6_info **ins;
470
471 ins = &fn->leaf;
472
473 if (fn->fn_flags&RTN_TL_ROOT &&
474 fn->leaf == &ip6_null_entry &&
475 !(rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ){
476 fn->leaf = rt;
477 rt->u.next = NULL;
478 goto out;
479 }
480
481 for (iter = fn->leaf; iter; iter=iter->u.next) {
482 /*
483 * Search for duplicates
484 */
485
486 if (iter->rt6i_metric == rt->rt6i_metric) {
487 /*
488 * Same priority level
489 */
490
491 if (iter->rt6i_dev == rt->rt6i_dev &&
492 iter->rt6i_idev == rt->rt6i_idev &&
493 ipv6_addr_equal(&iter->rt6i_gateway,
494 &rt->rt6i_gateway)) {
495 if (!(iter->rt6i_flags&RTF_EXPIRES))
496 return -EEXIST;
497 iter->rt6i_expires = rt->rt6i_expires;
498 if (!(rt->rt6i_flags&RTF_EXPIRES)) {
499 iter->rt6i_flags &= ~RTF_EXPIRES;
500 iter->rt6i_expires = 0;
501 }
502 return -EEXIST;
503 }
504 }
505
506 if (iter->rt6i_metric > rt->rt6i_metric)
507 break;
508
509 ins = &iter->u.next;
510 }
511
512 /*
513 * insert node
514 */
515
516out:
517 rt->u.next = iter;
518 *ins = rt;
519 rt->rt6i_node = fn;
520 atomic_inc(&rt->rt6i_ref);
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700521 inet6_rt_notify(RTM_NEWROUTE, rt, nlh, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700522 rt6_stats.fib_rt_entries++;
523
524 if ((fn->fn_flags & RTN_RTINFO) == 0) {
525 rt6_stats.fib_route_nodes++;
526 fn->fn_flags |= RTN_RTINFO;
527 }
528
529 return 0;
530}
531
532static __inline__ void fib6_start_gc(struct rt6_info *rt)
533{
534 if (ip6_fib_timer.expires == 0 &&
535 (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE)))
536 mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
537}
538
539void fib6_force_start_gc(void)
540{
541 if (ip6_fib_timer.expires == 0)
542 mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
543}
544
545/*
546 * Add routing information to the routing tree.
547 * <destination addr>/<source addr>
548 * with source addr info in sub-trees
549 */
550
/*
 *	root:    tree root to insert under
 *	rt:      route to insert; on any failure it is passed to dst_free()
 *	nlh/req: handed through to the netlink notification
 *	_rtattr: not used in this function
 *
 *	Returns 0 on success, -ENOMEM if a tree node could not be
 *	allocated, or the error from fib6_add_rt2node().
 */
int fib6_add(struct fib6_node *root, struct rt6_info *rt,
		struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
{
	struct fib6_node *fn;
	int err = -ENOMEM;

	/* find or create the node for the destination prefix */
	fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr),
			rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst));

	if (fn == NULL)
		goto out;

#ifdef CONFIG_IPV6_SUBTREES
	if (rt->rt6i_src.plen) {
		struct fib6_node *sn;

		if (fn->subtree == NULL) {
			struct fib6_node *sfn;

			/*
			 * Create subtree.
			 *
			 *		fn[main tree]
			 *		|
			 *		sfn[subtree root]
			 *		   \
			 *		    sn[new leaf node]
			 */

			/* Create subtree root node */
			sfn = node_alloc();
			if (sfn == NULL)
				goto st_failure;

			sfn->leaf = &ip6_null_entry;
			atomic_inc(&ip6_null_entry.rt6i_ref);
			sfn->fn_flags = RTN_ROOT;
			sfn->fn_sernum = fib6_new_sernum();

			/* Now add the first leaf node to new subtree */

			sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
					sizeof(struct in6_addr), rt->rt6i_src.plen,
					offsetof(struct rt6_info, rt6i_src));

			if (sn == NULL) {
				/* If it is failed, discard just allocated
				   root, and then (in st_failure) stale node
				   in main tree.
				 */
				node_free(sfn);
				goto st_failure;
			}

			/* Now link new subtree to main tree */
			sfn->parent = fn;
			fn->subtree = sfn;
			/* a main-tree node that has no route yet borrows rt as its key holder */
			if (fn->leaf == NULL) {
				fn->leaf = rt;
				atomic_inc(&rt->rt6i_ref);
			}
		} else {
			sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
					sizeof(struct in6_addr), rt->rt6i_src.plen,
					offsetof(struct rt6_info, rt6i_src));

			if (sn == NULL)
				goto st_failure;
		}

		/* the route itself is attached to the source-prefix node */
		fn = sn;
	}
#endif

	err = fib6_add_rt2node(fn, rt, nlh, req);

	if (err == 0) {
		fib6_start_gc(rt);
		/* a new non-cache route prunes cached clones below its node */
		if (!(rt->rt6i_flags&RTF_CACHE))
			fib6_prune_clones(fn, rt);
	}

out:
	if (err)
		dst_free(&rt->u.dst);
	return err;

#ifdef CONFIG_IPV6_SUBTREES
	/* Subtree creation failed, probably main tree node
	   is orphan. If it is, shoot it.
	 */
st_failure:
	if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
		fib6_repair_tree(fn);
	dst_free(&rt->u.dst);
	return err;
#endif
}
649
650/*
651 * Routing tree lookup
652 *
653 */
654
/*
 * One search key for fib6_lookup_1().  fib6_lookup() passes an array of
 * these, and with subtrees enabled fib6_lookup_1() reads the following
 * element (args + 1) as the key for the source subtree.
 */
struct lookup_args {
	int		offset;		/* key offset on rt6_info	*/
	struct in6_addr	*addr;		/* search key			*/
};
659
660static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
661 struct lookup_args *args)
662{
663 struct fib6_node *fn;
664 int dir;
665
666 /*
667 * Descend on a tree
668 */
669
670 fn = root;
671
672 for (;;) {
673 struct fib6_node *next;
674
675 dir = addr_bit_set(args->addr, fn->fn_bit);
676
677 next = dir ? fn->right : fn->left;
678
679 if (next) {
680 fn = next;
681 continue;
682 }
683
684 break;
685 }
686
687 while ((fn->fn_flags & RTN_ROOT) == 0) {
688#ifdef CONFIG_IPV6_SUBTREES
689 if (fn->subtree) {
690 struct fib6_node *st;
691 struct lookup_args *narg;
692
693 narg = args + 1;
694
695 if (narg->addr) {
696 st = fib6_lookup_1(fn->subtree, narg);
697
698 if (st && !(st->fn_flags & RTN_ROOT))
699 return st;
700 }
701 }
702#endif
703
704 if (fn->fn_flags & RTN_RTINFO) {
705 struct rt6key *key;
706
707 key = (struct rt6key *) ((u8 *) fn->leaf +
708 args->offset);
709
710 if (ipv6_prefix_equal(&key->addr, args->addr, key->plen))
711 return fn;
712 }
713
714 fn = fn->parent;
715 }
716
717 return NULL;
718}
719
720struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr,
721 struct in6_addr *saddr)
722{
723 struct lookup_args args[2];
724 struct fib6_node *fn;
725
726 args[0].offset = offsetof(struct rt6_info, rt6i_dst);
727 args[0].addr = daddr;
728
729#ifdef CONFIG_IPV6_SUBTREES
730 args[1].offset = offsetof(struct rt6_info, rt6i_src);
731 args[1].addr = saddr;
732#endif
733
734 fn = fib6_lookup_1(root, args);
735
736 if (fn == NULL || fn->fn_flags & RTN_TL_ROOT)
737 fn = root;
738
739 return fn;
740}
741
742/*
743 * Get node with specified destination prefix (and source prefix,
744 * if subtrees are used)
745 */
746
747
748static struct fib6_node * fib6_locate_1(struct fib6_node *root,
749 struct in6_addr *addr,
750 int plen, int offset)
751{
752 struct fib6_node *fn;
753
754 for (fn = root; fn ; ) {
755 struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
756
757 /*
758 * Prefix match
759 */
760 if (plen < fn->fn_bit ||
761 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
762 return NULL;
763
764 if (plen == fn->fn_bit)
765 return fn;
766
767 /*
768 * We have more bits to go
769 */
770 if (addr_bit_set(addr, fn->fn_bit))
771 fn = fn->right;
772 else
773 fn = fn->left;
774 }
775 return NULL;
776}
777
778struct fib6_node * fib6_locate(struct fib6_node *root,
779 struct in6_addr *daddr, int dst_len,
780 struct in6_addr *saddr, int src_len)
781{
782 struct fib6_node *fn;
783
784 fn = fib6_locate_1(root, daddr, dst_len,
785 offsetof(struct rt6_info, rt6i_dst));
786
787#ifdef CONFIG_IPV6_SUBTREES
788 if (src_len) {
789 BUG_TRAP(saddr!=NULL);
790 if (fn == NULL)
791 fn = fn->subtree;
792 if (fn)
793 fn = fib6_locate_1(fn, saddr, src_len,
794 offsetof(struct rt6_info, rt6i_src));
795 }
796#endif
797
798 if (fn && fn->fn_flags&RTN_RTINFO)
799 return fn;
800
801 return NULL;
802}
803
804
805/*
806 * Deletion
807 *
808 */
809
810static struct rt6_info * fib6_find_prefix(struct fib6_node *fn)
811{
812 if (fn->fn_flags&RTN_ROOT)
813 return &ip6_null_entry;
814
815 while(fn) {
816 if(fn->left)
817 return fn->left->leaf;
818
819 if(fn->right)
820 return fn->right->leaf;
821
822 fn = SUBTREE(fn);
823 }
824 return NULL;
825}
826
827/*
828 * Called to trim the tree of intermediate nodes when possible. "fn"
829 * is the node we want to try and remove.
830 */
831
/*
 *	fn must carry no route information and have a NULL leaf.  Nodes
 *	are unlinked and freed going up the tree until one is reached that
 *	has to stay.  Active walkers pointing at a freed node are moved
 *	off it.
 *
 *	Returns the parent of the node that was kept, or the surviving
 *	parent of the last node that was freed.
 */
static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
{
	int children;
	int nstate;
	struct fib6_node *child, *pn;
	struct fib6_walker_t *w;
	int iter = 0;

	for (;;) {
		RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
		iter++;

		BUG_TRAP(!(fn->fn_flags&RTN_RTINFO));
		BUG_TRAP(!(fn->fn_flags&RTN_TL_ROOT));
		BUG_TRAP(fn->leaf==NULL);

		/* bit 0: right child present, bit 1: left child present */
		children = 0;
		child = NULL;
		if (fn->right) child = fn->right, children |= 1;
		if (fn->left) child = fn->left, children |= 2;

		/*
		 * The node has to stay: give it a borrowed route as key
		 * holder and stop.
		 */
		if (children == 3 || SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
		    /* Subtree root (i.e. fn) may have one child */
		    || (children && fn->fn_flags&RTN_ROOT)
#endif
		    ) {
			fn->leaf = fib6_find_prefix(fn);
#if RT6_DEBUG >= 2
			if (fn->leaf==NULL) {
				BUG_TRAP(fn->leaf);
				fn->leaf = &ip6_null_entry;
			}
#endif
			atomic_inc(&fn->leaf->rt6i_ref);
			return fn->parent;
		}

		/* Unlink fn; its only child (if any) takes its place. */
		pn = fn->parent;
#ifdef CONFIG_IPV6_SUBTREES
		if (SUBTREE(pn) == fn) {
			BUG_TRAP(fn->fn_flags&RTN_ROOT);
			SUBTREE(pn) = NULL;
			nstate = FWS_L;
		} else {
			BUG_TRAP(!(fn->fn_flags&RTN_ROOT));
#endif
			if (pn->right == fn) pn->right = child;
			else if (pn->left == fn) pn->left = child;
#if RT6_DEBUG >= 2
			else BUG_TRAP(0);
#endif
			if (child)
				child->parent = pn;
			nstate = FWS_R;
#ifdef CONFIG_IPV6_SUBTREES
		}
#endif

		/* Move walkers off the node that is about to be freed. */
		read_lock(&fib6_walker_lock);
		FOR_WALKERS(w) {
			if (child == NULL) {
				if (w->root == fn) {
					/* the walk's whole tree is gone: end the walk */
					w->root = w->node = NULL;
					RT6_TRACE("W %p adjusted by delroot 1\n", w);
				} else if (w->node == fn) {
					RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
					w->node = pn;
					w->state = nstate;
				}
			} else {
				if (w->root == fn) {
					w->root = child;
					RT6_TRACE("W %p adjusted by delroot 2\n", w);
				}
				if (w->node == fn) {
					w->node = child;
					/*
					 * If the walker had already gone past
					 * the branch holding the child, resume
					 * by going up from it; otherwise start
					 * the child from scratch.
					 */
					if (children&2) {
						RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
						w->state = w->state>=FWS_R ? FWS_U : FWS_INIT;
					} else {
						RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
						w->state = w->state>=FWS_C ? FWS_U : FWS_INIT;
					}
				}
			}
		}
		read_unlock(&fib6_walker_lock);

		node_free(fn);
		if (pn->fn_flags&RTN_RTINFO || SUBTREE(pn))
			return pn;

		/* The parent is now a candidate for removal: drop its borrowed route and retry. */
		rt6_release(pn->leaf);
		pn->leaf = NULL;
		fn = pn;
	}
}
930
931static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700932 struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700933{
934 struct fib6_walker_t *w;
935 struct rt6_info *rt = *rtp;
936
937 RT6_TRACE("fib6_del_route\n");
938
939 /* Unlink it */
940 *rtp = rt->u.next;
941 rt->rt6i_node = NULL;
942 rt6_stats.fib_rt_entries--;
943 rt6_stats.fib_discarded_routes++;
944
945 /* Adjust walkers */
946 read_lock(&fib6_walker_lock);
947 FOR_WALKERS(w) {
948 if (w->state == FWS_C && w->leaf == rt) {
949 RT6_TRACE("walker %p adjusted by delroute\n", w);
950 w->leaf = rt->u.next;
951 if (w->leaf == NULL)
952 w->state = FWS_U;
953 }
954 }
955 read_unlock(&fib6_walker_lock);
956
957 rt->u.next = NULL;
958
959 if (fn->leaf == NULL && fn->fn_flags&RTN_TL_ROOT)
960 fn->leaf = &ip6_null_entry;
961
962 /* If it was last route, expunge its radix tree node */
963 if (fn->leaf == NULL) {
964 fn->fn_flags &= ~RTN_RTINFO;
965 rt6_stats.fib_route_nodes--;
966 fn = fib6_repair_tree(fn);
967 }
968
969 if (atomic_read(&rt->rt6i_ref) != 1) {
970 /* This route is used as dummy address holder in some split
971 * nodes. It is not leaked, but it still holds other resources,
972 * which must be released in time. So, scan ascendant nodes
973 * and replace dummy references to this route with references
974 * to still alive ones.
975 */
976 while (fn) {
977 if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) {
978 fn->leaf = fib6_find_prefix(fn);
979 atomic_inc(&fn->leaf->rt6i_ref);
980 rt6_release(rt);
981 }
982 fn = fn->parent;
983 }
984 /* No more references are possible at this point. */
985 if (atomic_read(&rt->rt6i_ref) != 1) BUG();
986 }
987
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700988 inet6_rt_notify(RTM_DELROUTE, rt, nlh, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700989 rt6_release(rt);
990}
991
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -0700992int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700993{
994 struct fib6_node *fn = rt->rt6i_node;
995 struct rt6_info **rtp;
996
997#if RT6_DEBUG >= 2
998 if (rt->u.dst.obsolete>0) {
999 BUG_TRAP(fn==NULL);
1000 return -ENOENT;
1001 }
1002#endif
1003 if (fn == NULL || rt == &ip6_null_entry)
1004 return -ENOENT;
1005
1006 BUG_TRAP(fn->fn_flags&RTN_RTINFO);
1007
1008 if (!(rt->rt6i_flags&RTF_CACHE))
1009 fib6_prune_clones(fn, rt);
1010
1011 /*
1012 * Walk the leaf entries looking for ourself
1013 */
1014
1015 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
1016 if (*rtp == rt) {
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001017 fib6_del_route(fn, rtp, nlh, _rtattr, req);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001018 return 0;
1019 }
1020 }
1021 return -ENOENT;
1022}
1023
1024/*
1025 * Tree traversal function.
1026 *
1027 * Certainly, it is not interrupt safe.
1028 * However, it is internally reenterable wrt itself and fib6_add/fib6_del.
1029 * It means, that we can modify tree during walking
1030 * and use this function for garbage collection, clone pruning,
1031 * cleaning tree when a device goes down etc. etc.
1032 *
1033 * It guarantees that every node will be traversed,
1034 * and that it will be traversed only once.
1035 *
1036 * Callback function w->func may return:
1037 * 0 -> continue walking.
1038 * positive value -> walking is suspended (used by tree dumps,
1039 * and probably by gc, if it will be split to several slices)
1040 * negative value -> terminate walking.
1041 *
1042 * The function itself returns:
1043 * 0 -> walk is complete.
1044 * >0 -> walk is incomplete (i.e. suspended)
1045 * <0 -> walk is terminated by an error.
1046 */
1047
/*
 *	Run (or resume) the walk described by w from w->node in w->state.
 *	Each case of the switch deliberately falls through to the next
 *	state when it has nothing left to do at the current one.
 */
int fib6_walk_continue(struct fib6_walker_t *w)
{
	struct fib6_node *fn, *pn;

	for (;;) {
		fn = w->node;
		/* w->node is cleared when the walker's root gets deleted */
		if (fn == NULL)
			return 0;

		/*
		 * Pruned walk: do not descend below a route-carrying node
		 * (other than the walk root); go straight to its routes.
		 */
		if (w->prune && fn != w->root &&
		    fn->fn_flags&RTN_RTINFO && w->state < FWS_C) {
			w->state = FWS_C;
			w->leaf = fn->leaf;
		}
		switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
		case FWS_S:
			if (SUBTREE(fn)) {
				w->node = SUBTREE(fn);
				continue;
			}
			w->state = FWS_L;
			/* fall through */
#endif
		case FWS_L:
			if (fn->left) {
				w->node = fn->left;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_R;
			/* fall through */
		case FWS_R:
			if (fn->right) {
				w->node = fn->right;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_C;
			w->leaf = fn->leaf;
			/* fall through */
		case FWS_C:
			/*
			 * Call the callback while routes remain; it is
			 * expected to advance w->leaf.  A non-zero result
			 * suspends or aborts the walk.
			 */
			if (w->leaf && fn->fn_flags&RTN_RTINFO) {
				int err = w->func(w);
				if (err)
					return err;
				continue;
			}
			w->state = FWS_U;
			/* fall through */
		case FWS_U:
			if (fn == w->root)
				return 0;
			pn = fn->parent;
			w->node = pn;
			/* the link we came up through decides what the parent does next */
#ifdef CONFIG_IPV6_SUBTREES
			if (SUBTREE(pn) == fn) {
				BUG_TRAP(fn->fn_flags&RTN_ROOT);
				w->state = FWS_L;
				continue;
			}
#endif
			if (pn->left == fn) {
				w->state = FWS_R;
				continue;
			}
			if (pn->right == fn) {
				w->state = FWS_C;
				w->leaf = w->node->leaf;
				continue;
			}
#if RT6_DEBUG >= 2
			BUG_TRAP(0);
#endif
		}
	}
}
1121
1122int fib6_walk(struct fib6_walker_t *w)
1123{
1124 int res;
1125
1126 w->state = FWS_INIT;
1127 w->node = w->root;
1128
1129 fib6_walker_link(w);
1130 res = fib6_walk_continue(w);
1131 if (res <= 0)
1132 fib6_walker_unlink(w);
1133 return res;
1134}
1135
1136static int fib6_clean_node(struct fib6_walker_t *w)
1137{
1138 int res;
1139 struct rt6_info *rt;
1140 struct fib6_cleaner_t *c = (struct fib6_cleaner_t*)w;
1141
1142 for (rt = w->leaf; rt; rt = rt->u.next) {
1143 res = c->func(rt, c->arg);
1144 if (res < 0) {
1145 w->leaf = rt;
Jamal Hadi Salim0d51aa82005-06-21 13:51:04 -07001146 res = fib6_del(rt, NULL, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001147 if (res) {
1148#if RT6_DEBUG >= 2
1149 printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
1150#endif
1151 continue;
1152 }
1153 return 0;
1154 }
1155 BUG_TRAP(res==0);
1156 }
1157 w->leaf = rt;
1158 return 0;
1159}
1160
1161/*
1162 * Convenient frontend to tree walker.
1163 *
1164 * func is called on each route.
1165 * It may return -1 -> delete this route.
1166 * 0 -> continue walking
1167 *
1168 * prune==1 -> only immediate children of node (certainly,
1169 * ignoring pure split nodes) will be scanned.
1170 */
1171
1172void fib6_clean_tree(struct fib6_node *root,
1173 int (*func)(struct rt6_info *, void *arg),
1174 int prune, void *arg)
1175{
1176 struct fib6_cleaner_t c;
1177
1178 c.w.root = root;
1179 c.w.func = fib6_clean_node;
1180 c.w.prune = prune;
1181 c.func = func;
1182 c.arg = arg;
1183
1184 fib6_walk(&c.w);
1185}
1186
Thomas Grafc71099a2006-08-04 23:20:06 -07001187void fib6_clean_all(int (*func)(struct rt6_info *, void *arg),
1188 int prune, void *arg)
1189{
1190 int i;
1191 struct fib6_table *table;
1192
1193 for (i = FIB6_TABLE_MIN; i <= FIB6_TABLE_MAX; i++) {
1194 table = fib6_get_table(i);
1195 if (table != NULL) {
1196 write_lock_bh(&table->tb6_lock);
1197 fib6_clean_tree(&table->tb6_root, func, prune, arg);
1198 write_unlock_bh(&table->tb6_lock);
1199 }
1200 }
1201}
1202
Linus Torvalds1da177e2005-04-16 15:20:36 -07001203static int fib6_prune_clone(struct rt6_info *rt, void *arg)
1204{
1205 if (rt->rt6i_flags & RTF_CACHE) {
1206 RT6_TRACE("pruning clone %p\n", rt);
1207 return -1;
1208 }
1209
1210 return 0;
1211}
1212
/*
 * Delete cached clone routes found by a pruned walk starting at fn.
 * rt is passed as the callback argument, which fib6_prune_clone()
 * ignores.
 */
static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt)
{
	fib6_clean_tree(fn, fib6_prune_clone, 1, rt);
}
1217
1218/*
1219 * Garbage collection
1220 */
1221
/*
 * State shared between fib6_run_gc() and its per-route callback
 * fib6_age() for one garbage-collection pass.
 */
static struct fib6_gc_args
{
	int timeout;	/* idle time after which an unused cache route is aged out */
	int more;	/* entries still awaiting collection; non-zero re-arms the timer */
} gc_args;
1227
1228static int fib6_age(struct rt6_info *rt, void *arg)
1229{
1230 unsigned long now = jiffies;
1231
1232 /*
1233 * check addrconf expiration here.
1234 * Routes are expired even if they are in use.
1235 *
1236 * Also age clones. Note, that clones are aged out
1237 * only if they are not in use now.
1238 */
1239
1240 if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) {
1241 if (time_after(now, rt->rt6i_expires)) {
1242 RT6_TRACE("expiring %p\n", rt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001243 return -1;
1244 }
1245 gc_args.more++;
1246 } else if (rt->rt6i_flags & RTF_CACHE) {
1247 if (atomic_read(&rt->u.dst.__refcnt) == 0 &&
1248 time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) {
1249 RT6_TRACE("aging clone %p\n", rt);
1250 return -1;
1251 } else if ((rt->rt6i_flags & RTF_GATEWAY) &&
1252 (!(rt->rt6i_nexthop->flags & NTF_ROUTER))) {
1253 RT6_TRACE("purging route %p via non-router but gateway\n",
1254 rt);
1255 return -1;
1256 }
1257 gc_args.more++;
1258 }
1259
1260 return 0;
1261}
1262
1263static DEFINE_SPINLOCK(fib6_gc_lock);
1264
1265void fib6_run_gc(unsigned long dummy)
1266{
1267 if (dummy != ~0UL) {
1268 spin_lock_bh(&fib6_gc_lock);
1269 gc_args.timeout = dummy ? (int)dummy : ip6_rt_gc_interval;
1270 } else {
1271 local_bh_disable();
1272 if (!spin_trylock(&fib6_gc_lock)) {
1273 mod_timer(&ip6_fib_timer, jiffies + HZ);
1274 local_bh_enable();
1275 return;
1276 }
1277 gc_args.timeout = ip6_rt_gc_interval;
1278 }
1279 gc_args.more = 0;
1280
Linus Torvalds1da177e2005-04-16 15:20:36 -07001281 ndisc_dst_gc(&gc_args.more);
Thomas Grafc71099a2006-08-04 23:20:06 -07001282 fib6_clean_all(fib6_age, 0, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001283
1284 if (gc_args.more)
1285 mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
1286 else {
1287 del_timer(&ip6_fib_timer);
1288 ip6_fib_timer.expires = 0;
1289 }
1290 spin_unlock_bh(&fib6_gc_lock);
1291}
1292
1293void __init fib6_init(void)
1294{
1295 fib6_node_kmem = kmem_cache_create("fib6_nodes",
1296 sizeof(struct fib6_node),
1297 0, SLAB_HWCACHE_ALIGN,
1298 NULL, NULL);
1299 if (!fib6_node_kmem)
1300 panic("cannot create fib6_nodes cache");
Thomas Grafc71099a2006-08-04 23:20:06 -07001301
1302 fib6_tables_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001303}
1304
/* Teardown: stop the garbage-collection timer and destroy the node cache. */
void fib6_gc_cleanup(void)
{
	del_timer(&ip6_fib_timer);
	kmem_cache_destroy(fib6_node_kmem);
}