/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/swap.h>
#include <linux/seq_file.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/mutex.h>

#include <net/ip.h>
#include <net/route.h>
#include <net/sock.h>

#include <asm/uaccess.h>

#include <net/ip_vs.h>

/* mutex for IPVS sockopts. And, [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);

/* lock for service table */
static DEFINE_RWLOCK(__ip_vs_svc_lock);

/* lock for table with the real services */
static DEFINE_RWLOCK(__ip_vs_rs_lock);

/* lock for state and timeout tables */
static DEFINE_RWLOCK(__ip_vs_securetcp_lock);

/* lock for drop entry handling */
static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);

/* lock for drop packet handling */
static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);

/* 1/rate drop and drop-entry variables */
int ip_vs_drop_rate = 0;
int ip_vs_drop_counter = 0;
static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);

/* number of virtual services */
static int ip_vs_num_services = 0;

/* sysctl variables */
static int sysctl_ip_vs_drop_entry = 0;
static int sysctl_ip_vs_drop_packet = 0;
static int sysctl_ip_vs_secure_tcp = 0;
static int sysctl_ip_vs_amemthresh = 1024;
static int sysctl_ip_vs_am_droprate = 10;
int sysctl_ip_vs_cache_bypass = 0;
int sysctl_ip_vs_expire_nodest_conn = 0;
int sysctl_ip_vs_expire_quiescent_template = 0;
int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
int sysctl_ip_vs_nat_icmp_send = 0;


#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;

int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
#endif

/*
 *	update_defense_level is called from keventd and from sysctl,
 *	so it needs to protect itself from softirqs
 */
static void update_defense_level(void)
{
	struct sysinfo i;
	static int old_secure_tcp = 0;
	int availmem;
	int nomem;
	int to_change = -1;

	/* we only count free and buffered memory (in pages) */
	si_meminfo(&i);
	availmem = i.freeram + i.bufferram;
	/* however in linux 2.5 the i.bufferram is total page cache size,
	   we need to adjust it */
	/* si_swapinfo(&i); */
	/* availmem = availmem - (i.totalswap - i.freeswap); */

	nomem = (availmem < sysctl_ip_vs_amemthresh);

	local_bh_disable();

	/* drop_entry */
	spin_lock(&__ip_vs_dropentry_lock);
	switch (sysctl_ip_vs_drop_entry) {
	case 0:
		atomic_set(&ip_vs_dropentry, 0);
		break;
	case 1:
		if (nomem) {
			atomic_set(&ip_vs_dropentry, 1);
			sysctl_ip_vs_drop_entry = 2;
		} else {
			atomic_set(&ip_vs_dropentry, 0);
		}
		break;
	case 2:
		if (nomem) {
			atomic_set(&ip_vs_dropentry, 1);
		} else {
			atomic_set(&ip_vs_dropentry, 0);
			sysctl_ip_vs_drop_entry = 1;
		}
		break;
	case 3:
		atomic_set(&ip_vs_dropentry, 1);
		break;
	}
	spin_unlock(&__ip_vs_dropentry_lock);

	/* drop_packet */
	spin_lock(&__ip_vs_droppacket_lock);
	switch (sysctl_ip_vs_drop_packet) {
	case 0:
		ip_vs_drop_rate = 0;
		break;
	case 1:
		if (nomem) {
			ip_vs_drop_rate = ip_vs_drop_counter
				= sysctl_ip_vs_amemthresh /
				(sysctl_ip_vs_amemthresh-availmem);
			sysctl_ip_vs_drop_packet = 2;
		} else {
			ip_vs_drop_rate = 0;
		}
		break;
	case 2:
		if (nomem) {
			ip_vs_drop_rate = ip_vs_drop_counter
				= sysctl_ip_vs_amemthresh /
				(sysctl_ip_vs_amemthresh-availmem);
		} else {
			ip_vs_drop_rate = 0;
			sysctl_ip_vs_drop_packet = 1;
		}
		break;
	case 3:
		ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
		break;
	}
	spin_unlock(&__ip_vs_droppacket_lock);

	/* secure_tcp */
	write_lock(&__ip_vs_securetcp_lock);
	switch (sysctl_ip_vs_secure_tcp) {
	case 0:
		if (old_secure_tcp >= 2)
			to_change = 0;
		break;
	case 1:
		if (nomem) {
			if (old_secure_tcp < 2)
				to_change = 1;
			sysctl_ip_vs_secure_tcp = 2;
		} else {
			if (old_secure_tcp >= 2)
				to_change = 0;
		}
		break;
	case 2:
		if (nomem) {
			if (old_secure_tcp < 2)
				to_change = 1;
		} else {
			if (old_secure_tcp >= 2)
				to_change = 0;
			sysctl_ip_vs_secure_tcp = 1;
		}
		break;
	case 3:
		if (old_secure_tcp < 2)
			to_change = 1;
		break;
	}
	old_secure_tcp = sysctl_ip_vs_secure_tcp;
	if (to_change >= 0)
		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
	write_unlock(&__ip_vs_securetcp_lock);

	local_bh_enable();
}


/*
 *	Timer for checking the defense
 */
#define DEFENSE_TIMER_PERIOD	1*HZ
static void defense_work_handler(void *data);
static DECLARE_WORK(defense_work, defense_work_handler, NULL);

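/*
 *	Re-evaluate the defense level and randomly drop connection
 *	entries when needed; the work re-arms itself every
 *	DEFENSE_TIMER_PERIOD (one second).
 */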
static void defense_work_handler(void *data)
{
	update_defense_level();
	if (atomic_read(&ip_vs_dropentry))
		ip_vs_random_dropentry();

	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
}

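/*
 *	Hold a reference on the IPVS module (THIS_MODULE) while it is
 *	in use, so that it cannot be unloaded from under its users.
 */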
int
ip_vs_use_count_inc(void)
{
	return try_module_get(THIS_MODULE);
}

void
ip_vs_use_count_dec(void)
{
	module_put(THIS_MODULE);
}


/*
 *	Hash table: for virtual service lookups
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)

/* the service table hashed by <protocol, addr, port> */
static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];

/*
 *	Hash table: for real service lookups
 */
#define IP_VS_RTAB_BITS 4
#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)

static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];

/*
 *	Trash for destinations
 */
static LIST_HEAD(ip_vs_dest_trash);

/*
 *	FTP & NULL virtual service counters
 */
static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);


/*
 *	Returns hash value for virtual service
 */
static __inline__ unsigned
ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
{
	register unsigned porth = ntohs(port);

	return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
		& IP_VS_SVC_TAB_MASK;
}

/*
 *	Returns hash value of fwmark for virtual service lookup
 */
static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
{
	return fwmark & IP_VS_SVC_TAB_MASK;
}

/*
 *	Hashes a service in the ip_vs_svc_table by <proto,addr,port>
 *	or in the ip_vs_svc_fwm_table by fwmark.
 *	Should be called with locked tables.
 */
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
	unsigned hash;

	if (svc->flags & IP_VS_SVC_F_HASHED) {
		IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	if (svc->fwmark == 0) {
		/*
		 *  Hash it by <protocol,addr,port> in ip_vs_svc_table
		 */
		hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
		list_add(&svc->s_list, &ip_vs_svc_table[hash]);
	} else {
		/*
		 *  Hash it by fwmark in ip_vs_svc_fwm_table
		 */
		hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
		list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
	}

	svc->flags |= IP_VS_SVC_F_HASHED;
	/* increase its refcnt because it is referenced by the svc table */
	atomic_inc(&svc->refcnt);
	return 1;
}


/*
 *	Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
 *	Should be called with locked tables.
 */
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
		IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	if (svc->fwmark == 0) {
		/* Remove it from the ip_vs_svc_table table */
		list_del(&svc->s_list);
	} else {
		/* Remove it from the ip_vs_svc_fwm_table table */
		list_del(&svc->f_list);
	}

	svc->flags &= ~IP_VS_SVC_F_HASHED;
	atomic_dec(&svc->refcnt);
	return 1;
}


/*
 *	Get service by {proto,addr,port} in the service table.
 */
static __inline__ struct ip_vs_service *
__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
{
	unsigned hash;
	struct ip_vs_service *svc;

	/* Check for "full" addressed entries */
	hash = ip_vs_svc_hashkey(protocol, vaddr, vport);

	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
		if ((svc->addr == vaddr)
		    && (svc->port == vport)
		    && (svc->protocol == protocol)) {
			/* HIT */
			atomic_inc(&svc->usecnt);
			return svc;
		}
	}

	return NULL;
}


/*
 *	Get service by {fwmark} in the service table.
 */
static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
{
	unsigned hash;
	struct ip_vs_service *svc;

	/* Check for fwmark addressed entries */
	hash = ip_vs_svc_fwm_hashkey(fwmark);

	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
		if (svc->fwmark == fwmark) {
			/* HIT */
			atomic_inc(&svc->usecnt);
			return svc;
		}
	}

	return NULL;
}

struct ip_vs_service *
ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
{
	struct ip_vs_service *svc;

	read_lock(&__ip_vs_svc_lock);

	/*
	 *	Check the table hashed by fwmark first
	 */
	if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
		goto out;

	/*
	 *	Check the table hashed by <protocol,addr,port>
	 *	for "full" addressed entries
	 */
	svc = __ip_vs_service_get(protocol, vaddr, vport);

	if (svc == NULL
	    && protocol == IPPROTO_TCP
	    && atomic_read(&ip_vs_ftpsvc_counter)
	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
		/*
		 * Check if ftp service entry exists, the packet
		 * might belong to FTP data connections.
		 */
		svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
	}

	if (svc == NULL
	    && atomic_read(&ip_vs_nullsvc_counter)) {
		/*
		 * Check if the catch-all port (port zero) exists
		 */
		svc = __ip_vs_service_get(protocol, vaddr, 0);
	}

  out:
	read_unlock(&__ip_vs_svc_lock);

	IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
		  fwmark, ip_vs_proto_name(protocol),
		  NIPQUAD(vaddr), ntohs(vport),
		  svc?"hit":"not hit");

	return svc;
}

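/*
 *	Bind a destination to its virtual service and keep the service
 *	refcnt in sync; unbinding drops the reference and frees the
 *	service once the last reference is gone.
 */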
static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	atomic_inc(&svc->refcnt);
	dest->svc = svc;
}

static inline void
__ip_vs_unbind_svc(struct ip_vs_dest *dest)
{
	struct ip_vs_service *svc = dest->svc;

	dest->svc = NULL;
	if (atomic_dec_and_test(&svc->refcnt))
		kfree(svc);
}


/*
 *	Returns hash value for real service
 */
static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
{
	register unsigned porth = ntohs(port);

	return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
		& IP_VS_RTAB_MASK;
}

/*
 *	Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
 *	should be called with locked tables.
 */
static int ip_vs_rs_hash(struct ip_vs_dest *dest)
{
	unsigned hash;

	if (!list_empty(&dest->d_list)) {
		return 0;
	}

	/*
	 *	Hash by proto,addr,port,
	 *	which are the parameters of the real service.
	 */
	hash = ip_vs_rs_hashkey(dest->addr, dest->port);
	list_add(&dest->d_list, &ip_vs_rtable[hash]);

	return 1;
}

/*
 *	UNhashes ip_vs_dest from ip_vs_rtable.
 *	should be called with locked tables.
 */
static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
	/*
	 * Remove it from the ip_vs_rtable table.
	 */
	if (!list_empty(&dest->d_list)) {
		list_del(&dest->d_list);
		INIT_LIST_HEAD(&dest->d_list);
	}

	return 1;
}

/*
 *	Lookup real service by <proto,addr,port> in the real service table.
 */
struct ip_vs_dest *
ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
{
	unsigned hash;
	struct ip_vs_dest *dest;

	/*
	 *	Check for "full" addressed entries
	 *	Return the first found entry
	 */
	hash = ip_vs_rs_hashkey(daddr, dport);

	read_lock(&__ip_vs_rs_lock);
	list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
		if ((dest->addr == daddr)
		    && (dest->port == dport)
		    && ((dest->protocol == protocol) ||
			dest->vfwmark)) {
			/* HIT */
			read_unlock(&__ip_vs_rs_lock);
			return dest;
		}
	}
	read_unlock(&__ip_vs_rs_lock);

	return NULL;
}

/*
 *	Lookup destination by {addr,port} in the given service
 */
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
{
	struct ip_vs_dest *dest;

	/*
	 * Find the destination for the given service
	 */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if ((dest->addr == daddr) && (dest->port == dport)) {
			/* HIT */
			return dest;
		}
	}

	return NULL;
}


/*
 *	Lookup dest by {svc,addr,port} in the destination trash.
 *	The destination trash is used to hold the destinations that are removed
 *	from the service table but are still referenced by some conn entries.
 *	The reason to add the destination trash is that when the dest is
 *	temporarily down (either by administrator or by monitor program),
 *	the dest can be picked back from the trash, the remaining connections
 *	to the dest can continue, and the counting information of the dest is
 *	also useful for scheduling.
 */
static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
{
	struct ip_vs_dest *dest, *nxt;

	/*
	 * Find the destination in trash
	 */
	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
		IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
			  "dest->refcnt=%d\n",
			  dest->vfwmark,
			  NIPQUAD(dest->addr), ntohs(dest->port),
			  atomic_read(&dest->refcnt));
		if (dest->addr == daddr &&
		    dest->port == dport &&
		    dest->vfwmark == svc->fwmark &&
		    dest->protocol == svc->protocol &&
		    (svc->fwmark ||
		     (dest->vaddr == svc->addr &&
		      dest->vport == svc->port))) {
			/* HIT */
			return dest;
		}

		/*
		 * Try to purge the destination from trash if not referenced
		 */
		if (atomic_read(&dest->refcnt) == 1) {
			IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
				  "from trash\n",
				  dest->vfwmark,
				  NIPQUAD(dest->addr), ntohs(dest->port));
			list_del(&dest->n_list);
			ip_vs_dst_reset(dest);
			__ip_vs_unbind_svc(dest);
			kfree(dest);
		}
	}

	return NULL;
}

/*
 *	Clean up all the destinations in the trash
 *	Called by the ip_vs_control_cleanup()
 *
 *	When the ip_vs_control_cleanup is activated by ipvs module exit,
 *	the service tables must have been flushed and all the connections
 *	are expired, and the refcnt of each destination in the trash must
 *	be 1, so we simply release them here.
 */
static void ip_vs_trash_cleanup(void)
{
	struct ip_vs_dest *dest, *nxt;

	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
		list_del(&dest->n_list);
		ip_vs_dst_reset(dest);
		__ip_vs_unbind_svc(dest);
		kfree(dest);
	}
}

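/*
 *	Zero everything in a stats structure up to the embedded lock
 *	and reset its rate estimator.
 */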
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
	spin_lock_bh(&stats->lock);
	memset(stats, 0, (char *)&stats->lock - (char *)stats);
	spin_unlock_bh(&stats->lock);
	ip_vs_zero_estimator(stats);
}

/*
 *	Update a destination in the given service
 */
static void
__ip_vs_update_dest(struct ip_vs_service *svc,
		    struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
{
	int conn_flags;

	/* set the weight and the flags */
	atomic_set(&dest->weight, udest->weight);
	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;

	/* check if local node and update the flags */
	if (inet_addr_type(udest->addr) == RTN_LOCAL) {
		conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
			| IP_VS_CONN_F_LOCALNODE;
	}

	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
	} else {
		/*
		 *    Put the real service in ip_vs_rtable if not present.
		 *    For now only for NAT!
		 */
		write_lock_bh(&__ip_vs_rs_lock);
		ip_vs_rs_hash(dest);
		write_unlock_bh(&__ip_vs_rs_lock);
	}
	atomic_set(&dest->conn_flags, conn_flags);

	/* bind the service */
	if (!dest->svc) {
		__ip_vs_bind_svc(dest, svc);
	} else {
		if (dest->svc != svc) {
			__ip_vs_unbind_svc(dest);
			ip_vs_zero_stats(&dest->stats);
			__ip_vs_bind_svc(dest, svc);
		}
	}

	/* set the dest status flags */
	dest->flags |= IP_VS_DEST_F_AVAILABLE;

	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	dest->u_threshold = udest->u_threshold;
	dest->l_threshold = udest->l_threshold;
}


/*
 *	Create a destination for the given service
 */
static int
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
	       struct ip_vs_dest **dest_p)
{
	struct ip_vs_dest *dest;
	unsigned atype;

	EnterFunction(2);

	atype = inet_addr_type(udest->addr);
	if (atype != RTN_LOCAL && atype != RTN_UNICAST)
		return -EINVAL;

	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
	if (dest == NULL) {
		IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
		return -ENOMEM;
	}

	dest->protocol = svc->protocol;
	dest->vaddr = svc->addr;
	dest->vport = svc->port;
	dest->vfwmark = svc->fwmark;
	dest->addr = udest->addr;
	dest->port = udest->port;

	atomic_set(&dest->activeconns, 0);
	atomic_set(&dest->inactconns, 0);
	atomic_set(&dest->persistconns, 0);
	atomic_set(&dest->refcnt, 0);

	INIT_LIST_HEAD(&dest->d_list);
	spin_lock_init(&dest->dst_lock);
	spin_lock_init(&dest->stats.lock);
	__ip_vs_update_dest(svc, dest, udest);
	ip_vs_new_estimator(&dest->stats);

	*dest_p = dest;

	LeaveFunction(2);
	return 0;
}


/*
 *	Add a destination into an existing service
 */
static int
ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
{
	struct ip_vs_dest *dest;
	__u32 daddr = udest->addr;
	__u16 dport = udest->port;
	int ret;

	EnterFunction(2);

	if (udest->weight < 0) {
		IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
		return -ERANGE;
	}

	if (udest->l_threshold > udest->u_threshold) {
		IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
			  "upper threshold\n");
		return -ERANGE;
	}

	/*
	 * Check if the dest already exists in the list
	 */
	dest = ip_vs_lookup_dest(svc, daddr, dport);
	if (dest != NULL) {
		IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
		return -EEXIST;
	}

	/*
	 * Check if the dest already exists in the trash and
	 * is from the same service
	 */
	dest = ip_vs_trash_get_dest(svc, daddr, dport);
	if (dest != NULL) {
		IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
			  "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
			  NIPQUAD(daddr), ntohs(dport),
			  atomic_read(&dest->refcnt),
			  dest->vfwmark,
			  NIPQUAD(dest->vaddr),
			  ntohs(dest->vport));
		__ip_vs_update_dest(svc, dest, udest);

		/*
		 * Get the destination from the trash
		 */
		list_del(&dest->n_list);

		ip_vs_new_estimator(&dest->stats);

		write_lock_bh(&__ip_vs_svc_lock);

		/*
		 * Wait until all other svc users go away.
		 */
		IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

		list_add(&dest->n_list, &svc->destinations);
		svc->num_dests++;

		/* call the update_service function of its scheduler */
		svc->scheduler->update_service(svc);

		write_unlock_bh(&__ip_vs_svc_lock);
		return 0;
	}

	/*
	 * Allocate and initialize the dest structure
	 */
	ret = ip_vs_new_dest(svc, udest, &dest);
	if (ret) {
		return ret;
	}

	/*
	 * Add the dest entry into the list
	 */
	atomic_inc(&dest->refcnt);

	write_lock_bh(&__ip_vs_svc_lock);

	/*
	 * Wait until all other svc users go away.
	 */
	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

	list_add(&dest->n_list, &svc->destinations);
	svc->num_dests++;

	/* call the update_service function of its scheduler */
	svc->scheduler->update_service(svc);

	write_unlock_bh(&__ip_vs_svc_lock);

	LeaveFunction(2);

	return 0;
}


/*
 *	Edit a destination in the given service
 */
static int
ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
{
	struct ip_vs_dest *dest;
	__u32 daddr = udest->addr;
	__u16 dport = udest->port;

	EnterFunction(2);

	if (udest->weight < 0) {
		IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
		return -ERANGE;
	}

	if (udest->l_threshold > udest->u_threshold) {
		IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
			  "upper threshold\n");
		return -ERANGE;
	}

	/*
	 *  Lookup the destination list
	 */
	dest = ip_vs_lookup_dest(svc, daddr, dport);
	if (dest == NULL) {
		IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
		return -ENOENT;
	}

	__ip_vs_update_dest(svc, dest, udest);

	write_lock_bh(&__ip_vs_svc_lock);

	/* Wait until all other svc users go away */
	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

	/* call the update_service, because server weight may be changed */
	svc->scheduler->update_service(svc);

	write_unlock_bh(&__ip_vs_svc_lock);

	LeaveFunction(2);

	return 0;
}


/*
 *	Delete a destination (must be already unlinked from the service)
 */
static void __ip_vs_del_dest(struct ip_vs_dest *dest)
{
	ip_vs_kill_estimator(&dest->stats);

	/*
	 *  Remove it from the d-linked list with the real services.
	 */
	write_lock_bh(&__ip_vs_rs_lock);
	ip_vs_rs_unhash(dest);
	write_unlock_bh(&__ip_vs_rs_lock);

	/*
	 *  Decrease the refcnt of the dest, and free the dest
	 *  if nobody refers to it (refcnt=0). Otherwise, throw
	 *  the destination into the trash.
	 */
	if (atomic_dec_and_test(&dest->refcnt)) {
		ip_vs_dst_reset(dest);
		/* simply decrease svc->refcnt here, let the caller check
		   and release the service if nobody refers to it.
		   Only user context can release destination and service,
		   and only one user context can update virtual service at a
		   time, so the operation here is OK */
		atomic_dec(&dest->svc->refcnt);
		kfree(dest);
	} else {
		IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
			  "dest->refcnt=%d\n",
			  NIPQUAD(dest->addr), ntohs(dest->port),
			  atomic_read(&dest->refcnt));
		list_add(&dest->n_list, &ip_vs_dest_trash);
		atomic_inc(&dest->refcnt);
	}
}


/*
 *	Unlink a destination from the given service
 */
static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
				struct ip_vs_dest *dest,
				int svcupd)
{
	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;

	/*
	 *  Remove it from the d-linked destination list.
	 */
	list_del(&dest->n_list);
	svc->num_dests--;
	if (svcupd) {
		/*
		 *  Call the update_service function of its scheduler
		 */
		svc->scheduler->update_service(svc);
	}
}


/*
 *	Delete a destination server in the given service
 */
static int
ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
{
	struct ip_vs_dest *dest;
	__u32 daddr = udest->addr;
	__u16 dport = udest->port;

	EnterFunction(2);

	dest = ip_vs_lookup_dest(svc, daddr, dport);
	if (dest == NULL) {
		IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
		return -ENOENT;
	}

	write_lock_bh(&__ip_vs_svc_lock);

	/*
	 *	Wait until all other svc users go away.
	 */
	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

	/*
	 *	Unlink dest from the service
	 */
	__ip_vs_unlink_dest(svc, dest, 1);

	write_unlock_bh(&__ip_vs_svc_lock);

	/*
	 *	Delete the destination
	 */
	__ip_vs_del_dest(dest);

	LeaveFunction(2);

	return 0;
}


/*
 *	Add a service into the service hash table
 */
static int
ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
{
	int ret = 0;
	struct ip_vs_scheduler *sched = NULL;
	struct ip_vs_service *svc = NULL;

	/* increase the module use count */
	ip_vs_use_count_inc();

	/* Lookup the scheduler by 'u->sched_name' */
	sched = ip_vs_scheduler_get(u->sched_name);
	if (sched == NULL) {
		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
			   u->sched_name);
		ret = -ENOENT;
		goto out_mod_dec;
	}

	svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
	if (svc == NULL) {
		IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
		ret = -ENOMEM;
		goto out_err;
	}

	/* I'm the first user of the service */
	atomic_set(&svc->usecnt, 1);
	atomic_set(&svc->refcnt, 0);

	svc->protocol = u->protocol;
	svc->addr = u->addr;
	svc->port = u->port;
	svc->fwmark = u->fwmark;
	svc->flags = u->flags;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;

	INIT_LIST_HEAD(&svc->destinations);
	rwlock_init(&svc->sched_lock);
	spin_lock_init(&svc->stats.lock);

	/* Bind the scheduler */
	ret = ip_vs_bind_scheduler(svc, sched);
	if (ret)
		goto out_err;
	sched = NULL;

	/* Update the virtual service counters */
	if (svc->port == FTPPORT)
		atomic_inc(&ip_vs_ftpsvc_counter);
	else if (svc->port == 0)
		atomic_inc(&ip_vs_nullsvc_counter);

	ip_vs_new_estimator(&svc->stats);
	ip_vs_num_services++;

	/* Hash the service into the service table */
	write_lock_bh(&__ip_vs_svc_lock);
	ip_vs_svc_hash(svc);
	write_unlock_bh(&__ip_vs_svc_lock);

	*svc_p = svc;
	return 0;

  out_err:
	if (svc != NULL) {
		if (svc->scheduler)
			ip_vs_unbind_scheduler(svc);
		if (svc->inc) {
			local_bh_disable();
			ip_vs_app_inc_put(svc->inc);
			local_bh_enable();
		}
		kfree(svc);
	}
	ip_vs_scheduler_put(sched);

  out_mod_dec:
	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}


/*
 *	Edit a service and bind it with a new scheduler
 */
static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
{
	struct ip_vs_scheduler *sched, *old_sched;
	int ret = 0;

	/*
	 * Lookup the scheduler, by 'u->sched_name'
	 */
	sched = ip_vs_scheduler_get(u->sched_name);
	if (sched == NULL) {
		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
			   u->sched_name);
		return -ENOENT;
	}
	old_sched = sched;

	write_lock_bh(&__ip_vs_svc_lock);

	/*
	 * Wait until all other svc users go away.
	 */
	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

	/*
	 * Set the flags and timeout value
	 */
	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;

	old_sched = svc->scheduler;
	if (sched != old_sched) {
		/*
		 * Unbind the old scheduler
		 */
		if ((ret = ip_vs_unbind_scheduler(svc))) {
			old_sched = sched;
			goto out;
		}

		/*
		 * Bind the new scheduler
		 */
		if ((ret = ip_vs_bind_scheduler(svc, sched))) {
			/*
			 * If ip_vs_bind_scheduler fails, restore the old
			 * scheduler.
			 * The main reason for failure is lack of memory.
			 *
			 * The question is if the old scheduler can be
			 * restored all the time. TODO: if it cannot be
			 * restored some time, we must delete the service,
			 * otherwise the system may crash.
			 */
			ip_vs_bind_scheduler(svc, old_sched);
			old_sched = sched;
			goto out;
		}
	}

  out:
	write_unlock_bh(&__ip_vs_svc_lock);

	if (old_sched)
		ip_vs_scheduler_put(old_sched);

	return ret;
}


/*
 *	Delete a service from the service list
 *	- The service must be unlinked, unlocked and not referenced!
 *	- We are called under _bh lock
 */
static void __ip_vs_del_service(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *nxt;
	struct ip_vs_scheduler *old_sched;

	ip_vs_num_services--;
	ip_vs_kill_estimator(&svc->stats);

	/* Unbind scheduler */
	old_sched = svc->scheduler;
	ip_vs_unbind_scheduler(svc);
	if (old_sched)
		ip_vs_scheduler_put(old_sched);

	/* Unbind app inc */
	if (svc->inc) {
		ip_vs_app_inc_put(svc->inc);
		svc->inc = NULL;
	}

	/*
	 *    Unlink the whole destination list
	 */
	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
		__ip_vs_unlink_dest(svc, dest, 0);
		__ip_vs_del_dest(dest);
	}

	/*
	 *    Update the virtual service counters
	 */
	if (svc->port == FTPPORT)
		atomic_dec(&ip_vs_ftpsvc_counter);
	else if (svc->port == 0)
		atomic_dec(&ip_vs_nullsvc_counter);

	/*
	 *    Free the service if nobody refers to it
	 */
	if (atomic_read(&svc->refcnt) == 0)
		kfree(svc);

	/* decrease the module use count */
	ip_vs_use_count_dec();
}

/*
 *	Delete a service from the service list
 */
static int ip_vs_del_service(struct ip_vs_service *svc)
{
	if (svc == NULL)
		return -EEXIST;

	/*
	 * Unhash it from the service table
	 */
	write_lock_bh(&__ip_vs_svc_lock);

	ip_vs_svc_unhash(svc);

	/*
	 * Wait until all the svc users go away.
	 */
	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);

	__ip_vs_del_service(svc);

	write_unlock_bh(&__ip_vs_svc_lock);

	return 0;
}


/*
 *	Flush all the virtual services
 */
static int ip_vs_flush(void)
{
	int idx;
	struct ip_vs_service *svc, *nxt;

	/*
	 * Flush the service table hashed by <protocol,addr,port>
	 */
	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
			write_lock_bh(&__ip_vs_svc_lock);
			ip_vs_svc_unhash(svc);
			/*
			 * Wait until all the svc users go away.
			 */
			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
			__ip_vs_del_service(svc);
			write_unlock_bh(&__ip_vs_svc_lock);
		}
	}

	/*
	 * Flush the service table hashed by fwmark
	 */
	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry_safe(svc, nxt,
					 &ip_vs_svc_fwm_table[idx], f_list) {
			write_lock_bh(&__ip_vs_svc_lock);
			ip_vs_svc_unhash(svc);
			/*
			 * Wait until all the svc users go away.
			 */
			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
			__ip_vs_del_service(svc);
			write_unlock_bh(&__ip_vs_svc_lock);
		}
	}

	return 0;
}


/*
 *	Zero counters in a service or all services
 */
static int ip_vs_zero_service(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest;

	write_lock_bh(&__ip_vs_svc_lock);
	list_for_each_entry(dest, &svc->destinations, n_list) {
		ip_vs_zero_stats(&dest->stats);
	}
	ip_vs_zero_stats(&svc->stats);
	write_unlock_bh(&__ip_vs_svc_lock);
	return 0;
}

static int ip_vs_zero_all(void)
{
	int idx;
	struct ip_vs_service *svc;

	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
			ip_vs_zero_service(svc);
		}
	}

	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
			ip_vs_zero_service(svc);
		}
	}

	ip_vs_zero_stats(&ip_vs_stats);
	return 0;
}

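/*
 *	sysctl handler for the drop_entry, drop_packet and secure_tcp
 *	defense strategies: only values 0..3 are accepted, and the
 *	defense level is re-evaluated whenever the value changes.
 */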
static int
proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
		     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val = *valp;
	int rc;

	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if ((*valp < 0) || (*valp > 3)) {
			/* Restore the correct value */
			*valp = val;
		} else {
			update_defense_level();
		}
	}
	return rc;
}

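/*
 *	sysctl handler for sync_threshold: writes are rejected (and the
 *	previous pair restored) when either value is negative or the
 *	first value is not smaller than the second.
 */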
static int
proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
		       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val[2];
	int rc;

	/* backup the value first */
	memcpy(val, valp, sizeof(val));

	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
	if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
		/* Restore the correct value */
		memcpy(valp, val, sizeof(val));
	}
	return rc;
}


/*
 *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
 */

static struct ctl_table vs_vars[] = {
	{
		.ctl_name	= NET_IPV4_VS_AMEMTHRESH,
		.procname	= "amemthresh",
		.data		= &sysctl_ip_vs_amemthresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
#ifdef CONFIG_IP_VS_DEBUG
	{
		.ctl_name	= NET_IPV4_VS_DEBUG_LEVEL,
		.procname	= "debug_level",
		.data		= &sysctl_ip_vs_debug_level,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
#endif
	{
		.ctl_name	= NET_IPV4_VS_AMDROPRATE,
		.procname	= "am_droprate",
		.data		= &sysctl_ip_vs_am_droprate,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_VS_DROP_ENTRY,
		.procname	= "drop_entry",
		.data		= &sysctl_ip_vs_drop_entry,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_do_defense_mode,
	},
	{
		.ctl_name	= NET_IPV4_VS_DROP_PACKET,
		.procname	= "drop_packet",
		.data		= &sysctl_ip_vs_drop_packet,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_do_defense_mode,
	},
	{
		.ctl_name	= NET_IPV4_VS_SECURE_TCP,
		.procname	= "secure_tcp",
		.data		= &sysctl_ip_vs_secure_tcp,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_do_defense_mode,
	},
#if 0
	{
		.ctl_name	= NET_IPV4_VS_TO_ES,
		.procname	= "timeout_established",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_SS,
		.procname	= "timeout_synsent",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_SR,
		.procname	= "timeout_synrecv",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_FW,
		.procname	= "timeout_finwait",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_TW,
		.procname	= "timeout_timewait",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_CL,
		.procname	= "timeout_close",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_CW,
		.procname	= "timeout_closewait",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_LA,
		.procname	= "timeout_lastack",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_LI,
		.procname	= "timeout_listen",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_SA,
		.procname	= "timeout_synack",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_UDP,
		.procname	= "timeout_udp",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_VS_TO_ICMP,
		.procname	= "timeout_icmp",
		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
#endif
	{
		.ctl_name	= NET_IPV4_VS_CACHE_BYPASS,
		.procname	= "cache_bypass",
		.data		= &sysctl_ip_vs_cache_bypass,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_VS_EXPIRE_NODEST_CONN,
		.procname	= "expire_nodest_conn",
		.data		= &sysctl_ip_vs_expire_nodest_conn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
		.procname	= "expire_quiescent_template",
		.data		= &sysctl_ip_vs_expire_quiescent_template,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_VS_SYNC_THRESHOLD,
		.procname	= "sync_threshold",
		.data		= &sysctl_ip_vs_sync_threshold,
		.maxlen		= sizeof(sysctl_ip_vs_sync_threshold),
		.mode		= 0644,
		.proc_handler	= &proc_do_sync_threshold,
	},
	{
		.ctl_name	= NET_IPV4_VS_NAT_ICMP_SEND,
		.procname	= "nat_icmp_send",
		.data		= &sysctl_ip_vs_nat_icmp_send,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table vs_table[] = {
	{
		.ctl_name	= NET_IPV4_VS,
		.procname	= "vs",
		.mode		= 0555,
		.child		= vs_vars
	},
	{ .ctl_name = 0 }
};

static ctl_table ipvs_ipv4_table[] = {
	{
		.ctl_name	= NET_IPV4,
		.procname	= "ipv4",
		.mode		= 0555,
		.child		= vs_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table vs_root_table[] = {
	{
		.ctl_name	= CTL_NET,
		.procname	= "net",
		.mode		= 0555,
		.child		= ipvs_ipv4_table,
	},
	{ .ctl_name = 0 }
};

static struct ctl_table_header * sysctl_header;

#ifdef CONFIG_PROC_FS

struct ip_vs_iter {
	struct list_head *table;
	int bucket;
};

/*
 *	Write the contents of the VS rule table to a PROCfs file.
 *	(It is kept just for backward compatibility)
 */
static inline const char *ip_vs_fwd_name(unsigned flags)
{
	switch (flags & IP_VS_CONN_F_FWD_MASK) {
	case IP_VS_CONN_F_LOCALNODE:
		return "Local";
	case IP_VS_CONN_F_TUNNEL:
		return "Tunnel";
	case IP_VS_CONN_F_DROUTE:
		return "Route";
	default:
		return "Masq";
	}
}


/* Get the Nth entry in the two lists */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
	struct ip_vs_iter *iter = seq->private;
	int idx;
	struct ip_vs_service *svc;

	/* look in hash by protocol */
	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
			if (pos-- == 0){
				iter->table = ip_vs_svc_table;
				iter->bucket = idx;
				return svc;
			}
		}
	}

	/* keep looking in fwmark */
	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
			if (pos-- == 0) {
				iter->table = ip_vs_svc_fwm_table;
				iter->bucket = idx;
				return svc;
			}
		}
	}

	return NULL;
}

static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
{

	read_lock_bh(&__ip_vs_svc_lock);
	return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
}


static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct list_head *e;
	struct ip_vs_iter *iter;
	struct ip_vs_service *svc;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_info_array(seq,0);

	svc = v;
	iter = seq->private;

	if (iter->table == ip_vs_svc_table) {
		/* next service in table hashed by protocol */
		if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
			return list_entry(e, struct ip_vs_service, s_list);


		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
			list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
					    s_list) {
				return svc;
			}
		}

		iter->table = ip_vs_svc_fwm_table;
		iter->bucket = -1;
		goto scan_fwmark;
	}

	/* next service in hashed by fwmark */
	if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
		return list_entry(e, struct ip_vs_service, f_list);

 scan_fwmark:
	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
		list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
				    f_list)
			return svc;
	}

	return NULL;
}

static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock_bh(&__ip_vs_svc_lock);
}


static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq,
			"IP Virtual Server version %d.%d.%d (size=%d)\n",
			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
		seq_puts(seq,
			 "Prot LocalAddress:Port Scheduler Flags\n");
		seq_puts(seq,
			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
	} else {
		const struct ip_vs_service *svc = v;
		const struct ip_vs_iter *iter = seq->private;
		const struct ip_vs_dest *dest;

		if (iter->table == ip_vs_svc_table)
			seq_printf(seq, "%s  %08X:%04X %s ",
				   ip_vs_proto_name(svc->protocol),
				   ntohl(svc->addr),
				   ntohs(svc->port),
				   svc->scheduler->name);
		else
			seq_printf(seq, "FWM  %08X %s ",
				   svc->fwmark, svc->scheduler->name);

		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
			seq_printf(seq, "persistent %d %08X\n",
				svc->timeout,
				ntohl(svc->netmask));
		else
			seq_putc(seq, '\n');

		list_for_each_entry(dest, &svc->destinations, n_list) {
			seq_printf(seq,
				   "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
				   ntohl(dest->addr), ntohs(dest->port),
				   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
				   atomic_read(&dest->weight),
				   atomic_read(&dest->activeconns),
				   atomic_read(&dest->inactconns));
		}
	}
	return 0;
}

static struct seq_operations ip_vs_info_seq_ops = {
	.start = ip_vs_info_seq_start,
	.next  = ip_vs_info_seq_next,
	.stop  = ip_vs_info_seq_stop,
	.show  = ip_vs_info_seq_show,
};

static int ip_vs_info_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct ip_vs_iter *s = kzalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;

	rc = seq_open(file, &ip_vs_info_seq_ops);
	if (rc)
		goto out_kfree;

	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static struct file_operations ip_vs_info_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_info_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};

#endif

struct ip_vs_stats ip_vs_stats;

#ifdef CONFIG_PROC_FS
static int ip_vs_stats_show(struct seq_file *seq, void *v)
{

/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
	seq_printf(seq,
		   "   Conns  Packets  Packets            Bytes            Bytes\n");

	spin_lock_bh(&ip_vs_stats.lock);
	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
		   ip_vs_stats.inpkts, ip_vs_stats.outpkts,
		   (unsigned long long) ip_vs_stats.inbytes,
		   (unsigned long long) ip_vs_stats.outbytes);

/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
	seq_printf(seq,"%8X %8X %8X %16X %16X\n",
			ip_vs_stats.cps,
			ip_vs_stats.inpps,
			ip_vs_stats.outpps,
			ip_vs_stats.inbps,
			ip_vs_stats.outbps);
	spin_unlock_bh(&ip_vs_stats.lock);

	return 0;
}

static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open(file, ip_vs_stats_show, NULL);
}

static struct file_operations ip_vs_stats_fops = {
	.owner = THIS_MODULE,
	.open = ip_vs_stats_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

#endif

/*
 *	Set timeout values for tcp tcpfin udp in the timeout_table.
 */
static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
{
	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
		  u->tcp_timeout,
		  u->tcp_fin_timeout,
		  u->udp_timeout);

#ifdef CONFIG_IP_VS_PROTO_TCP
	if (u->tcp_timeout) {
		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
			= u->tcp_timeout * HZ;
	}

	if (u->tcp_fin_timeout) {
		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
			= u->tcp_fin_timeout * HZ;
	}
#endif

#ifdef CONFIG_IP_VS_PROTO_UDP
	if (u->udp_timeout) {
		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
			= u->udp_timeout * HZ;
	}
#endif
	return 0;
}


#define SET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
				 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN		SVCDEST_ARG_LEN

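/* Expected argument length for each IP_VS_SO_SET_* sockopt command */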
static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
	[SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
	[SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
};

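/*
 *	setsockopt() handler for the IP_VS_SO_SET_* commands: check
 *	capability and argument length, serialize on __ip_vs_mutex,
 *	then dispatch to the service/destination routines above.
 */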
1926static int
1927do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1928{
1929 int ret;
1930 unsigned char arg[MAX_ARG_LEN];
1931 struct ip_vs_service_user *usvc;
1932 struct ip_vs_service *svc;
1933 struct ip_vs_dest_user *udest;
1934
1935 if (!capable(CAP_NET_ADMIN))
1936 return -EPERM;
1937
1938 if (len != set_arglen[SET_CMDID(cmd)]) {
1939 IP_VS_ERR("set_ctl: len %u != %u\n",
1940 len, set_arglen[SET_CMDID(cmd)]);
1941 return -EINVAL;
1942 }
1943
1944 if (copy_from_user(arg, user, len) != 0)
1945 return -EFAULT;
1946
1947 /* increase the module use count */
1948 ip_vs_use_count_inc();
1949
Ingo Molnar14cc3e22006-03-26 01:37:14 -08001950 if (mutex_lock_interruptible(&__ip_vs_mutex)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001951 ret = -ERESTARTSYS;
1952 goto out_dec;
1953 }
1954
1955 if (cmd == IP_VS_SO_SET_FLUSH) {
1956 /* Flush the virtual service */
1957 ret = ip_vs_flush();
1958 goto out_unlock;
1959 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1960 /* Set timeout values for (tcp tcpfin udp) */
1961 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1962 goto out_unlock;
1963 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1964 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1965 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1966 goto out_unlock;
1967 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1968 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1969 ret = stop_sync_thread(dm->state);
1970 goto out_unlock;
1971 }
1972
1973 usvc = (struct ip_vs_service_user *)arg;
1974 udest = (struct ip_vs_dest_user *)(usvc + 1);
1975
1976 if (cmd == IP_VS_SO_SET_ZERO) {
1977 /* if no service address is set, zero counters in all */
1978 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1979 ret = ip_vs_zero_all();
1980 goto out_unlock;
1981 }
1982 }
1983
1984 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1985 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1986 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1987 usvc->protocol, NIPQUAD(usvc->addr),
1988 ntohs(usvc->port), usvc->sched_name);
1989 ret = -EFAULT;
1990 goto out_unlock;
1991 }
1992
1993 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1994 if (usvc->fwmark == 0)
1995 svc = __ip_vs_service_get(usvc->protocol,
1996 usvc->addr, usvc->port);
1997 else
1998 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1999
2000 if (cmd != IP_VS_SO_SET_ADD
2001 && (svc == NULL || svc->protocol != usvc->protocol)) {
2002 ret = -ESRCH;
2003 goto out_unlock;
2004 }
2005
2006 switch (cmd) {
2007 case IP_VS_SO_SET_ADD:
2008 if (svc != NULL)
2009 ret = -EEXIST;
2010 else
2011 ret = ip_vs_add_service(usvc, &svc);
2012 break;
2013 case IP_VS_SO_SET_EDIT:
2014 ret = ip_vs_edit_service(svc, usvc);
2015 break;
2016 case IP_VS_SO_SET_DEL:
2017 ret = ip_vs_del_service(svc);
2018 if (!ret)
2019 goto out_unlock;
2020 break;
2021 case IP_VS_SO_SET_ZERO:
2022 ret = ip_vs_zero_service(svc);
2023 break;
2024 case IP_VS_SO_SET_ADDDEST:
2025 ret = ip_vs_add_dest(svc, udest);
2026 break;
2027 case IP_VS_SO_SET_EDITDEST:
2028 ret = ip_vs_edit_dest(svc, udest);
2029 break;
2030 case IP_VS_SO_SET_DELDEST:
2031 ret = ip_vs_del_dest(svc, udest);
2032 break;
2033 default:
2034 ret = -EINVAL;
2035 }
2036
2037 if (svc)
2038 ip_vs_service_put(svc);
2039
2040 out_unlock:
Ingo Molnar14cc3e22006-03-26 01:37:14 -08002041 mutex_unlock(&__ip_vs_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002042 out_dec:
2043 /* decrease the module use count */
2044 ip_vs_use_count_dec();
2045
2046 return ret;
2047}
2048
2049
2050static void
2051ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2052{
2053 spin_lock_bh(&src->lock);
2054 memcpy(dst, src, (char*)&src->lock - (char*)src);
2055 spin_unlock_bh(&src->lock);
2056}
2057
2058static void
2059ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2060{
2061 dst->protocol = src->protocol;
2062 dst->addr = src->addr;
2063 dst->port = src->port;
2064 dst->fwmark = src->fwmark;
pageexec4da62fc2005-06-26 16:00:19 -07002065 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002066 dst->flags = src->flags;
2067 dst->timeout = src->timeout / HZ;
2068 dst->netmask = src->netmask;
2069 dst->num_dests = src->num_dests;
2070 ip_vs_copy_stats(&dst->stats, &src->stats);
2071}
2072
2073static inline int
2074__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2075 struct ip_vs_get_services __user *uptr)
2076{
2077 int idx, count=0;
2078 struct ip_vs_service *svc;
2079 struct ip_vs_service_entry entry;
2080 int ret = 0;
2081
2082 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2083 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2084 if (count >= get->num_services)
2085 goto out;
pageexec4da62fc2005-06-26 16:00:19 -07002086 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002087 ip_vs_copy_service(&entry, svc);
2088 if (copy_to_user(&uptr->entrytable[count],
2089 &entry, sizeof(entry))) {
2090 ret = -EFAULT;
2091 goto out;
2092 }
2093 count++;
2094 }
2095 }
2096
2097 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2098 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2099 if (count >= get->num_services)
2100 goto out;
pageexec4da62fc2005-06-26 16:00:19 -07002101 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002102 ip_vs_copy_service(&entry, svc);
2103 if (copy_to_user(&uptr->entrytable[count],
2104 &entry, sizeof(entry))) {
2105 ret = -EFAULT;
2106 goto out;
2107 }
2108 count++;
2109 }
2110 }
2111 out:
2112 return ret;
2113}
2114
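/*
 * Dump the real servers (destinations) of one virtual service, looked
 * up either by fwmark or by <protocol, addr, port>.  At most
 * get->num_dests entries are copied out; the weight and connection
 * counters are read atomically and the per-destination statistics are
 * snapshotted with ip_vs_copy_stats().  Returns -ESRCH if the service
 * does not exist.
 */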
2115static inline int
2116__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2117 struct ip_vs_get_dests __user *uptr)
2118{
2119 struct ip_vs_service *svc;
2120 int ret = 0;
2121
2122 if (get->fwmark)
2123 svc = __ip_vs_svc_fwm_get(get->fwmark);
2124 else
2125 svc = __ip_vs_service_get(get->protocol,
2126 get->addr, get->port);
2127 if (svc) {
2128 int count = 0;
2129 struct ip_vs_dest *dest;
2130 struct ip_vs_dest_entry entry;
2131
2132 list_for_each_entry(dest, &svc->destinations, n_list) {
2133 if (count >= get->num_dests)
2134 break;
2135
			/*
			 * Zero the entry first so that structure padding in
			 * struct ip_vs_dest_entry is never copied out to
			 * user space (same precaution as in
			 * __ip_vs_get_service_entries() above).
			 */
			memset(&entry, 0, sizeof(entry));
2136			entry.addr = dest->addr;
2137 entry.port = dest->port;
2138 entry.conn_flags = atomic_read(&dest->conn_flags);
2139 entry.weight = atomic_read(&dest->weight);
2140 entry.u_threshold = dest->u_threshold;
2141 entry.l_threshold = dest->l_threshold;
2142 entry.activeconns = atomic_read(&dest->activeconns);
2143 entry.inactconns = atomic_read(&dest->inactconns);
2144 entry.persistconns = atomic_read(&dest->persistconns);
2145 ip_vs_copy_stats(&entry.stats, &dest->stats);
2146 if (copy_to_user(&uptr->entrytable[count],
2147 &entry, sizeof(entry))) {
2148 ret = -EFAULT;
2149 break;
2150 }
2151 count++;
2152 }
2153 ip_vs_service_put(svc);
2154 } else
2155 ret = -ESRCH;
2156 return ret;
2157}
2158
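/*
 * Report the current protocol timeouts (TCP ESTABLISHED, TCP FIN_WAIT
 * and UDP) to user space, converted from jiffies to seconds.  Only the
 * protocols that are compiled in are filled in.
 */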
2159static inline void
2160__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2161{
2162#ifdef CONFIG_IP_VS_PROTO_TCP
2163 u->tcp_timeout =
2164 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2165 u->tcp_fin_timeout =
2166 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2167#endif
2168#ifdef CONFIG_IP_VS_PROTO_UDP
2169 u->udp_timeout =
2170 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2171#endif
2172}
2173
2174
2175#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2176#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2177#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2178#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2179#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2180#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2181#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2182
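/*
 * Minimum argument length expected for each IP_VS_SO_GET_* command,
 * indexed by GET_CMDID(cmd).  do_ip_vs_get_ctl() rejects any request
 * that declares a shorter length before copying the argument into its
 * on-stack scratch buffer.
 */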
2183static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2184	[GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2185 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2186 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2187 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2188 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2189 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2190 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2191};
2192
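/*
 * A minimal user-space sketch of the query side handled below (again
 * close to what ipvsadm does; the constants and struct ip_vs_getinfo
 * come from the user-space IPVS header and are assumed here, not
 * defined by this file):
 *
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *	char ver[64];
 *	struct ip_vs_getinfo info;
 *	socklen_t len;
 *
 *	len = sizeof(ver);
 *	if (getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_VERSION, ver, &len) == 0)
 *		printf("%s\n", ver);
 *
 *	len = sizeof(info);
 *	if (getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_INFO, &info, &len) == 0)
 *		printf("%u services, conn table size %u\n",
 *		       info.num_services, info.size);
 */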
2193static int
2194do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2195{
2196 unsigned char arg[128];
2197 int ret = 0;
2198
2199 if (!capable(CAP_NET_ADMIN))
2200 return -EPERM;
2201
2202 if (*len < get_arglen[GET_CMDID(cmd)]) {
2203 IP_VS_ERR("get_ctl: len %u < %u\n",
2204 *len, get_arglen[GET_CMDID(cmd)]);
2205 return -EINVAL;
2206 }
2207
2208 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2209 return -EFAULT;
2210
2211	if (mutex_lock_interruptible(&__ip_vs_mutex))
2212		return -ERESTARTSYS;
2213
2214 switch (cmd) {
2215 case IP_VS_SO_GET_VERSION:
2216 {
2217 char buf[64];
2218
2219 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2220 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2221 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2222 ret = -EFAULT;
2223 goto out;
2224 }
2225 *len = strlen(buf)+1;
2226 }
2227 break;
2228
2229 case IP_VS_SO_GET_INFO:
2230 {
2231 struct ip_vs_getinfo info;
2232 info.version = IP_VS_VERSION_CODE;
2233 info.size = IP_VS_CONN_TAB_SIZE;
2234 info.num_services = ip_vs_num_services;
2235 if (copy_to_user(user, &info, sizeof(info)) != 0)
2236 ret = -EFAULT;
2237 }
2238 break;
2239
2240 case IP_VS_SO_GET_SERVICES:
2241 {
2242 struct ip_vs_get_services *get;
2243 int size;
2244
2245 get = (struct ip_vs_get_services *)arg;
2246 size = sizeof(*get) +
2247 sizeof(struct ip_vs_service_entry) * get->num_services;
2248 if (*len != size) {
2249 IP_VS_ERR("length: %u != %u\n", *len, size);
2250 ret = -EINVAL;
2251 goto out;
2252 }
2253 ret = __ip_vs_get_service_entries(get, user);
2254 }
2255 break;
2256
2257 case IP_VS_SO_GET_SERVICE:
2258 {
2259 struct ip_vs_service_entry *entry;
2260 struct ip_vs_service *svc;
2261
2262 entry = (struct ip_vs_service_entry *)arg;
2263 if (entry->fwmark)
2264 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2265 else
2266 svc = __ip_vs_service_get(entry->protocol,
2267 entry->addr, entry->port);
2268 if (svc) {
2269 ip_vs_copy_service(entry, svc);
2270 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2271 ret = -EFAULT;
2272 ip_vs_service_put(svc);
2273 } else
2274 ret = -ESRCH;
2275 }
2276 break;
2277
2278 case IP_VS_SO_GET_DESTS:
2279 {
2280 struct ip_vs_get_dests *get;
2281 int size;
2282
2283 get = (struct ip_vs_get_dests *)arg;
2284 size = sizeof(*get) +
2285 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2286 if (*len != size) {
2287 IP_VS_ERR("length: %u != %u\n", *len, size);
2288 ret = -EINVAL;
2289 goto out;
2290 }
2291 ret = __ip_vs_get_dest_entries(get, user);
2292 }
2293 break;
2294
2295 case IP_VS_SO_GET_TIMEOUT:
2296 {
2297 struct ip_vs_timeout_user t;
2298
2299 __ip_vs_get_timeouts(&t);
2300 if (copy_to_user(user, &t, sizeof(t)) != 0)
2301 ret = -EFAULT;
2302 }
2303 break;
2304
2305 case IP_VS_SO_GET_DAEMON:
2306 {
2307 struct ip_vs_daemon_user d[2];
2308
2309 memset(&d, 0, sizeof(d));
2310 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2311 d[0].state = IP_VS_STATE_MASTER;
2312			strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2313			d[0].syncid = ip_vs_master_syncid;
2314 }
2315 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2316 d[1].state = IP_VS_STATE_BACKUP;
2317			strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2318			d[1].syncid = ip_vs_backup_syncid;
2319 }
2320 if (copy_to_user(user, &d, sizeof(d)) != 0)
2321 ret = -EFAULT;
2322 }
2323 break;
2324
2325 default:
2326 ret = -EINVAL;
2327 }
2328
2329 out:
2330	mutex_unlock(&__ip_vs_mutex);
2331	return ret;
2332}
2333
2334
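/*
 * Netfilter sockopt registration: this connects the
 * IP_VS_BASE_CTL..IP_VS_SO_SET_MAX and ..IP_VS_SO_GET_MAX option
 * ranges on PF_INET sockets to do_ip_vs_set_ctl() and
 * do_ip_vs_get_ctl() above, so that user-space tools (e.g. ipvsadm)
 * can configure and query IPVS through plain {set,get}sockopt() calls.
 */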
2335static struct nf_sockopt_ops ip_vs_sockopts = {
2336 .pf = PF_INET,
2337 .set_optmin = IP_VS_BASE_CTL,
2338 .set_optmax = IP_VS_SO_SET_MAX+1,
2339 .set = do_ip_vs_set_ctl,
2340 .get_optmin = IP_VS_BASE_CTL,
2341 .get_optmax = IP_VS_SO_GET_MAX+1,
2342 .get = do_ip_vs_get_ctl,
2343};
2344
2345
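/*
 * Bring up the control plane: register the sockopt interface, create
 * /proc/net/ip_vs and /proc/net/ip_vs_stats, register the sysctl
 * table, initialize the service, fwmark and real-server hash tables,
 * set up the global stats block with its rate estimator, and start
 * the periodic defense-level work.
 */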
2346int ip_vs_control_init(void)
2347{
2348 int ret;
2349 int idx;
2350
2351 EnterFunction(2);
2352
2353 ret = nf_register_sockopt(&ip_vs_sockopts);
2354 if (ret) {
2355 IP_VS_ERR("cannot register sockopt.\n");
2356 return ret;
2357 }
2358
2359 proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2360	proc_net_fops_create("ip_vs_stats", 0, &ip_vs_stats_fops);
2361
2362 sysctl_header = register_sysctl_table(vs_root_table, 0);
2363
2364 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2365 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2366 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2367 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2368 }
2369 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2370 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2371 }
2372
2373 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2374 spin_lock_init(&ip_vs_stats.lock);
2375 ip_vs_new_estimator(&ip_vs_stats);
2376
2377 /* Hook the defense timer */
2378 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2379
2380 LeaveFunction(2);
2381 return 0;
2382}
2383
2384
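/*
 * Tear everything down again in roughly the reverse order of
 * ip_vs_control_init(): flush the destination trash, stop the defense
 * work, kill the stats estimator, and unregister the sysctl table,
 * the /proc entries and the sockopt interface.
 */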
2385void ip_vs_control_cleanup(void)
2386{
2387 EnterFunction(2);
2388 ip_vs_trash_cleanup();
2389 cancel_rearming_delayed_work(&defense_work);
2390 ip_vs_kill_estimator(&ip_vs_stats);
2391 unregister_sysctl_table(sysctl_header);
2392 proc_net_remove("ip_vs_stats");
2393 proc_net_remove("ip_vs");
2394 nf_unregister_sockopt(&ip_vs_sockopts);
2395 LeaveFunction(2);
2396}