blob: 3c4d22a468ecd68499df51dbfbfa5ad3ad5bccbe [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * Changes:
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/init.h>
25#include <linux/types.h>
Randy Dunlap4fc268d2006-01-11 12:17:47 -080026#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070027#include <linux/fs.h>
28#include <linux/sysctl.h>
29#include <linux/proc_fs.h>
30#include <linux/workqueue.h>
31#include <linux/swap.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070032#include <linux/seq_file.h>
33
34#include <linux/netfilter.h>
35#include <linux/netfilter_ipv4.h>
Ingo Molnar14cc3e22006-03-26 01:37:14 -080036#include <linux/mutex.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070037
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020038#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070039#include <net/ip.h>
Arnaldo Carvalho de Melo14c85022005-12-27 02:43:12 -020040#include <net/route.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070041#include <net/sock.h>
42
43#include <asm/uaccess.h>
44
45#include <net/ip_vs.h>
46
47/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
Ingo Molnar14cc3e22006-03-26 01:37:14 -080048static DEFINE_MUTEX(__ip_vs_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -070049
50/* lock for service table */
51static DEFINE_RWLOCK(__ip_vs_svc_lock);
52
53/* lock for table with the real services */
54static DEFINE_RWLOCK(__ip_vs_rs_lock);
55
56/* lock for state and timeout tables */
57static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
58
59/* lock for drop entry handling */
60static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
61
62/* lock for drop packet handling */
63static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
64
65/* 1/rate drop and drop-entry variables */
66int ip_vs_drop_rate = 0;
67int ip_vs_drop_counter = 0;
68static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
69
70/* number of virtual services */
71static int ip_vs_num_services = 0;
72
73/* sysctl variables */
74static int sysctl_ip_vs_drop_entry = 0;
75static int sysctl_ip_vs_drop_packet = 0;
76static int sysctl_ip_vs_secure_tcp = 0;
77static int sysctl_ip_vs_amemthresh = 1024;
78static int sysctl_ip_vs_am_droprate = 10;
79int sysctl_ip_vs_cache_bypass = 0;
80int sysctl_ip_vs_expire_nodest_conn = 0;
81int sysctl_ip_vs_expire_quiescent_template = 0;
82int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
83int sysctl_ip_vs_nat_icmp_send = 0;
84
85
86#ifdef CONFIG_IP_VS_DEBUG
/* Debug verbosity level; starts out as zero. */
static int sysctl_ip_vs_debug_level;

/*
 * Return the current value of sysctl_ip_vs_debug_level.
 */
int ip_vs_get_debug_level(void)
{
	const int level = sysctl_ip_vs_debug_level;

	return level;
}
93#endif
94
95/*
Julian Anastasovaf9debd2005-07-11 20:59:57 -070096 * update_defense_level is called from keventd and from sysctl,
97 * so it needs to protect itself from softirqs
Linus Torvalds1da177e2005-04-16 15:20:36 -070098 */
99static void update_defense_level(void)
100{
101 struct sysinfo i;
102 static int old_secure_tcp = 0;
103 int availmem;
104 int nomem;
105 int to_change = -1;
106
107 /* we only count free and buffered memory (in pages) */
108 si_meminfo(&i);
109 availmem = i.freeram + i.bufferram;
110 /* however in linux 2.5 the i.bufferram is total page cache size,
111 we need adjust it */
112 /* si_swapinfo(&i); */
113 /* availmem = availmem - (i.totalswap - i.freeswap); */
114
115 nomem = (availmem < sysctl_ip_vs_amemthresh);
116
Julian Anastasovaf9debd2005-07-11 20:59:57 -0700117 local_bh_disable();
118
Linus Torvalds1da177e2005-04-16 15:20:36 -0700119 /* drop_entry */
120 spin_lock(&__ip_vs_dropentry_lock);
121 switch (sysctl_ip_vs_drop_entry) {
122 case 0:
123 atomic_set(&ip_vs_dropentry, 0);
124 break;
125 case 1:
126 if (nomem) {
127 atomic_set(&ip_vs_dropentry, 1);
128 sysctl_ip_vs_drop_entry = 2;
129 } else {
130 atomic_set(&ip_vs_dropentry, 0);
131 }
132 break;
133 case 2:
134 if (nomem) {
135 atomic_set(&ip_vs_dropentry, 1);
136 } else {
137 atomic_set(&ip_vs_dropentry, 0);
138 sysctl_ip_vs_drop_entry = 1;
139 };
140 break;
141 case 3:
142 atomic_set(&ip_vs_dropentry, 1);
143 break;
144 }
145 spin_unlock(&__ip_vs_dropentry_lock);
146
147 /* drop_packet */
148 spin_lock(&__ip_vs_droppacket_lock);
149 switch (sysctl_ip_vs_drop_packet) {
150 case 0:
151 ip_vs_drop_rate = 0;
152 break;
153 case 1:
154 if (nomem) {
155 ip_vs_drop_rate = ip_vs_drop_counter
156 = sysctl_ip_vs_amemthresh /
157 (sysctl_ip_vs_amemthresh-availmem);
158 sysctl_ip_vs_drop_packet = 2;
159 } else {
160 ip_vs_drop_rate = 0;
161 }
162 break;
163 case 2:
164 if (nomem) {
165 ip_vs_drop_rate = ip_vs_drop_counter
166 = sysctl_ip_vs_amemthresh /
167 (sysctl_ip_vs_amemthresh-availmem);
168 } else {
169 ip_vs_drop_rate = 0;
170 sysctl_ip_vs_drop_packet = 1;
171 }
172 break;
173 case 3:
174 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
175 break;
176 }
177 spin_unlock(&__ip_vs_droppacket_lock);
178
179 /* secure_tcp */
180 write_lock(&__ip_vs_securetcp_lock);
181 switch (sysctl_ip_vs_secure_tcp) {
182 case 0:
183 if (old_secure_tcp >= 2)
184 to_change = 0;
185 break;
186 case 1:
187 if (nomem) {
188 if (old_secure_tcp < 2)
189 to_change = 1;
190 sysctl_ip_vs_secure_tcp = 2;
191 } else {
192 if (old_secure_tcp >= 2)
193 to_change = 0;
194 }
195 break;
196 case 2:
197 if (nomem) {
198 if (old_secure_tcp < 2)
199 to_change = 1;
200 } else {
201 if (old_secure_tcp >= 2)
202 to_change = 0;
203 sysctl_ip_vs_secure_tcp = 1;
204 }
205 break;
206 case 3:
207 if (old_secure_tcp < 2)
208 to_change = 1;
209 break;
210 }
211 old_secure_tcp = sysctl_ip_vs_secure_tcp;
212 if (to_change >= 0)
213 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
214 write_unlock(&__ip_vs_securetcp_lock);
Julian Anastasovaf9debd2005-07-11 20:59:57 -0700215
216 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700217}
218
219
220/*
221 * Timer for checking the defense
222 */
223#define DEFENSE_TIMER_PERIOD 1*HZ
David Howellsc4028952006-11-22 14:57:56 +0000224static void defense_work_handler(struct work_struct *work);
225static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700226
David Howellsc4028952006-11-22 14:57:56 +0000227static void defense_work_handler(struct work_struct *work)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700228{
229 update_defense_level();
230 if (atomic_read(&ip_vs_dropentry))
231 ip_vs_random_dropentry();
232
233 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
234}
235
236int
237ip_vs_use_count_inc(void)
238{
239 return try_module_get(THIS_MODULE);
240}
241
242void
243ip_vs_use_count_dec(void)
244{
245 module_put(THIS_MODULE);
246}
247
248
249/*
250 * Hash table: for virtual service lookups
251 */
252#define IP_VS_SVC_TAB_BITS 8
253#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
254#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
255
256/* the service table hashed by <protocol, addr, port> */
257static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
258/* the service table hashed by fwmark */
259static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
260
261/*
262 * Hash table: for real service lookups
263 */
264#define IP_VS_RTAB_BITS 4
265#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
266#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
267
268static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
269
270/*
271 * Trash for destinations
272 */
273static LIST_HEAD(ip_vs_dest_trash);
274
275/*
276 * FTP & NULL virtual service counters
277 */
278static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
279static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
280
281
282/*
283 * Returns hash value for virtual service
284 */
285static __inline__ unsigned
Al Viro014d7302006-09-28 14:29:52 -0700286ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700287{
288 register unsigned porth = ntohs(port);
289
290 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
291 & IP_VS_SVC_TAB_MASK;
292}
293
294/*
295 * Returns hash value of fwmark for virtual service lookup
296 */
297static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
298{
299 return fwmark & IP_VS_SVC_TAB_MASK;
300}
301
302/*
303 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
304 * or in the ip_vs_svc_fwm_table by fwmark.
305 * Should be called with locked tables.
306 */
307static int ip_vs_svc_hash(struct ip_vs_service *svc)
308{
309 unsigned hash;
310
311 if (svc->flags & IP_VS_SVC_F_HASHED) {
312 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
313 "called from %p\n", __builtin_return_address(0));
314 return 0;
315 }
316
317 if (svc->fwmark == 0) {
318 /*
319 * Hash it by <protocol,addr,port> in ip_vs_svc_table
320 */
321 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
322 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
323 } else {
324 /*
325 * Hash it by fwmark in ip_vs_svc_fwm_table
326 */
327 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
328 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
329 }
330
331 svc->flags |= IP_VS_SVC_F_HASHED;
332 /* increase its refcnt because it is referenced by the svc table */
333 atomic_inc(&svc->refcnt);
334 return 1;
335}
336
337
338/*
339 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
340 * Should be called with locked tables.
341 */
342static int ip_vs_svc_unhash(struct ip_vs_service *svc)
343{
344 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
345 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
346 "called from %p\n", __builtin_return_address(0));
347 return 0;
348 }
349
350 if (svc->fwmark == 0) {
351 /* Remove it from the ip_vs_svc_table table */
352 list_del(&svc->s_list);
353 } else {
354 /* Remove it from the ip_vs_svc_fwm_table table */
355 list_del(&svc->f_list);
356 }
357
358 svc->flags &= ~IP_VS_SVC_F_HASHED;
359 atomic_dec(&svc->refcnt);
360 return 1;
361}
362
363
364/*
365 * Get service by {proto,addr,port} in the service table.
366 */
367static __inline__ struct ip_vs_service *
Al Viro014d7302006-09-28 14:29:52 -0700368__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700369{
370 unsigned hash;
371 struct ip_vs_service *svc;
372
373 /* Check for "full" addressed entries */
374 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
375
376 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
377 if ((svc->addr == vaddr)
378 && (svc->port == vport)
379 && (svc->protocol == protocol)) {
380 /* HIT */
381 atomic_inc(&svc->usecnt);
382 return svc;
383 }
384 }
385
386 return NULL;
387}
388
389
390/*
391 * Get service by {fwmark} in the service table.
392 */
393static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
394{
395 unsigned hash;
396 struct ip_vs_service *svc;
397
398 /* Check for fwmark addressed entries */
399 hash = ip_vs_svc_fwm_hashkey(fwmark);
400
401 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
402 if (svc->fwmark == fwmark) {
403 /* HIT */
404 atomic_inc(&svc->usecnt);
405 return svc;
406 }
407 }
408
409 return NULL;
410}
411
412struct ip_vs_service *
Al Viro014d7302006-09-28 14:29:52 -0700413ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700414{
415 struct ip_vs_service *svc;
416
417 read_lock(&__ip_vs_svc_lock);
418
419 /*
420 * Check the table hashed by fwmark first
421 */
422 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
423 goto out;
424
425 /*
426 * Check the table hashed by <protocol,addr,port>
427 * for "full" addressed entries
428 */
429 svc = __ip_vs_service_get(protocol, vaddr, vport);
430
431 if (svc == NULL
432 && protocol == IPPROTO_TCP
433 && atomic_read(&ip_vs_ftpsvc_counter)
434 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
435 /*
436 * Check if ftp service entry exists, the packet
437 * might belong to FTP data connections.
438 */
439 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
440 }
441
442 if (svc == NULL
443 && atomic_read(&ip_vs_nullsvc_counter)) {
444 /*
445 * Check if the catch-all port (port zero) exists
446 */
447 svc = __ip_vs_service_get(protocol, vaddr, 0);
448 }
449
450 out:
451 read_unlock(&__ip_vs_svc_lock);
452
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800453 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700454 fwmark, ip_vs_proto_name(protocol),
455 NIPQUAD(vaddr), ntohs(vport),
456 svc?"hit":"not hit");
457
458 return svc;
459}
460
461
462static inline void
463__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
464{
465 atomic_inc(&svc->refcnt);
466 dest->svc = svc;
467}
468
469static inline void
470__ip_vs_unbind_svc(struct ip_vs_dest *dest)
471{
472 struct ip_vs_service *svc = dest->svc;
473
474 dest->svc = NULL;
475 if (atomic_dec_and_test(&svc->refcnt))
476 kfree(svc);
477}
478
479
480/*
481 * Returns hash value for real service
482 */
Al Viro014d7302006-09-28 14:29:52 -0700483static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700484{
485 register unsigned porth = ntohs(port);
486
487 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
488 & IP_VS_RTAB_MASK;
489}
490
491/*
492 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
493 * should be called with locked tables.
494 */
495static int ip_vs_rs_hash(struct ip_vs_dest *dest)
496{
497 unsigned hash;
498
499 if (!list_empty(&dest->d_list)) {
500 return 0;
501 }
502
503 /*
504 * Hash by proto,addr,port,
505 * which are the parameters of the real service.
506 */
507 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
508 list_add(&dest->d_list, &ip_vs_rtable[hash]);
509
510 return 1;
511}
512
513/*
514 * UNhashes ip_vs_dest from ip_vs_rtable.
515 * should be called with locked tables.
516 */
517static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
518{
519 /*
520 * Remove it from the ip_vs_rtable table.
521 */
522 if (!list_empty(&dest->d_list)) {
523 list_del(&dest->d_list);
524 INIT_LIST_HEAD(&dest->d_list);
525 }
526
527 return 1;
528}
529
530/*
531 * Lookup real service by <proto,addr,port> in the real service table.
532 */
533struct ip_vs_dest *
Al Viro014d7302006-09-28 14:29:52 -0700534ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700535{
536 unsigned hash;
537 struct ip_vs_dest *dest;
538
539 /*
540 * Check for "full" addressed entries
541 * Return the first found entry
542 */
543 hash = ip_vs_rs_hashkey(daddr, dport);
544
545 read_lock(&__ip_vs_rs_lock);
546 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
547 if ((dest->addr == daddr)
548 && (dest->port == dport)
549 && ((dest->protocol == protocol) ||
550 dest->vfwmark)) {
551 /* HIT */
552 read_unlock(&__ip_vs_rs_lock);
553 return dest;
554 }
555 }
556 read_unlock(&__ip_vs_rs_lock);
557
558 return NULL;
559}
560
561/*
562 * Lookup destination by {addr,port} in the given service
563 */
564static struct ip_vs_dest *
Al Viro014d7302006-09-28 14:29:52 -0700565ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700566{
567 struct ip_vs_dest *dest;
568
569 /*
570 * Find the destination for the given service
571 */
572 list_for_each_entry(dest, &svc->destinations, n_list) {
573 if ((dest->addr == daddr) && (dest->port == dport)) {
574 /* HIT */
575 return dest;
576 }
577 }
578
579 return NULL;
580}
581
Rumen G. Bogdanovski1e356f92007-11-07 02:35:54 -0800582/*
583 * Find destination by {daddr,dport,vaddr,protocol}
584 * Cretaed to be used in ip_vs_process_message() in
585 * the backup synchronization daemon. It finds the
586 * destination to be bound to the received connection
587 * on the backup.
588 *
589 * ip_vs_lookup_real_service() looked promissing, but
590 * seems not working as expected.
591 */
592struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
593 __be32 vaddr, __be16 vport, __u16 protocol)
594{
595 struct ip_vs_dest *dest;
596 struct ip_vs_service *svc;
597
598 svc = ip_vs_service_get(0, protocol, vaddr, vport);
599 if (!svc)
600 return NULL;
601 dest = ip_vs_lookup_dest(svc, daddr, dport);
602 if (dest)
603 atomic_inc(&dest->refcnt);
604 ip_vs_service_put(svc);
605 return dest;
606}
607EXPORT_SYMBOL(ip_vs_find_dest);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700608
609/*
610 * Lookup dest by {svc,addr,port} in the destination trash.
611 * The destination trash is used to hold the destinations that are removed
612 * from the service table but are still referenced by some conn entries.
613 * The reason to add the destination trash is when the dest is temporary
614 * down (either by administrator or by monitor program), the dest can be
615 * picked back from the trash, the remaining connections to the dest can
616 * continue, and the counting information of the dest is also useful for
617 * scheduling.
618 */
619static struct ip_vs_dest *
Al Viro014d7302006-09-28 14:29:52 -0700620ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700621{
622 struct ip_vs_dest *dest, *nxt;
623
624 /*
625 * Find the destination in trash
626 */
627 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
628 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800629 "dest->refcnt=%d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700630 dest->vfwmark,
631 NIPQUAD(dest->addr), ntohs(dest->port),
632 atomic_read(&dest->refcnt));
633 if (dest->addr == daddr &&
634 dest->port == dport &&
635 dest->vfwmark == svc->fwmark &&
636 dest->protocol == svc->protocol &&
637 (svc->fwmark ||
638 (dest->vaddr == svc->addr &&
639 dest->vport == svc->port))) {
640 /* HIT */
641 return dest;
642 }
643
644 /*
645 * Try to purge the destination from trash if not referenced
646 */
647 if (atomic_read(&dest->refcnt) == 1) {
648 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
649 "from trash\n",
650 dest->vfwmark,
651 NIPQUAD(dest->addr), ntohs(dest->port));
652 list_del(&dest->n_list);
653 ip_vs_dst_reset(dest);
654 __ip_vs_unbind_svc(dest);
655 kfree(dest);
656 }
657 }
658
659 return NULL;
660}
661
662
663/*
664 * Clean up all the destinations in the trash
665 * Called by the ip_vs_control_cleanup()
666 *
667 * When the ip_vs_control_clearup is activated by ipvs module exit,
668 * the service tables must have been flushed and all the connections
669 * are expired, and the refcnt of each destination in the trash must
670 * be 1, so we simply release them here.
671 */
672static void ip_vs_trash_cleanup(void)
673{
674 struct ip_vs_dest *dest, *nxt;
675
676 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
677 list_del(&dest->n_list);
678 ip_vs_dst_reset(dest);
679 __ip_vs_unbind_svc(dest);
680 kfree(dest);
681 }
682}
683
684
685static void
686ip_vs_zero_stats(struct ip_vs_stats *stats)
687{
688 spin_lock_bh(&stats->lock);
689 memset(stats, 0, (char *)&stats->lock - (char *)stats);
690 spin_unlock_bh(&stats->lock);
691 ip_vs_zero_estimator(stats);
692}
693
694/*
695 * Update a destination in the given service
696 */
697static void
698__ip_vs_update_dest(struct ip_vs_service *svc,
699 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
700{
701 int conn_flags;
702
703 /* set the weight and the flags */
704 atomic_set(&dest->weight, udest->weight);
705 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
706
707 /* check if local node and update the flags */
708 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
709 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
710 | IP_VS_CONN_F_LOCALNODE;
711 }
712
713 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
714 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
715 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
716 } else {
717 /*
718 * Put the real service in ip_vs_rtable if not present.
719 * For now only for NAT!
720 */
721 write_lock_bh(&__ip_vs_rs_lock);
722 ip_vs_rs_hash(dest);
723 write_unlock_bh(&__ip_vs_rs_lock);
724 }
725 atomic_set(&dest->conn_flags, conn_flags);
726
727 /* bind the service */
728 if (!dest->svc) {
729 __ip_vs_bind_svc(dest, svc);
730 } else {
731 if (dest->svc != svc) {
732 __ip_vs_unbind_svc(dest);
733 ip_vs_zero_stats(&dest->stats);
734 __ip_vs_bind_svc(dest, svc);
735 }
736 }
737
738 /* set the dest status flags */
739 dest->flags |= IP_VS_DEST_F_AVAILABLE;
740
741 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
742 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
743 dest->u_threshold = udest->u_threshold;
744 dest->l_threshold = udest->l_threshold;
745}
746
747
748/*
749 * Create a destination for the given service
750 */
751static int
752ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
753 struct ip_vs_dest **dest_p)
754{
755 struct ip_vs_dest *dest;
756 unsigned atype;
757
758 EnterFunction(2);
759
760 atype = inet_addr_type(udest->addr);
761 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
762 return -EINVAL;
763
Panagiotis Issaris0da974f2006-07-21 14:51:30 -0700764 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700765 if (dest == NULL) {
766 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
767 return -ENOMEM;
768 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700769
770 dest->protocol = svc->protocol;
771 dest->vaddr = svc->addr;
772 dest->vport = svc->port;
773 dest->vfwmark = svc->fwmark;
774 dest->addr = udest->addr;
775 dest->port = udest->port;
776
777 atomic_set(&dest->activeconns, 0);
778 atomic_set(&dest->inactconns, 0);
779 atomic_set(&dest->persistconns, 0);
780 atomic_set(&dest->refcnt, 0);
781
782 INIT_LIST_HEAD(&dest->d_list);
783 spin_lock_init(&dest->dst_lock);
784 spin_lock_init(&dest->stats.lock);
785 __ip_vs_update_dest(svc, dest, udest);
786 ip_vs_new_estimator(&dest->stats);
787
788 *dest_p = dest;
789
790 LeaveFunction(2);
791 return 0;
792}
793
794
795/*
796 * Add a destination into an existing service
797 */
798static int
799ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
800{
801 struct ip_vs_dest *dest;
Al Viro014d7302006-09-28 14:29:52 -0700802 __be32 daddr = udest->addr;
803 __be16 dport = udest->port;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700804 int ret;
805
806 EnterFunction(2);
807
808 if (udest->weight < 0) {
809 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
810 return -ERANGE;
811 }
812
813 if (udest->l_threshold > udest->u_threshold) {
814 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
815 "upper threshold\n");
816 return -ERANGE;
817 }
818
819 /*
820 * Check if the dest already exists in the list
821 */
822 dest = ip_vs_lookup_dest(svc, daddr, dport);
823 if (dest != NULL) {
824 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
825 return -EEXIST;
826 }
827
828 /*
829 * Check if the dest already exists in the trash and
830 * is from the same service
831 */
832 dest = ip_vs_trash_get_dest(svc, daddr, dport);
833 if (dest != NULL) {
834 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800835 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700836 NIPQUAD(daddr), ntohs(dport),
837 atomic_read(&dest->refcnt),
838 dest->vfwmark,
839 NIPQUAD(dest->vaddr),
840 ntohs(dest->vport));
841 __ip_vs_update_dest(svc, dest, udest);
842
843 /*
844 * Get the destination from the trash
845 */
846 list_del(&dest->n_list);
847
848 ip_vs_new_estimator(&dest->stats);
849
850 write_lock_bh(&__ip_vs_svc_lock);
851
852 /*
853 * Wait until all other svc users go away.
854 */
855 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
856
857 list_add(&dest->n_list, &svc->destinations);
858 svc->num_dests++;
859
860 /* call the update_service function of its scheduler */
861 svc->scheduler->update_service(svc);
862
863 write_unlock_bh(&__ip_vs_svc_lock);
864 return 0;
865 }
866
867 /*
868 * Allocate and initialize the dest structure
869 */
870 ret = ip_vs_new_dest(svc, udest, &dest);
871 if (ret) {
872 return ret;
873 }
874
875 /*
876 * Add the dest entry into the list
877 */
878 atomic_inc(&dest->refcnt);
879
880 write_lock_bh(&__ip_vs_svc_lock);
881
882 /*
883 * Wait until all other svc users go away.
884 */
885 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
886
887 list_add(&dest->n_list, &svc->destinations);
888 svc->num_dests++;
889
890 /* call the update_service function of its scheduler */
891 svc->scheduler->update_service(svc);
892
893 write_unlock_bh(&__ip_vs_svc_lock);
894
895 LeaveFunction(2);
896
897 return 0;
898}
899
900
901/*
902 * Edit a destination in the given service
903 */
904static int
905ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
906{
907 struct ip_vs_dest *dest;
Al Viro014d7302006-09-28 14:29:52 -0700908 __be32 daddr = udest->addr;
909 __be16 dport = udest->port;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700910
911 EnterFunction(2);
912
913 if (udest->weight < 0) {
914 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
915 return -ERANGE;
916 }
917
918 if (udest->l_threshold > udest->u_threshold) {
919 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
920 "upper threshold\n");
921 return -ERANGE;
922 }
923
924 /*
925 * Lookup the destination list
926 */
927 dest = ip_vs_lookup_dest(svc, daddr, dport);
928 if (dest == NULL) {
929 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
930 return -ENOENT;
931 }
932
933 __ip_vs_update_dest(svc, dest, udest);
934
935 write_lock_bh(&__ip_vs_svc_lock);
936
937 /* Wait until all other svc users go away */
Heiko Carstenscae7ca32007-08-10 15:50:30 -0700938 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700939
940 /* call the update_service, because server weight may be changed */
941 svc->scheduler->update_service(svc);
942
943 write_unlock_bh(&__ip_vs_svc_lock);
944
945 LeaveFunction(2);
946
947 return 0;
948}
949
950
951/*
952 * Delete a destination (must be already unlinked from the service)
953 */
954static void __ip_vs_del_dest(struct ip_vs_dest *dest)
955{
956 ip_vs_kill_estimator(&dest->stats);
957
958 /*
959 * Remove it from the d-linked list with the real services.
960 */
961 write_lock_bh(&__ip_vs_rs_lock);
962 ip_vs_rs_unhash(dest);
963 write_unlock_bh(&__ip_vs_rs_lock);
964
965 /*
966 * Decrease the refcnt of the dest, and free the dest
967 * if nobody refers to it (refcnt=0). Otherwise, throw
968 * the destination into the trash.
969 */
970 if (atomic_dec_and_test(&dest->refcnt)) {
971 ip_vs_dst_reset(dest);
972 /* simply decrease svc->refcnt here, let the caller check
973 and release the service if nobody refers to it.
974 Only user context can release destination and service,
975 and only one user context can update virtual service at a
976 time, so the operation here is OK */
977 atomic_dec(&dest->svc->refcnt);
978 kfree(dest);
979 } else {
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800980 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
981 "dest->refcnt=%d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700982 NIPQUAD(dest->addr), ntohs(dest->port),
983 atomic_read(&dest->refcnt));
984 list_add(&dest->n_list, &ip_vs_dest_trash);
985 atomic_inc(&dest->refcnt);
986 }
987}
988
989
990/*
991 * Unlink a destination from the given service
992 */
993static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
994 struct ip_vs_dest *dest,
995 int svcupd)
996{
997 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
998
999 /*
1000 * Remove it from the d-linked destination list.
1001 */
1002 list_del(&dest->n_list);
1003 svc->num_dests--;
1004 if (svcupd) {
1005 /*
1006 * Call the update_service function of its scheduler
1007 */
1008 svc->scheduler->update_service(svc);
1009 }
1010}
1011
1012
1013/*
1014 * Delete a destination server in the given service
1015 */
1016static int
1017ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
1018{
1019 struct ip_vs_dest *dest;
Al Viro014d7302006-09-28 14:29:52 -07001020 __be32 daddr = udest->addr;
1021 __be16 dport = udest->port;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001022
1023 EnterFunction(2);
1024
1025 dest = ip_vs_lookup_dest(svc, daddr, dport);
1026 if (dest == NULL) {
1027 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1028 return -ENOENT;
1029 }
1030
1031 write_lock_bh(&__ip_vs_svc_lock);
1032
1033 /*
1034 * Wait until all other svc users go away.
1035 */
1036 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1037
1038 /*
1039 * Unlink dest from the service
1040 */
1041 __ip_vs_unlink_dest(svc, dest, 1);
1042
1043 write_unlock_bh(&__ip_vs_svc_lock);
1044
1045 /*
1046 * Delete the destination
1047 */
1048 __ip_vs_del_dest(dest);
1049
1050 LeaveFunction(2);
1051
1052 return 0;
1053}
1054
1055
1056/*
1057 * Add a service into the service hash table
1058 */
1059static int
1060ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1061{
1062 int ret = 0;
1063 struct ip_vs_scheduler *sched = NULL;
1064 struct ip_vs_service *svc = NULL;
1065
1066 /* increase the module use count */
1067 ip_vs_use_count_inc();
1068
1069 /* Lookup the scheduler by 'u->sched_name' */
1070 sched = ip_vs_scheduler_get(u->sched_name);
1071 if (sched == NULL) {
1072 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1073 u->sched_name);
1074 ret = -ENOENT;
1075 goto out_mod_dec;
1076 }
1077
Panagiotis Issaris0da974f2006-07-21 14:51:30 -07001078 svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001079 if (svc == NULL) {
1080 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1081 ret = -ENOMEM;
1082 goto out_err;
1083 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001084
1085 /* I'm the first user of the service */
1086 atomic_set(&svc->usecnt, 1);
1087 atomic_set(&svc->refcnt, 0);
1088
1089 svc->protocol = u->protocol;
1090 svc->addr = u->addr;
1091 svc->port = u->port;
1092 svc->fwmark = u->fwmark;
1093 svc->flags = u->flags;
1094 svc->timeout = u->timeout * HZ;
1095 svc->netmask = u->netmask;
1096
1097 INIT_LIST_HEAD(&svc->destinations);
1098 rwlock_init(&svc->sched_lock);
1099 spin_lock_init(&svc->stats.lock);
1100
1101 /* Bind the scheduler */
1102 ret = ip_vs_bind_scheduler(svc, sched);
1103 if (ret)
1104 goto out_err;
1105 sched = NULL;
1106
1107 /* Update the virtual service counters */
1108 if (svc->port == FTPPORT)
1109 atomic_inc(&ip_vs_ftpsvc_counter);
1110 else if (svc->port == 0)
1111 atomic_inc(&ip_vs_nullsvc_counter);
1112
1113 ip_vs_new_estimator(&svc->stats);
1114 ip_vs_num_services++;
1115
1116 /* Hash the service into the service table */
1117 write_lock_bh(&__ip_vs_svc_lock);
1118 ip_vs_svc_hash(svc);
1119 write_unlock_bh(&__ip_vs_svc_lock);
1120
1121 *svc_p = svc;
1122 return 0;
1123
1124 out_err:
1125 if (svc != NULL) {
1126 if (svc->scheduler)
1127 ip_vs_unbind_scheduler(svc);
1128 if (svc->inc) {
1129 local_bh_disable();
1130 ip_vs_app_inc_put(svc->inc);
1131 local_bh_enable();
1132 }
1133 kfree(svc);
1134 }
1135 ip_vs_scheduler_put(sched);
1136
1137 out_mod_dec:
1138 /* decrease the module use count */
1139 ip_vs_use_count_dec();
1140
1141 return ret;
1142}
1143
1144
1145/*
1146 * Edit a service and bind it with a new scheduler
1147 */
1148static int
1149ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1150{
1151 struct ip_vs_scheduler *sched, *old_sched;
1152 int ret = 0;
1153
1154 /*
1155 * Lookup the scheduler, by 'u->sched_name'
1156 */
1157 sched = ip_vs_scheduler_get(u->sched_name);
1158 if (sched == NULL) {
1159 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1160 u->sched_name);
1161 return -ENOENT;
1162 }
1163 old_sched = sched;
1164
1165 write_lock_bh(&__ip_vs_svc_lock);
1166
1167 /*
1168 * Wait until all other svc users go away.
1169 */
1170 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1171
1172 /*
1173 * Set the flags and timeout value
1174 */
1175 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1176 svc->timeout = u->timeout * HZ;
1177 svc->netmask = u->netmask;
1178
1179 old_sched = svc->scheduler;
1180 if (sched != old_sched) {
1181 /*
1182 * Unbind the old scheduler
1183 */
1184 if ((ret = ip_vs_unbind_scheduler(svc))) {
1185 old_sched = sched;
1186 goto out;
1187 }
1188
1189 /*
1190 * Bind the new scheduler
1191 */
1192 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1193 /*
1194 * If ip_vs_bind_scheduler fails, restore the old
1195 * scheduler.
1196 * The main reason of failure is out of memory.
1197 *
1198 * The question is if the old scheduler can be
1199 * restored all the time. TODO: if it cannot be
1200 * restored some time, we must delete the service,
1201 * otherwise the system may crash.
1202 */
1203 ip_vs_bind_scheduler(svc, old_sched);
1204 old_sched = sched;
1205 goto out;
1206 }
1207 }
1208
1209 out:
1210 write_unlock_bh(&__ip_vs_svc_lock);
1211
1212 if (old_sched)
1213 ip_vs_scheduler_put(old_sched);
1214
1215 return ret;
1216}
1217
1218
1219/*
1220 * Delete a service from the service list
1221 * - The service must be unlinked, unlocked and not referenced!
1222 * - We are called under _bh lock
1223 */
1224static void __ip_vs_del_service(struct ip_vs_service *svc)
1225{
1226 struct ip_vs_dest *dest, *nxt;
1227 struct ip_vs_scheduler *old_sched;
1228
1229 ip_vs_num_services--;
1230 ip_vs_kill_estimator(&svc->stats);
1231
1232 /* Unbind scheduler */
1233 old_sched = svc->scheduler;
1234 ip_vs_unbind_scheduler(svc);
1235 if (old_sched)
1236 ip_vs_scheduler_put(old_sched);
1237
1238 /* Unbind app inc */
1239 if (svc->inc) {
1240 ip_vs_app_inc_put(svc->inc);
1241 svc->inc = NULL;
1242 }
1243
1244 /*
1245 * Unlink the whole destination list
1246 */
1247 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1248 __ip_vs_unlink_dest(svc, dest, 0);
1249 __ip_vs_del_dest(dest);
1250 }
1251
1252 /*
1253 * Update the virtual service counters
1254 */
1255 if (svc->port == FTPPORT)
1256 atomic_dec(&ip_vs_ftpsvc_counter);
1257 else if (svc->port == 0)
1258 atomic_dec(&ip_vs_nullsvc_counter);
1259
1260 /*
1261 * Free the service if nobody refers to it
1262 */
1263 if (atomic_read(&svc->refcnt) == 0)
1264 kfree(svc);
1265
1266 /* decrease the module use count */
1267 ip_vs_use_count_dec();
1268}
1269
1270/*
1271 * Delete a service from the service list
1272 */
1273static int ip_vs_del_service(struct ip_vs_service *svc)
1274{
1275 if (svc == NULL)
1276 return -EEXIST;
1277
1278 /*
1279 * Unhash it from the service table
1280 */
1281 write_lock_bh(&__ip_vs_svc_lock);
1282
1283 ip_vs_svc_unhash(svc);
1284
1285 /*
1286 * Wait until all the svc users go away.
1287 */
1288 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1289
1290 __ip_vs_del_service(svc);
1291
1292 write_unlock_bh(&__ip_vs_svc_lock);
1293
1294 return 0;
1295}
1296
1297
1298/*
1299 * Flush all the virtual services
1300 */
1301static int ip_vs_flush(void)
1302{
1303 int idx;
1304 struct ip_vs_service *svc, *nxt;
1305
1306 /*
1307 * Flush the service table hashed by <protocol,addr,port>
1308 */
1309 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1310 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1311 write_lock_bh(&__ip_vs_svc_lock);
1312 ip_vs_svc_unhash(svc);
1313 /*
1314 * Wait until all the svc users go away.
1315 */
1316 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1317 __ip_vs_del_service(svc);
1318 write_unlock_bh(&__ip_vs_svc_lock);
1319 }
1320 }
1321
1322 /*
1323 * Flush the service table hashed by fwmark
1324 */
1325 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1326 list_for_each_entry_safe(svc, nxt,
1327 &ip_vs_svc_fwm_table[idx], f_list) {
1328 write_lock_bh(&__ip_vs_svc_lock);
1329 ip_vs_svc_unhash(svc);
1330 /*
1331 * Wait until all the svc users go away.
1332 */
1333 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1334 __ip_vs_del_service(svc);
1335 write_unlock_bh(&__ip_vs_svc_lock);
1336 }
1337 }
1338
1339 return 0;
1340}
1341
1342
1343/*
1344 * Zero counters in a service or all services
1345 */
1346static int ip_vs_zero_service(struct ip_vs_service *svc)
1347{
1348 struct ip_vs_dest *dest;
1349
1350 write_lock_bh(&__ip_vs_svc_lock);
1351 list_for_each_entry(dest, &svc->destinations, n_list) {
1352 ip_vs_zero_stats(&dest->stats);
1353 }
1354 ip_vs_zero_stats(&svc->stats);
1355 write_unlock_bh(&__ip_vs_svc_lock);
1356 return 0;
1357}
1358
1359static int ip_vs_zero_all(void)
1360{
1361 int idx;
1362 struct ip_vs_service *svc;
1363
1364 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1365 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1366 ip_vs_zero_service(svc);
1367 }
1368 }
1369
1370 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1371 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1372 ip_vs_zero_service(svc);
1373 }
1374 }
1375
1376 ip_vs_zero_stats(&ip_vs_stats);
1377 return 0;
1378}
1379
1380
1381static int
1382proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1383 void __user *buffer, size_t *lenp, loff_t *ppos)
1384{
1385 int *valp = table->data;
1386 int val = *valp;
1387 int rc;
1388
1389 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1390 if (write && (*valp != val)) {
1391 if ((*valp < 0) || (*valp > 3)) {
1392 /* Restore the correct value */
1393 *valp = val;
1394 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001395 update_defense_level();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001396 }
1397 }
1398 return rc;
1399}
1400
1401
1402static int
1403proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1404 void __user *buffer, size_t *lenp, loff_t *ppos)
1405{
1406 int *valp = table->data;
1407 int val[2];
1408 int rc;
1409
1410 /* backup the value first */
1411 memcpy(val, valp, sizeof(val));
1412
1413 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1414 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1415 /* Restore the correct value */
1416 memcpy(valp, val, sizeof(val));
1417 }
1418 return rc;
1419}
1420
1421
1422/*
1423 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1424 */
1425
1426static struct ctl_table vs_vars[] = {
1427 {
1428 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1429 .procname = "amemthresh",
1430 .data = &sysctl_ip_vs_amemthresh,
1431 .maxlen = sizeof(int),
1432 .mode = 0644,
1433 .proc_handler = &proc_dointvec,
1434 },
1435#ifdef CONFIG_IP_VS_DEBUG
1436 {
1437 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1438 .procname = "debug_level",
1439 .data = &sysctl_ip_vs_debug_level,
1440 .maxlen = sizeof(int),
1441 .mode = 0644,
1442 .proc_handler = &proc_dointvec,
1443 },
1444#endif
1445 {
1446 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1447 .procname = "am_droprate",
1448 .data = &sysctl_ip_vs_am_droprate,
1449 .maxlen = sizeof(int),
1450 .mode = 0644,
1451 .proc_handler = &proc_dointvec,
1452 },
1453 {
1454 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1455 .procname = "drop_entry",
1456 .data = &sysctl_ip_vs_drop_entry,
1457 .maxlen = sizeof(int),
1458 .mode = 0644,
1459 .proc_handler = &proc_do_defense_mode,
1460 },
1461 {
1462 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1463 .procname = "drop_packet",
1464 .data = &sysctl_ip_vs_drop_packet,
1465 .maxlen = sizeof(int),
1466 .mode = 0644,
1467 .proc_handler = &proc_do_defense_mode,
1468 },
1469 {
1470 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1471 .procname = "secure_tcp",
1472 .data = &sysctl_ip_vs_secure_tcp,
1473 .maxlen = sizeof(int),
1474 .mode = 0644,
1475 .proc_handler = &proc_do_defense_mode,
1476 },
1477#if 0
1478 {
1479 .ctl_name = NET_IPV4_VS_TO_ES,
1480 .procname = "timeout_established",
1481 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1482 .maxlen = sizeof(int),
1483 .mode = 0644,
1484 .proc_handler = &proc_dointvec_jiffies,
1485 },
1486 {
1487 .ctl_name = NET_IPV4_VS_TO_SS,
1488 .procname = "timeout_synsent",
1489 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1490 .maxlen = sizeof(int),
1491 .mode = 0644,
1492 .proc_handler = &proc_dointvec_jiffies,
1493 },
1494 {
1495 .ctl_name = NET_IPV4_VS_TO_SR,
1496 .procname = "timeout_synrecv",
1497 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1498 .maxlen = sizeof(int),
1499 .mode = 0644,
1500 .proc_handler = &proc_dointvec_jiffies,
1501 },
1502 {
1503 .ctl_name = NET_IPV4_VS_TO_FW,
1504 .procname = "timeout_finwait",
1505 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1506 .maxlen = sizeof(int),
1507 .mode = 0644,
1508 .proc_handler = &proc_dointvec_jiffies,
1509 },
1510 {
1511 .ctl_name = NET_IPV4_VS_TO_TW,
1512 .procname = "timeout_timewait",
1513 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1514 .maxlen = sizeof(int),
1515 .mode = 0644,
1516 .proc_handler = &proc_dointvec_jiffies,
1517 },
1518 {
1519 .ctl_name = NET_IPV4_VS_TO_CL,
1520 .procname = "timeout_close",
1521 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1522 .maxlen = sizeof(int),
1523 .mode = 0644,
1524 .proc_handler = &proc_dointvec_jiffies,
1525 },
1526 {
1527 .ctl_name = NET_IPV4_VS_TO_CW,
1528 .procname = "timeout_closewait",
1529 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1530 .maxlen = sizeof(int),
1531 .mode = 0644,
1532 .proc_handler = &proc_dointvec_jiffies,
1533 },
1534 {
1535 .ctl_name = NET_IPV4_VS_TO_LA,
1536 .procname = "timeout_lastack",
1537 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1538 .maxlen = sizeof(int),
1539 .mode = 0644,
1540 .proc_handler = &proc_dointvec_jiffies,
1541 },
1542 {
1543 .ctl_name = NET_IPV4_VS_TO_LI,
1544 .procname = "timeout_listen",
1545 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1546 .maxlen = sizeof(int),
1547 .mode = 0644,
1548 .proc_handler = &proc_dointvec_jiffies,
1549 },
1550 {
1551 .ctl_name = NET_IPV4_VS_TO_SA,
1552 .procname = "timeout_synack",
1553 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1554 .maxlen = sizeof(int),
1555 .mode = 0644,
1556 .proc_handler = &proc_dointvec_jiffies,
1557 },
1558 {
1559 .ctl_name = NET_IPV4_VS_TO_UDP,
1560 .procname = "timeout_udp",
1561 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1562 .maxlen = sizeof(int),
1563 .mode = 0644,
1564 .proc_handler = &proc_dointvec_jiffies,
1565 },
1566 {
1567 .ctl_name = NET_IPV4_VS_TO_ICMP,
1568 .procname = "timeout_icmp",
1569 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1570 .maxlen = sizeof(int),
1571 .mode = 0644,
1572 .proc_handler = &proc_dointvec_jiffies,
1573 },
1574#endif
1575 {
1576 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1577 .procname = "cache_bypass",
1578 .data = &sysctl_ip_vs_cache_bypass,
1579 .maxlen = sizeof(int),
1580 .mode = 0644,
1581 .proc_handler = &proc_dointvec,
1582 },
1583 {
1584 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1585 .procname = "expire_nodest_conn",
1586 .data = &sysctl_ip_vs_expire_nodest_conn,
1587 .maxlen = sizeof(int),
1588 .mode = 0644,
1589 .proc_handler = &proc_dointvec,
1590 },
1591 {
1592 .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1593 .procname = "expire_quiescent_template",
1594 .data = &sysctl_ip_vs_expire_quiescent_template,
1595 .maxlen = sizeof(int),
1596 .mode = 0644,
1597 .proc_handler = &proc_dointvec,
1598 },
1599 {
1600 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1601 .procname = "sync_threshold",
1602 .data = &sysctl_ip_vs_sync_threshold,
1603 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1604 .mode = 0644,
1605 .proc_handler = &proc_do_sync_threshold,
1606 },
1607 {
1608 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1609 .procname = "nat_icmp_send",
1610 .data = &sysctl_ip_vs_nat_icmp_send,
1611 .maxlen = sizeof(int),
1612 .mode = 0644,
1613 .proc_handler = &proc_dointvec,
1614 },
1615 { .ctl_name = 0 }
1616};
1617
1618static ctl_table vs_table[] = {
1619 {
1620 .ctl_name = NET_IPV4_VS,
1621 .procname = "vs",
1622 .mode = 0555,
1623 .child = vs_vars
1624 },
1625 { .ctl_name = 0 }
1626};
1627
David S. Millerbf0ff9e2005-08-19 16:37:30 -07001628static ctl_table ipvs_ipv4_table[] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001629 {
1630 .ctl_name = NET_IPV4,
1631 .procname = "ipv4",
1632 .mode = 0555,
1633 .child = vs_table,
1634 },
1635 { .ctl_name = 0 }
1636};
1637
1638static ctl_table vs_root_table[] = {
1639 {
1640 .ctl_name = CTL_NET,
1641 .procname = "net",
1642 .mode = 0555,
David S. Millerbf0ff9e2005-08-19 16:37:30 -07001643 .child = ipvs_ipv4_table,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001644 },
1645 { .ctl_name = 0 }
1646};
1647
/* sysctl table header for the IPVS sysctl tree */
static struct ctl_table_header *sysctl_header;
1649
1650#ifdef CONFIG_PROC_FS
1651
/*
 *	Per-open iterator state for the service listing seq_file:
 *	which hash table is being walked and the current bucket in it.
 */
struct ip_vs_iter {
	struct list_head *table;	/* ip_vs_svc_table or ip_vs_svc_fwm_table */
	int bucket;			/* index of the current hash bucket */
};
1656
1657/*
1658 * Write the contents of the VS rule table to a PROCfs file.
1659 * (It is kept just for backward compatibility)
1660 */
1661static inline const char *ip_vs_fwd_name(unsigned flags)
1662{
1663 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1664 case IP_VS_CONN_F_LOCALNODE:
1665 return "Local";
1666 case IP_VS_CONN_F_TUNNEL:
1667 return "Tunnel";
1668 case IP_VS_CONN_F_DROUTE:
1669 return "Route";
1670 default:
1671 return "Masq";
1672 }
1673}
1674
1675
1676/* Get the Nth entry in the two lists */
1677static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1678{
1679 struct ip_vs_iter *iter = seq->private;
1680 int idx;
1681 struct ip_vs_service *svc;
1682
1683 /* look in hash by protocol */
1684 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1685 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1686 if (pos-- == 0){
1687 iter->table = ip_vs_svc_table;
1688 iter->bucket = idx;
1689 return svc;
1690 }
1691 }
1692 }
1693
1694 /* keep looking in fwmark */
1695 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1696 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1697 if (pos-- == 0) {
1698 iter->table = ip_vs_svc_fwm_table;
1699 iter->bucket = idx;
1700 return svc;
1701 }
1702 }
1703 }
1704
1705 return NULL;
1706}
1707
1708static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1709{
1710
1711 read_lock_bh(&__ip_vs_svc_lock);
1712 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1713}
1714
1715
1716static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1717{
1718 struct list_head *e;
1719 struct ip_vs_iter *iter;
1720 struct ip_vs_service *svc;
1721
1722 ++*pos;
1723 if (v == SEQ_START_TOKEN)
1724 return ip_vs_info_array(seq,0);
1725
1726 svc = v;
1727 iter = seq->private;
1728
1729 if (iter->table == ip_vs_svc_table) {
1730 /* next service in table hashed by protocol */
1731 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1732 return list_entry(e, struct ip_vs_service, s_list);
1733
1734
1735 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1736 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1737 s_list) {
1738 return svc;
1739 }
1740 }
1741
1742 iter->table = ip_vs_svc_fwm_table;
1743 iter->bucket = -1;
1744 goto scan_fwmark;
1745 }
1746
1747 /* next service in hashed by fwmark */
1748 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1749 return list_entry(e, struct ip_vs_service, f_list);
1750
1751 scan_fwmark:
1752 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1753 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1754 f_list)
1755 return svc;
1756 }
1757
1758 return NULL;
1759}
1760
1761static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1762{
1763 read_unlock_bh(&__ip_vs_svc_lock);
1764}
1765
1766
1767static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1768{
1769 if (v == SEQ_START_TOKEN) {
1770 seq_printf(seq,
1771 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1772 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1773 seq_puts(seq,
1774 "Prot LocalAddress:Port Scheduler Flags\n");
1775 seq_puts(seq,
1776 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1777 } else {
1778 const struct ip_vs_service *svc = v;
1779 const struct ip_vs_iter *iter = seq->private;
1780 const struct ip_vs_dest *dest;
1781
1782 if (iter->table == ip_vs_svc_table)
1783 seq_printf(seq, "%s %08X:%04X %s ",
1784 ip_vs_proto_name(svc->protocol),
1785 ntohl(svc->addr),
1786 ntohs(svc->port),
1787 svc->scheduler->name);
1788 else
1789 seq_printf(seq, "FWM %08X %s ",
1790 svc->fwmark, svc->scheduler->name);
1791
1792 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1793 seq_printf(seq, "persistent %d %08X\n",
1794 svc->timeout,
1795 ntohl(svc->netmask));
1796 else
1797 seq_putc(seq, '\n');
1798
1799 list_for_each_entry(dest, &svc->destinations, n_list) {
1800 seq_printf(seq,
1801 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1802 ntohl(dest->addr), ntohs(dest->port),
1803 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1804 atomic_read(&dest->weight),
1805 atomic_read(&dest->activeconns),
1806 atomic_read(&dest->inactconns));
1807 }
1808 }
1809 return 0;
1810}
1811
Philippe De Muyter56b3d972007-07-10 23:07:31 -07001812static const struct seq_operations ip_vs_info_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001813 .start = ip_vs_info_seq_start,
1814 .next = ip_vs_info_seq_next,
1815 .stop = ip_vs_info_seq_stop,
1816 .show = ip_vs_info_seq_show,
1817};
1818
1819static int ip_vs_info_open(struct inode *inode, struct file *file)
1820{
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -07001821 return seq_open_private(file, &ip_vs_info_seq_ops,
1822 sizeof(struct ip_vs_iter));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001823}
1824
Arjan van de Ven9a321442007-02-12 00:55:35 -08001825static const struct file_operations ip_vs_info_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001826 .owner = THIS_MODULE,
1827 .open = ip_vs_info_open,
1828 .read = seq_read,
1829 .llseek = seq_lseek,
1830 .release = seq_release_private,
1831};
1832
1833#endif
1834
1835struct ip_vs_stats ip_vs_stats;
1836
1837#ifdef CONFIG_PROC_FS
1838static int ip_vs_stats_show(struct seq_file *seq, void *v)
1839{
1840
1841/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1842 seq_puts(seq,
1843 " Total Incoming Outgoing Incoming Outgoing\n");
1844 seq_printf(seq,
1845 " Conns Packets Packets Bytes Bytes\n");
1846
1847 spin_lock_bh(&ip_vs_stats.lock);
1848 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1849 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1850 (unsigned long long) ip_vs_stats.inbytes,
1851 (unsigned long long) ip_vs_stats.outbytes);
1852
1853/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1854 seq_puts(seq,
1855 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1856 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1857 ip_vs_stats.cps,
1858 ip_vs_stats.inpps,
1859 ip_vs_stats.outpps,
1860 ip_vs_stats.inbps,
1861 ip_vs_stats.outbps);
1862 spin_unlock_bh(&ip_vs_stats.lock);
1863
1864 return 0;
1865}
1866
1867static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1868{
1869 return single_open(file, ip_vs_stats_show, NULL);
1870}
1871
Arjan van de Ven9a321442007-02-12 00:55:35 -08001872static const struct file_operations ip_vs_stats_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001873 .owner = THIS_MODULE,
1874 .open = ip_vs_stats_seq_open,
1875 .read = seq_read,
1876 .llseek = seq_lseek,
1877 .release = single_release,
1878};
1879
1880#endif
1881
1882/*
1883 * Set timeout values for tcp tcpfin udp in the timeout_table.
1884 */
1885static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1886{
1887 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1888 u->tcp_timeout,
1889 u->tcp_fin_timeout,
1890 u->udp_timeout);
1891
1892#ifdef CONFIG_IP_VS_PROTO_TCP
1893 if (u->tcp_timeout) {
1894 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1895 = u->tcp_timeout * HZ;
1896 }
1897
1898 if (u->tcp_fin_timeout) {
1899 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1900 = u->tcp_fin_timeout * HZ;
1901 }
1902#endif
1903
1904#ifdef CONFIG_IP_VS_PROTO_UDP
1905 if (u->udp_timeout) {
1906 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1907 = u->udp_timeout * HZ;
1908 }
1909#endif
1910 return 0;
1911}
1912
1913
/*
 * Helpers for the set-sockopt argument length table.
 *
 * SET_CMDID() turns a set command number into its index in
 * set_arglen[].  The argument and the whole expansion are
 * parenthesized so the macro stays correct when it is handed an
 * expression or used inside a larger one.
 */
#define SET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
/* Sizes of the user-space argument for each family of set commands */
#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
				 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
/* Largest of the above: size of the on-stack argument buffer */
#define MAX_ARG_LEN		SVCDEST_ARG_LEN
1921
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001922static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001923 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1924 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1925 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1926 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1927 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1928 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1929 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1930 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1931 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1932 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1933 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1934};
1935
1936static int
1937do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1938{
1939 int ret;
1940 unsigned char arg[MAX_ARG_LEN];
1941 struct ip_vs_service_user *usvc;
1942 struct ip_vs_service *svc;
1943 struct ip_vs_dest_user *udest;
1944
1945 if (!capable(CAP_NET_ADMIN))
1946 return -EPERM;
1947
1948 if (len != set_arglen[SET_CMDID(cmd)]) {
1949 IP_VS_ERR("set_ctl: len %u != %u\n",
1950 len, set_arglen[SET_CMDID(cmd)]);
1951 return -EINVAL;
1952 }
1953
1954 if (copy_from_user(arg, user, len) != 0)
1955 return -EFAULT;
1956
1957 /* increase the module use count */
1958 ip_vs_use_count_inc();
1959
Ingo Molnar14cc3e22006-03-26 01:37:14 -08001960 if (mutex_lock_interruptible(&__ip_vs_mutex)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001961 ret = -ERESTARTSYS;
1962 goto out_dec;
1963 }
1964
1965 if (cmd == IP_VS_SO_SET_FLUSH) {
1966 /* Flush the virtual service */
1967 ret = ip_vs_flush();
1968 goto out_unlock;
1969 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1970 /* Set timeout values for (tcp tcpfin udp) */
1971 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1972 goto out_unlock;
1973 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1974 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1975 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1976 goto out_unlock;
1977 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1978 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1979 ret = stop_sync_thread(dm->state);
1980 goto out_unlock;
1981 }
1982
1983 usvc = (struct ip_vs_service_user *)arg;
1984 udest = (struct ip_vs_dest_user *)(usvc + 1);
1985
1986 if (cmd == IP_VS_SO_SET_ZERO) {
1987 /* if no service address is set, zero counters in all */
1988 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1989 ret = ip_vs_zero_all();
1990 goto out_unlock;
1991 }
1992 }
1993
1994 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1995 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1996 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1997 usvc->protocol, NIPQUAD(usvc->addr),
1998 ntohs(usvc->port), usvc->sched_name);
1999 ret = -EFAULT;
2000 goto out_unlock;
2001 }
2002
2003 /* Lookup the exact service by <protocol, addr, port> or fwmark */
2004 if (usvc->fwmark == 0)
2005 svc = __ip_vs_service_get(usvc->protocol,
2006 usvc->addr, usvc->port);
2007 else
2008 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
2009
2010 if (cmd != IP_VS_SO_SET_ADD
2011 && (svc == NULL || svc->protocol != usvc->protocol)) {
2012 ret = -ESRCH;
2013 goto out_unlock;
2014 }
2015
2016 switch (cmd) {
2017 case IP_VS_SO_SET_ADD:
2018 if (svc != NULL)
2019 ret = -EEXIST;
2020 else
2021 ret = ip_vs_add_service(usvc, &svc);
2022 break;
2023 case IP_VS_SO_SET_EDIT:
2024 ret = ip_vs_edit_service(svc, usvc);
2025 break;
2026 case IP_VS_SO_SET_DEL:
2027 ret = ip_vs_del_service(svc);
2028 if (!ret)
2029 goto out_unlock;
2030 break;
2031 case IP_VS_SO_SET_ZERO:
2032 ret = ip_vs_zero_service(svc);
2033 break;
2034 case IP_VS_SO_SET_ADDDEST:
2035 ret = ip_vs_add_dest(svc, udest);
2036 break;
2037 case IP_VS_SO_SET_EDITDEST:
2038 ret = ip_vs_edit_dest(svc, udest);
2039 break;
2040 case IP_VS_SO_SET_DELDEST:
2041 ret = ip_vs_del_dest(svc, udest);
2042 break;
2043 default:
2044 ret = -EINVAL;
2045 }
2046
2047 if (svc)
2048 ip_vs_service_put(svc);
2049
2050 out_unlock:
Ingo Molnar14cc3e22006-03-26 01:37:14 -08002051 mutex_unlock(&__ip_vs_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002052 out_dec:
2053 /* decrease the module use count */
2054 ip_vs_use_count_dec();
2055
2056 return ret;
2057}
2058
2059
2060static void
2061ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2062{
2063 spin_lock_bh(&src->lock);
2064 memcpy(dst, src, (char*)&src->lock - (char*)src);
2065 spin_unlock_bh(&src->lock);
2066}
2067
2068static void
2069ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2070{
2071 dst->protocol = src->protocol;
2072 dst->addr = src->addr;
2073 dst->port = src->port;
2074 dst->fwmark = src->fwmark;
pageexec4da62fc2005-06-26 16:00:19 -07002075 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002076 dst->flags = src->flags;
2077 dst->timeout = src->timeout / HZ;
2078 dst->netmask = src->netmask;
2079 dst->num_dests = src->num_dests;
2080 ip_vs_copy_stats(&dst->stats, &src->stats);
2081}
2082
2083static inline int
2084__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2085 struct ip_vs_get_services __user *uptr)
2086{
2087 int idx, count=0;
2088 struct ip_vs_service *svc;
2089 struct ip_vs_service_entry entry;
2090 int ret = 0;
2091
2092 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2093 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2094 if (count >= get->num_services)
2095 goto out;
pageexec4da62fc2005-06-26 16:00:19 -07002096 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002097 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) {
2100 ret = -EFAULT;
2101 goto out;
2102 }
2103 count++;
2104 }
2105 }
2106
2107 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2108 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2109 if (count >= get->num_services)
2110 goto out;
pageexec4da62fc2005-06-26 16:00:19 -07002111 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002112 ip_vs_copy_service(&entry, svc);
2113 if (copy_to_user(&uptr->entrytable[count],
2114 &entry, sizeof(entry))) {
2115 ret = -EFAULT;
2116 goto out;
2117 }
2118 count++;
2119 }
2120 }
2121 out:
2122 return ret;
2123}
2124
2125static inline int
2126__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2127 struct ip_vs_get_dests __user *uptr)
2128{
2129 struct ip_vs_service *svc;
2130 int ret = 0;
2131
2132 if (get->fwmark)
2133 svc = __ip_vs_svc_fwm_get(get->fwmark);
2134 else
2135 svc = __ip_vs_service_get(get->protocol,
2136 get->addr, get->port);
2137 if (svc) {
2138 int count = 0;
2139 struct ip_vs_dest *dest;
2140 struct ip_vs_dest_entry entry;
2141
2142 list_for_each_entry(dest, &svc->destinations, n_list) {
2143 if (count >= get->num_dests)
2144 break;
2145
2146 entry.addr = dest->addr;
2147 entry.port = dest->port;
2148 entry.conn_flags = atomic_read(&dest->conn_flags);
2149 entry.weight = atomic_read(&dest->weight);
2150 entry.u_threshold = dest->u_threshold;
2151 entry.l_threshold = dest->l_threshold;
2152 entry.activeconns = atomic_read(&dest->activeconns);
2153 entry.inactconns = atomic_read(&dest->inactconns);
2154 entry.persistconns = atomic_read(&dest->persistconns);
2155 ip_vs_copy_stats(&entry.stats, &dest->stats);
2156 if (copy_to_user(&uptr->entrytable[count],
2157 &entry, sizeof(entry))) {
2158 ret = -EFAULT;
2159 break;
2160 }
2161 count++;
2162 }
2163 ip_vs_service_put(svc);
2164 } else
2165 ret = -ESRCH;
2166 return ret;
2167}
2168
2169static inline void
2170__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2171{
2172#ifdef CONFIG_IP_VS_PROTO_TCP
2173 u->tcp_timeout =
2174 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2175 u->tcp_fin_timeout =
2176 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2177#endif
2178#ifdef CONFIG_IP_VS_PROTO_UDP
2179 u->udp_timeout =
2180 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2181#endif
2182}
2183
2184
/*
 * Helpers for the get-sockopt argument length table.
 *
 * GET_CMDID() turns a get command number into its index in
 * get_arglen[].  The argument and the whole expansion are
 * parenthesized so the macro stays correct when it is handed an
 * expression or used inside a larger one.
 */
#define GET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
/* Minimum user-space argument size for each get command */
#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
/* two entries: one for the master daemon, one for the backup */
#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
2192
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08002193static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002194 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2195 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2196 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2197 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2198 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2199 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2200 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2201};
2202
2203static int
2204do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2205{
2206 unsigned char arg[128];
2207 int ret = 0;
2208
2209 if (!capable(CAP_NET_ADMIN))
2210 return -EPERM;
2211
2212 if (*len < get_arglen[GET_CMDID(cmd)]) {
2213 IP_VS_ERR("get_ctl: len %u < %u\n",
2214 *len, get_arglen[GET_CMDID(cmd)]);
2215 return -EINVAL;
2216 }
2217
2218 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2219 return -EFAULT;
2220
Ingo Molnar14cc3e22006-03-26 01:37:14 -08002221 if (mutex_lock_interruptible(&__ip_vs_mutex))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002222 return -ERESTARTSYS;
2223
2224 switch (cmd) {
2225 case IP_VS_SO_GET_VERSION:
2226 {
2227 char buf[64];
2228
2229 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2230 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2231 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2232 ret = -EFAULT;
2233 goto out;
2234 }
2235 *len = strlen(buf)+1;
2236 }
2237 break;
2238
2239 case IP_VS_SO_GET_INFO:
2240 {
2241 struct ip_vs_getinfo info;
2242 info.version = IP_VS_VERSION_CODE;
2243 info.size = IP_VS_CONN_TAB_SIZE;
2244 info.num_services = ip_vs_num_services;
2245 if (copy_to_user(user, &info, sizeof(info)) != 0)
2246 ret = -EFAULT;
2247 }
2248 break;
2249
2250 case IP_VS_SO_GET_SERVICES:
2251 {
2252 struct ip_vs_get_services *get;
2253 int size;
2254
2255 get = (struct ip_vs_get_services *)arg;
2256 size = sizeof(*get) +
2257 sizeof(struct ip_vs_service_entry) * get->num_services;
2258 if (*len != size) {
2259 IP_VS_ERR("length: %u != %u\n", *len, size);
2260 ret = -EINVAL;
2261 goto out;
2262 }
2263 ret = __ip_vs_get_service_entries(get, user);
2264 }
2265 break;
2266
2267 case IP_VS_SO_GET_SERVICE:
2268 {
2269 struct ip_vs_service_entry *entry;
2270 struct ip_vs_service *svc;
2271
2272 entry = (struct ip_vs_service_entry *)arg;
2273 if (entry->fwmark)
2274 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2275 else
2276 svc = __ip_vs_service_get(entry->protocol,
2277 entry->addr, entry->port);
2278 if (svc) {
2279 ip_vs_copy_service(entry, svc);
2280 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2281 ret = -EFAULT;
2282 ip_vs_service_put(svc);
2283 } else
2284 ret = -ESRCH;
2285 }
2286 break;
2287
2288 case IP_VS_SO_GET_DESTS:
2289 {
2290 struct ip_vs_get_dests *get;
2291 int size;
2292
2293 get = (struct ip_vs_get_dests *)arg;
2294 size = sizeof(*get) +
2295 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2296 if (*len != size) {
2297 IP_VS_ERR("length: %u != %u\n", *len, size);
2298 ret = -EINVAL;
2299 goto out;
2300 }
2301 ret = __ip_vs_get_dest_entries(get, user);
2302 }
2303 break;
2304
2305 case IP_VS_SO_GET_TIMEOUT:
2306 {
2307 struct ip_vs_timeout_user t;
2308
2309 __ip_vs_get_timeouts(&t);
2310 if (copy_to_user(user, &t, sizeof(t)) != 0)
2311 ret = -EFAULT;
2312 }
2313 break;
2314
2315 case IP_VS_SO_GET_DAEMON:
2316 {
2317 struct ip_vs_daemon_user d[2];
2318
2319 memset(&d, 0, sizeof(d));
2320 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2321 d[0].state = IP_VS_STATE_MASTER;
pageexec4da62fc2005-06-26 16:00:19 -07002322 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002323 d[0].syncid = ip_vs_master_syncid;
2324 }
2325 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2326 d[1].state = IP_VS_STATE_BACKUP;
pageexec4da62fc2005-06-26 16:00:19 -07002327 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002328 d[1].syncid = ip_vs_backup_syncid;
2329 }
2330 if (copy_to_user(user, &d, sizeof(d)) != 0)
2331 ret = -EFAULT;
2332 }
2333 break;
2334
2335 default:
2336 ret = -EINVAL;
2337 }
2338
2339 out:
Ingo Molnar14cc3e22006-03-26 01:37:14 -08002340 mutex_unlock(&__ip_vs_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002341 return ret;
2342}
2343
2344
2345static struct nf_sockopt_ops ip_vs_sockopts = {
2346 .pf = PF_INET,
2347 .set_optmin = IP_VS_BASE_CTL,
2348 .set_optmax = IP_VS_SO_SET_MAX+1,
2349 .set = do_ip_vs_set_ctl,
2350 .get_optmin = IP_VS_BASE_CTL,
2351 .get_optmax = IP_VS_SO_GET_MAX+1,
2352 .get = do_ip_vs_get_ctl,
Neil Horman16fcec32007-09-11 11:28:26 +02002353 .owner = THIS_MODULE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002354};
2355
2356
2357int ip_vs_control_init(void)
2358{
2359 int ret;
2360 int idx;
2361
2362 EnterFunction(2);
2363
2364 ret = nf_register_sockopt(&ip_vs_sockopts);
2365 if (ret) {
2366 IP_VS_ERR("cannot register sockopt.\n");
2367 return ret;
2368 }
2369
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02002370 proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
2371 proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002372
Eric W. Biederman0b4d4142007-02-14 00:34:09 -08002373 sysctl_header = register_sysctl_table(vs_root_table);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002374
2375 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2376 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2377 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2378 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2379 }
2380 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2381 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2382 }
2383
2384 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2385 spin_lock_init(&ip_vs_stats.lock);
2386 ip_vs_new_estimator(&ip_vs_stats);
2387
2388 /* Hook the defense timer */
2389 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2390
2391 LeaveFunction(2);
2392 return 0;
2393}
2394
2395
2396void ip_vs_control_cleanup(void)
2397{
2398 EnterFunction(2);
2399 ip_vs_trash_cleanup();
2400 cancel_rearming_delayed_work(&defense_work);
Oleg Nesterov28e53bd2007-05-09 02:34:22 -07002401 cancel_work_sync(&defense_work.work);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002402 ip_vs_kill_estimator(&ip_vs_stats);
2403 unregister_sysctl_table(sysctl_header);
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02002404 proc_net_remove(&init_net, "ip_vs_stats");
2405 proc_net_remove(&init_net, "ip_vs");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002406 nf_unregister_sockopt(&ip_vs_sockopts);
2407 LeaveFunction(2);
2408}