blob: b64cf45a9eadde464169663688b53da624a2a0fd [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * Changes:
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/init.h>
25#include <linux/types.h>
Randy Dunlap4fc268d2006-01-11 12:17:47 -080026#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070027#include <linux/fs.h>
28#include <linux/sysctl.h>
29#include <linux/proc_fs.h>
30#include <linux/workqueue.h>
31#include <linux/swap.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070032#include <linux/seq_file.h>
33
34#include <linux/netfilter.h>
35#include <linux/netfilter_ipv4.h>
Ingo Molnar14cc3e22006-03-26 01:37:14 -080036#include <linux/mutex.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070037
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020038#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070039#include <net/ip.h>
Arnaldo Carvalho de Melo14c85022005-12-27 02:43:12 -020040#include <net/route.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070041#include <net/sock.h>
42
43#include <asm/uaccess.h>
44
45#include <net/ip_vs.h>
46
47/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
Ingo Molnar14cc3e22006-03-26 01:37:14 -080048static DEFINE_MUTEX(__ip_vs_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -070049
50/* lock for service table */
51static DEFINE_RWLOCK(__ip_vs_svc_lock);
52
53/* lock for table with the real services */
54static DEFINE_RWLOCK(__ip_vs_rs_lock);
55
56/* lock for state and timeout tables */
57static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
58
59/* lock for drop entry handling */
60static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
61
62/* lock for drop packet handling */
63static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
64
65/* 1/rate drop and drop-entry variables */
66int ip_vs_drop_rate = 0;
67int ip_vs_drop_counter = 0;
68static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
69
70/* number of virtual services */
71static int ip_vs_num_services = 0;
72
73/* sysctl variables */
74static int sysctl_ip_vs_drop_entry = 0;
75static int sysctl_ip_vs_drop_packet = 0;
76static int sysctl_ip_vs_secure_tcp = 0;
77static int sysctl_ip_vs_amemthresh = 1024;
78static int sysctl_ip_vs_am_droprate = 10;
79int sysctl_ip_vs_cache_bypass = 0;
80int sysctl_ip_vs_expire_nodest_conn = 0;
81int sysctl_ip_vs_expire_quiescent_template = 0;
82int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
83int sysctl_ip_vs_nat_icmp_send = 0;
84
85
#ifdef CONFIG_IP_VS_DEBUG
/* Debug level knob; only present when CONFIG_IP_VS_DEBUG is enabled. */
static int sysctl_ip_vs_debug_level;

/*
 *	Returns the current value of sysctl_ip_vs_debug_level.
 */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
#endif /* CONFIG_IP_VS_DEBUG */
94
95/*
Julian Anastasovaf9debd2005-07-11 20:59:57 -070096 * update_defense_level is called from keventd and from sysctl,
97 * so it needs to protect itself from softirqs
Linus Torvalds1da177e2005-04-16 15:20:36 -070098 */
99static void update_defense_level(void)
100{
101 struct sysinfo i;
102 static int old_secure_tcp = 0;
103 int availmem;
104 int nomem;
105 int to_change = -1;
106
107 /* we only count free and buffered memory (in pages) */
108 si_meminfo(&i);
109 availmem = i.freeram + i.bufferram;
110 /* however in linux 2.5 the i.bufferram is total page cache size,
111 we need adjust it */
112 /* si_swapinfo(&i); */
113 /* availmem = availmem - (i.totalswap - i.freeswap); */
114
115 nomem = (availmem < sysctl_ip_vs_amemthresh);
116
Julian Anastasovaf9debd2005-07-11 20:59:57 -0700117 local_bh_disable();
118
Linus Torvalds1da177e2005-04-16 15:20:36 -0700119 /* drop_entry */
120 spin_lock(&__ip_vs_dropentry_lock);
121 switch (sysctl_ip_vs_drop_entry) {
122 case 0:
123 atomic_set(&ip_vs_dropentry, 0);
124 break;
125 case 1:
126 if (nomem) {
127 atomic_set(&ip_vs_dropentry, 1);
128 sysctl_ip_vs_drop_entry = 2;
129 } else {
130 atomic_set(&ip_vs_dropentry, 0);
131 }
132 break;
133 case 2:
134 if (nomem) {
135 atomic_set(&ip_vs_dropentry, 1);
136 } else {
137 atomic_set(&ip_vs_dropentry, 0);
138 sysctl_ip_vs_drop_entry = 1;
139 };
140 break;
141 case 3:
142 atomic_set(&ip_vs_dropentry, 1);
143 break;
144 }
145 spin_unlock(&__ip_vs_dropentry_lock);
146
147 /* drop_packet */
148 spin_lock(&__ip_vs_droppacket_lock);
149 switch (sysctl_ip_vs_drop_packet) {
150 case 0:
151 ip_vs_drop_rate = 0;
152 break;
153 case 1:
154 if (nomem) {
155 ip_vs_drop_rate = ip_vs_drop_counter
156 = sysctl_ip_vs_amemthresh /
157 (sysctl_ip_vs_amemthresh-availmem);
158 sysctl_ip_vs_drop_packet = 2;
159 } else {
160 ip_vs_drop_rate = 0;
161 }
162 break;
163 case 2:
164 if (nomem) {
165 ip_vs_drop_rate = ip_vs_drop_counter
166 = sysctl_ip_vs_amemthresh /
167 (sysctl_ip_vs_amemthresh-availmem);
168 } else {
169 ip_vs_drop_rate = 0;
170 sysctl_ip_vs_drop_packet = 1;
171 }
172 break;
173 case 3:
174 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
175 break;
176 }
177 spin_unlock(&__ip_vs_droppacket_lock);
178
179 /* secure_tcp */
180 write_lock(&__ip_vs_securetcp_lock);
181 switch (sysctl_ip_vs_secure_tcp) {
182 case 0:
183 if (old_secure_tcp >= 2)
184 to_change = 0;
185 break;
186 case 1:
187 if (nomem) {
188 if (old_secure_tcp < 2)
189 to_change = 1;
190 sysctl_ip_vs_secure_tcp = 2;
191 } else {
192 if (old_secure_tcp >= 2)
193 to_change = 0;
194 }
195 break;
196 case 2:
197 if (nomem) {
198 if (old_secure_tcp < 2)
199 to_change = 1;
200 } else {
201 if (old_secure_tcp >= 2)
202 to_change = 0;
203 sysctl_ip_vs_secure_tcp = 1;
204 }
205 break;
206 case 3:
207 if (old_secure_tcp < 2)
208 to_change = 1;
209 break;
210 }
211 old_secure_tcp = sysctl_ip_vs_secure_tcp;
212 if (to_change >= 0)
213 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
214 write_unlock(&__ip_vs_securetcp_lock);
Julian Anastasovaf9debd2005-07-11 20:59:57 -0700215
216 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700217}
218
219
220/*
221 * Timer for checking the defense
222 */
223#define DEFENSE_TIMER_PERIOD 1*HZ
David Howellsc4028952006-11-22 14:57:56 +0000224static void defense_work_handler(struct work_struct *work);
225static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700226
David Howellsc4028952006-11-22 14:57:56 +0000227static void defense_work_handler(struct work_struct *work)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700228{
229 update_defense_level();
230 if (atomic_read(&ip_vs_dropentry))
231 ip_vs_random_dropentry();
232
233 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
234}
235
236int
237ip_vs_use_count_inc(void)
238{
239 return try_module_get(THIS_MODULE);
240}
241
242void
243ip_vs_use_count_dec(void)
244{
245 module_put(THIS_MODULE);
246}
247
248
249/*
250 * Hash table: for virtual service lookups
251 */
252#define IP_VS_SVC_TAB_BITS 8
253#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
254#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
255
256/* the service table hashed by <protocol, addr, port> */
257static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
258/* the service table hashed by fwmark */
259static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
260
261/*
262 * Hash table: for real service lookups
263 */
264#define IP_VS_RTAB_BITS 4
265#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
266#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
267
268static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
269
270/*
271 * Trash for destinations
272 */
273static LIST_HEAD(ip_vs_dest_trash);
274
275/*
276 * FTP & NULL virtual service counters
277 */
278static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
279static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
280
281
282/*
283 * Returns hash value for virtual service
284 */
285static __inline__ unsigned
Al Viro014d7302006-09-28 14:29:52 -0700286ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700287{
288 register unsigned porth = ntohs(port);
289
290 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
291 & IP_VS_SVC_TAB_MASK;
292}
293
294/*
295 * Returns hash value of fwmark for virtual service lookup
296 */
297static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
298{
299 return fwmark & IP_VS_SVC_TAB_MASK;
300}
301
302/*
303 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
304 * or in the ip_vs_svc_fwm_table by fwmark.
305 * Should be called with locked tables.
306 */
307static int ip_vs_svc_hash(struct ip_vs_service *svc)
308{
309 unsigned hash;
310
311 if (svc->flags & IP_VS_SVC_F_HASHED) {
312 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
313 "called from %p\n", __builtin_return_address(0));
314 return 0;
315 }
316
317 if (svc->fwmark == 0) {
318 /*
319 * Hash it by <protocol,addr,port> in ip_vs_svc_table
320 */
321 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
322 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
323 } else {
324 /*
325 * Hash it by fwmark in ip_vs_svc_fwm_table
326 */
327 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
328 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
329 }
330
331 svc->flags |= IP_VS_SVC_F_HASHED;
332 /* increase its refcnt because it is referenced by the svc table */
333 atomic_inc(&svc->refcnt);
334 return 1;
335}
336
337
338/*
339 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
340 * Should be called with locked tables.
341 */
342static int ip_vs_svc_unhash(struct ip_vs_service *svc)
343{
344 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
345 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
346 "called from %p\n", __builtin_return_address(0));
347 return 0;
348 }
349
350 if (svc->fwmark == 0) {
351 /* Remove it from the ip_vs_svc_table table */
352 list_del(&svc->s_list);
353 } else {
354 /* Remove it from the ip_vs_svc_fwm_table table */
355 list_del(&svc->f_list);
356 }
357
358 svc->flags &= ~IP_VS_SVC_F_HASHED;
359 atomic_dec(&svc->refcnt);
360 return 1;
361}
362
363
364/*
365 * Get service by {proto,addr,port} in the service table.
366 */
367static __inline__ struct ip_vs_service *
Al Viro014d7302006-09-28 14:29:52 -0700368__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700369{
370 unsigned hash;
371 struct ip_vs_service *svc;
372
373 /* Check for "full" addressed entries */
374 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
375
376 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
377 if ((svc->addr == vaddr)
378 && (svc->port == vport)
379 && (svc->protocol == protocol)) {
380 /* HIT */
381 atomic_inc(&svc->usecnt);
382 return svc;
383 }
384 }
385
386 return NULL;
387}
388
389
390/*
391 * Get service by {fwmark} in the service table.
392 */
393static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
394{
395 unsigned hash;
396 struct ip_vs_service *svc;
397
398 /* Check for fwmark addressed entries */
399 hash = ip_vs_svc_fwm_hashkey(fwmark);
400
401 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
402 if (svc->fwmark == fwmark) {
403 /* HIT */
404 atomic_inc(&svc->usecnt);
405 return svc;
406 }
407 }
408
409 return NULL;
410}
411
412struct ip_vs_service *
Al Viro014d7302006-09-28 14:29:52 -0700413ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700414{
415 struct ip_vs_service *svc;
416
417 read_lock(&__ip_vs_svc_lock);
418
419 /*
420 * Check the table hashed by fwmark first
421 */
422 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
423 goto out;
424
425 /*
426 * Check the table hashed by <protocol,addr,port>
427 * for "full" addressed entries
428 */
429 svc = __ip_vs_service_get(protocol, vaddr, vport);
430
431 if (svc == NULL
432 && protocol == IPPROTO_TCP
433 && atomic_read(&ip_vs_ftpsvc_counter)
434 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
435 /*
436 * Check if ftp service entry exists, the packet
437 * might belong to FTP data connections.
438 */
439 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
440 }
441
442 if (svc == NULL
443 && atomic_read(&ip_vs_nullsvc_counter)) {
444 /*
445 * Check if the catch-all port (port zero) exists
446 */
447 svc = __ip_vs_service_get(protocol, vaddr, 0);
448 }
449
450 out:
451 read_unlock(&__ip_vs_svc_lock);
452
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800453 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700454 fwmark, ip_vs_proto_name(protocol),
455 NIPQUAD(vaddr), ntohs(vport),
456 svc?"hit":"not hit");
457
458 return svc;
459}
460
461
462static inline void
463__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
464{
465 atomic_inc(&svc->refcnt);
466 dest->svc = svc;
467}
468
469static inline void
470__ip_vs_unbind_svc(struct ip_vs_dest *dest)
471{
472 struct ip_vs_service *svc = dest->svc;
473
474 dest->svc = NULL;
475 if (atomic_dec_and_test(&svc->refcnt))
476 kfree(svc);
477}
478
479
480/*
481 * Returns hash value for real service
482 */
Al Viro014d7302006-09-28 14:29:52 -0700483static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700484{
485 register unsigned porth = ntohs(port);
486
487 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
488 & IP_VS_RTAB_MASK;
489}
490
491/*
492 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
493 * should be called with locked tables.
494 */
495static int ip_vs_rs_hash(struct ip_vs_dest *dest)
496{
497 unsigned hash;
498
499 if (!list_empty(&dest->d_list)) {
500 return 0;
501 }
502
503 /*
504 * Hash by proto,addr,port,
505 * which are the parameters of the real service.
506 */
507 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
508 list_add(&dest->d_list, &ip_vs_rtable[hash]);
509
510 return 1;
511}
512
513/*
514 * UNhashes ip_vs_dest from ip_vs_rtable.
515 * should be called with locked tables.
516 */
517static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
518{
519 /*
520 * Remove it from the ip_vs_rtable table.
521 */
522 if (!list_empty(&dest->d_list)) {
523 list_del(&dest->d_list);
524 INIT_LIST_HEAD(&dest->d_list);
525 }
526
527 return 1;
528}
529
530/*
531 * Lookup real service by <proto,addr,port> in the real service table.
532 */
533struct ip_vs_dest *
Al Viro014d7302006-09-28 14:29:52 -0700534ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700535{
536 unsigned hash;
537 struct ip_vs_dest *dest;
538
539 /*
540 * Check for "full" addressed entries
541 * Return the first found entry
542 */
543 hash = ip_vs_rs_hashkey(daddr, dport);
544
545 read_lock(&__ip_vs_rs_lock);
546 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
547 if ((dest->addr == daddr)
548 && (dest->port == dport)
549 && ((dest->protocol == protocol) ||
550 dest->vfwmark)) {
551 /* HIT */
552 read_unlock(&__ip_vs_rs_lock);
553 return dest;
554 }
555 }
556 read_unlock(&__ip_vs_rs_lock);
557
558 return NULL;
559}
560
561/*
562 * Lookup destination by {addr,port} in the given service
563 */
564static struct ip_vs_dest *
Al Viro014d7302006-09-28 14:29:52 -0700565ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700566{
567 struct ip_vs_dest *dest;
568
569 /*
570 * Find the destination for the given service
571 */
572 list_for_each_entry(dest, &svc->destinations, n_list) {
573 if ((dest->addr == daddr) && (dest->port == dport)) {
574 /* HIT */
575 return dest;
576 }
577 }
578
579 return NULL;
580}
581
Rumen G. Bogdanovski1e356f92007-11-07 02:35:54 -0800582/*
583 * Find destination by {daddr,dport,vaddr,protocol}
584 * Cretaed to be used in ip_vs_process_message() in
585 * the backup synchronization daemon. It finds the
586 * destination to be bound to the received connection
587 * on the backup.
588 *
589 * ip_vs_lookup_real_service() looked promissing, but
590 * seems not working as expected.
591 */
592struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
593 __be32 vaddr, __be16 vport, __u16 protocol)
594{
595 struct ip_vs_dest *dest;
596 struct ip_vs_service *svc;
597
598 svc = ip_vs_service_get(0, protocol, vaddr, vport);
599 if (!svc)
600 return NULL;
601 dest = ip_vs_lookup_dest(svc, daddr, dport);
602 if (dest)
603 atomic_inc(&dest->refcnt);
604 ip_vs_service_put(svc);
605 return dest;
606}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700607
608/*
609 * Lookup dest by {svc,addr,port} in the destination trash.
610 * The destination trash is used to hold the destinations that are removed
611 * from the service table but are still referenced by some conn entries.
612 * The reason to add the destination trash is when the dest is temporary
613 * down (either by administrator or by monitor program), the dest can be
614 * picked back from the trash, the remaining connections to the dest can
615 * continue, and the counting information of the dest is also useful for
616 * scheduling.
617 */
618static struct ip_vs_dest *
Al Viro014d7302006-09-28 14:29:52 -0700619ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700620{
621 struct ip_vs_dest *dest, *nxt;
622
623 /*
624 * Find the destination in trash
625 */
626 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
627 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800628 "dest->refcnt=%d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700629 dest->vfwmark,
630 NIPQUAD(dest->addr), ntohs(dest->port),
631 atomic_read(&dest->refcnt));
632 if (dest->addr == daddr &&
633 dest->port == dport &&
634 dest->vfwmark == svc->fwmark &&
635 dest->protocol == svc->protocol &&
636 (svc->fwmark ||
637 (dest->vaddr == svc->addr &&
638 dest->vport == svc->port))) {
639 /* HIT */
640 return dest;
641 }
642
643 /*
644 * Try to purge the destination from trash if not referenced
645 */
646 if (atomic_read(&dest->refcnt) == 1) {
647 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
648 "from trash\n",
649 dest->vfwmark,
650 NIPQUAD(dest->addr), ntohs(dest->port));
651 list_del(&dest->n_list);
652 ip_vs_dst_reset(dest);
653 __ip_vs_unbind_svc(dest);
654 kfree(dest);
655 }
656 }
657
658 return NULL;
659}
660
661
662/*
663 * Clean up all the destinations in the trash
664 * Called by the ip_vs_control_cleanup()
665 *
666 * When the ip_vs_control_clearup is activated by ipvs module exit,
667 * the service tables must have been flushed and all the connections
668 * are expired, and the refcnt of each destination in the trash must
669 * be 1, so we simply release them here.
670 */
671static void ip_vs_trash_cleanup(void)
672{
673 struct ip_vs_dest *dest, *nxt;
674
675 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
676 list_del(&dest->n_list);
677 ip_vs_dst_reset(dest);
678 __ip_vs_unbind_svc(dest);
679 kfree(dest);
680 }
681}
682
683
684static void
685ip_vs_zero_stats(struct ip_vs_stats *stats)
686{
687 spin_lock_bh(&stats->lock);
688 memset(stats, 0, (char *)&stats->lock - (char *)stats);
689 spin_unlock_bh(&stats->lock);
690 ip_vs_zero_estimator(stats);
691}
692
693/*
694 * Update a destination in the given service
695 */
696static void
697__ip_vs_update_dest(struct ip_vs_service *svc,
698 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
699{
700 int conn_flags;
701
702 /* set the weight and the flags */
703 atomic_set(&dest->weight, udest->weight);
704 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
705
706 /* check if local node and update the flags */
707 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
708 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
709 | IP_VS_CONN_F_LOCALNODE;
710 }
711
712 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
713 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
714 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
715 } else {
716 /*
717 * Put the real service in ip_vs_rtable if not present.
718 * For now only for NAT!
719 */
720 write_lock_bh(&__ip_vs_rs_lock);
721 ip_vs_rs_hash(dest);
722 write_unlock_bh(&__ip_vs_rs_lock);
723 }
724 atomic_set(&dest->conn_flags, conn_flags);
725
726 /* bind the service */
727 if (!dest->svc) {
728 __ip_vs_bind_svc(dest, svc);
729 } else {
730 if (dest->svc != svc) {
731 __ip_vs_unbind_svc(dest);
732 ip_vs_zero_stats(&dest->stats);
733 __ip_vs_bind_svc(dest, svc);
734 }
735 }
736
737 /* set the dest status flags */
738 dest->flags |= IP_VS_DEST_F_AVAILABLE;
739
740 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
741 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
742 dest->u_threshold = udest->u_threshold;
743 dest->l_threshold = udest->l_threshold;
744}
745
746
747/*
748 * Create a destination for the given service
749 */
750static int
751ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
752 struct ip_vs_dest **dest_p)
753{
754 struct ip_vs_dest *dest;
755 unsigned atype;
756
757 EnterFunction(2);
758
759 atype = inet_addr_type(udest->addr);
760 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
761 return -EINVAL;
762
Panagiotis Issaris0da974f2006-07-21 14:51:30 -0700763 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700764 if (dest == NULL) {
765 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
766 return -ENOMEM;
767 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700768
769 dest->protocol = svc->protocol;
770 dest->vaddr = svc->addr;
771 dest->vport = svc->port;
772 dest->vfwmark = svc->fwmark;
773 dest->addr = udest->addr;
774 dest->port = udest->port;
775
776 atomic_set(&dest->activeconns, 0);
777 atomic_set(&dest->inactconns, 0);
778 atomic_set(&dest->persistconns, 0);
779 atomic_set(&dest->refcnt, 0);
780
781 INIT_LIST_HEAD(&dest->d_list);
782 spin_lock_init(&dest->dst_lock);
783 spin_lock_init(&dest->stats.lock);
784 __ip_vs_update_dest(svc, dest, udest);
785 ip_vs_new_estimator(&dest->stats);
786
787 *dest_p = dest;
788
789 LeaveFunction(2);
790 return 0;
791}
792
793
794/*
795 * Add a destination into an existing service
796 */
797static int
798ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
799{
800 struct ip_vs_dest *dest;
Al Viro014d7302006-09-28 14:29:52 -0700801 __be32 daddr = udest->addr;
802 __be16 dport = udest->port;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700803 int ret;
804
805 EnterFunction(2);
806
807 if (udest->weight < 0) {
808 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
809 return -ERANGE;
810 }
811
812 if (udest->l_threshold > udest->u_threshold) {
813 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
814 "upper threshold\n");
815 return -ERANGE;
816 }
817
818 /*
819 * Check if the dest already exists in the list
820 */
821 dest = ip_vs_lookup_dest(svc, daddr, dport);
822 if (dest != NULL) {
823 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
824 return -EEXIST;
825 }
826
827 /*
828 * Check if the dest already exists in the trash and
829 * is from the same service
830 */
831 dest = ip_vs_trash_get_dest(svc, daddr, dport);
832 if (dest != NULL) {
833 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800834 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700835 NIPQUAD(daddr), ntohs(dport),
836 atomic_read(&dest->refcnt),
837 dest->vfwmark,
838 NIPQUAD(dest->vaddr),
839 ntohs(dest->vport));
840 __ip_vs_update_dest(svc, dest, udest);
841
842 /*
843 * Get the destination from the trash
844 */
845 list_del(&dest->n_list);
846
847 ip_vs_new_estimator(&dest->stats);
848
849 write_lock_bh(&__ip_vs_svc_lock);
850
851 /*
852 * Wait until all other svc users go away.
853 */
854 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
855
856 list_add(&dest->n_list, &svc->destinations);
857 svc->num_dests++;
858
859 /* call the update_service function of its scheduler */
860 svc->scheduler->update_service(svc);
861
862 write_unlock_bh(&__ip_vs_svc_lock);
863 return 0;
864 }
865
866 /*
867 * Allocate and initialize the dest structure
868 */
869 ret = ip_vs_new_dest(svc, udest, &dest);
870 if (ret) {
871 return ret;
872 }
873
874 /*
875 * Add the dest entry into the list
876 */
877 atomic_inc(&dest->refcnt);
878
879 write_lock_bh(&__ip_vs_svc_lock);
880
881 /*
882 * Wait until all other svc users go away.
883 */
884 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
885
886 list_add(&dest->n_list, &svc->destinations);
887 svc->num_dests++;
888
889 /* call the update_service function of its scheduler */
890 svc->scheduler->update_service(svc);
891
892 write_unlock_bh(&__ip_vs_svc_lock);
893
894 LeaveFunction(2);
895
896 return 0;
897}
898
899
900/*
901 * Edit a destination in the given service
902 */
903static int
904ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
905{
906 struct ip_vs_dest *dest;
Al Viro014d7302006-09-28 14:29:52 -0700907 __be32 daddr = udest->addr;
908 __be16 dport = udest->port;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700909
910 EnterFunction(2);
911
912 if (udest->weight < 0) {
913 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
914 return -ERANGE;
915 }
916
917 if (udest->l_threshold > udest->u_threshold) {
918 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
919 "upper threshold\n");
920 return -ERANGE;
921 }
922
923 /*
924 * Lookup the destination list
925 */
926 dest = ip_vs_lookup_dest(svc, daddr, dport);
927 if (dest == NULL) {
928 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
929 return -ENOENT;
930 }
931
932 __ip_vs_update_dest(svc, dest, udest);
933
934 write_lock_bh(&__ip_vs_svc_lock);
935
936 /* Wait until all other svc users go away */
Heiko Carstenscae7ca32007-08-10 15:50:30 -0700937 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700938
939 /* call the update_service, because server weight may be changed */
940 svc->scheduler->update_service(svc);
941
942 write_unlock_bh(&__ip_vs_svc_lock);
943
944 LeaveFunction(2);
945
946 return 0;
947}
948
949
950/*
951 * Delete a destination (must be already unlinked from the service)
952 */
953static void __ip_vs_del_dest(struct ip_vs_dest *dest)
954{
955 ip_vs_kill_estimator(&dest->stats);
956
957 /*
958 * Remove it from the d-linked list with the real services.
959 */
960 write_lock_bh(&__ip_vs_rs_lock);
961 ip_vs_rs_unhash(dest);
962 write_unlock_bh(&__ip_vs_rs_lock);
963
964 /*
965 * Decrease the refcnt of the dest, and free the dest
966 * if nobody refers to it (refcnt=0). Otherwise, throw
967 * the destination into the trash.
968 */
969 if (atomic_dec_and_test(&dest->refcnt)) {
970 ip_vs_dst_reset(dest);
971 /* simply decrease svc->refcnt here, let the caller check
972 and release the service if nobody refers to it.
973 Only user context can release destination and service,
974 and only one user context can update virtual service at a
975 time, so the operation here is OK */
976 atomic_dec(&dest->svc->refcnt);
977 kfree(dest);
978 } else {
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800979 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
980 "dest->refcnt=%d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700981 NIPQUAD(dest->addr), ntohs(dest->port),
982 atomic_read(&dest->refcnt));
983 list_add(&dest->n_list, &ip_vs_dest_trash);
984 atomic_inc(&dest->refcnt);
985 }
986}
987
988
989/*
990 * Unlink a destination from the given service
991 */
992static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
993 struct ip_vs_dest *dest,
994 int svcupd)
995{
996 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
997
998 /*
999 * Remove it from the d-linked destination list.
1000 */
1001 list_del(&dest->n_list);
1002 svc->num_dests--;
1003 if (svcupd) {
1004 /*
1005 * Call the update_service function of its scheduler
1006 */
1007 svc->scheduler->update_service(svc);
1008 }
1009}
1010
1011
1012/*
1013 * Delete a destination server in the given service
1014 */
1015static int
1016ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
1017{
1018 struct ip_vs_dest *dest;
Al Viro014d7302006-09-28 14:29:52 -07001019 __be32 daddr = udest->addr;
1020 __be16 dport = udest->port;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001021
1022 EnterFunction(2);
1023
1024 dest = ip_vs_lookup_dest(svc, daddr, dport);
1025 if (dest == NULL) {
1026 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1027 return -ENOENT;
1028 }
1029
1030 write_lock_bh(&__ip_vs_svc_lock);
1031
1032 /*
1033 * Wait until all other svc users go away.
1034 */
1035 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1036
1037 /*
1038 * Unlink dest from the service
1039 */
1040 __ip_vs_unlink_dest(svc, dest, 1);
1041
1042 write_unlock_bh(&__ip_vs_svc_lock);
1043
1044 /*
1045 * Delete the destination
1046 */
1047 __ip_vs_del_dest(dest);
1048
1049 LeaveFunction(2);
1050
1051 return 0;
1052}
1053
1054
1055/*
1056 * Add a service into the service hash table
1057 */
1058static int
1059ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1060{
1061 int ret = 0;
1062 struct ip_vs_scheduler *sched = NULL;
1063 struct ip_vs_service *svc = NULL;
1064
1065 /* increase the module use count */
1066 ip_vs_use_count_inc();
1067
1068 /* Lookup the scheduler by 'u->sched_name' */
1069 sched = ip_vs_scheduler_get(u->sched_name);
1070 if (sched == NULL) {
1071 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1072 u->sched_name);
1073 ret = -ENOENT;
1074 goto out_mod_dec;
1075 }
1076
Panagiotis Issaris0da974f2006-07-21 14:51:30 -07001077 svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001078 if (svc == NULL) {
1079 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1080 ret = -ENOMEM;
1081 goto out_err;
1082 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001083
1084 /* I'm the first user of the service */
1085 atomic_set(&svc->usecnt, 1);
1086 atomic_set(&svc->refcnt, 0);
1087
1088 svc->protocol = u->protocol;
1089 svc->addr = u->addr;
1090 svc->port = u->port;
1091 svc->fwmark = u->fwmark;
1092 svc->flags = u->flags;
1093 svc->timeout = u->timeout * HZ;
1094 svc->netmask = u->netmask;
1095
1096 INIT_LIST_HEAD(&svc->destinations);
1097 rwlock_init(&svc->sched_lock);
1098 spin_lock_init(&svc->stats.lock);
1099
1100 /* Bind the scheduler */
1101 ret = ip_vs_bind_scheduler(svc, sched);
1102 if (ret)
1103 goto out_err;
1104 sched = NULL;
1105
1106 /* Update the virtual service counters */
1107 if (svc->port == FTPPORT)
1108 atomic_inc(&ip_vs_ftpsvc_counter);
1109 else if (svc->port == 0)
1110 atomic_inc(&ip_vs_nullsvc_counter);
1111
1112 ip_vs_new_estimator(&svc->stats);
1113 ip_vs_num_services++;
1114
1115 /* Hash the service into the service table */
1116 write_lock_bh(&__ip_vs_svc_lock);
1117 ip_vs_svc_hash(svc);
1118 write_unlock_bh(&__ip_vs_svc_lock);
1119
1120 *svc_p = svc;
1121 return 0;
1122
1123 out_err:
1124 if (svc != NULL) {
1125 if (svc->scheduler)
1126 ip_vs_unbind_scheduler(svc);
1127 if (svc->inc) {
1128 local_bh_disable();
1129 ip_vs_app_inc_put(svc->inc);
1130 local_bh_enable();
1131 }
1132 kfree(svc);
1133 }
1134 ip_vs_scheduler_put(sched);
1135
1136 out_mod_dec:
1137 /* decrease the module use count */
1138 ip_vs_use_count_dec();
1139
1140 return ret;
1141}
1142
1143
1144/*
1145 * Edit a service and bind it with a new scheduler
1146 */
1147static int
1148ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1149{
1150 struct ip_vs_scheduler *sched, *old_sched;
1151 int ret = 0;
1152
1153 /*
1154 * Lookup the scheduler, by 'u->sched_name'
1155 */
1156 sched = ip_vs_scheduler_get(u->sched_name);
1157 if (sched == NULL) {
1158 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1159 u->sched_name);
1160 return -ENOENT;
1161 }
1162 old_sched = sched;
1163
1164 write_lock_bh(&__ip_vs_svc_lock);
1165
1166 /*
1167 * Wait until all other svc users go away.
1168 */
1169 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1170
1171 /*
1172 * Set the flags and timeout value
1173 */
1174 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1175 svc->timeout = u->timeout * HZ;
1176 svc->netmask = u->netmask;
1177
1178 old_sched = svc->scheduler;
1179 if (sched != old_sched) {
1180 /*
1181 * Unbind the old scheduler
1182 */
1183 if ((ret = ip_vs_unbind_scheduler(svc))) {
1184 old_sched = sched;
1185 goto out;
1186 }
1187
1188 /*
1189 * Bind the new scheduler
1190 */
1191 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1192 /*
1193 * If ip_vs_bind_scheduler fails, restore the old
1194 * scheduler.
1195 * The main reason of failure is out of memory.
1196 *
1197 * The question is if the old scheduler can be
1198 * restored all the time. TODO: if it cannot be
1199 * restored some time, we must delete the service,
1200 * otherwise the system may crash.
1201 */
1202 ip_vs_bind_scheduler(svc, old_sched);
1203 old_sched = sched;
1204 goto out;
1205 }
1206 }
1207
1208 out:
1209 write_unlock_bh(&__ip_vs_svc_lock);
1210
1211 if (old_sched)
1212 ip_vs_scheduler_put(old_sched);
1213
1214 return ret;
1215}
1216
1217
1218/*
1219 * Delete a service from the service list
1220 * - The service must be unlinked, unlocked and not referenced!
1221 * - We are called under _bh lock
1222 */
1223static void __ip_vs_del_service(struct ip_vs_service *svc)
1224{
1225 struct ip_vs_dest *dest, *nxt;
1226 struct ip_vs_scheduler *old_sched;
1227
1228 ip_vs_num_services--;
1229 ip_vs_kill_estimator(&svc->stats);
1230
1231 /* Unbind scheduler */
1232 old_sched = svc->scheduler;
1233 ip_vs_unbind_scheduler(svc);
1234 if (old_sched)
1235 ip_vs_scheduler_put(old_sched);
1236
1237 /* Unbind app inc */
1238 if (svc->inc) {
1239 ip_vs_app_inc_put(svc->inc);
1240 svc->inc = NULL;
1241 }
1242
1243 /*
1244 * Unlink the whole destination list
1245 */
1246 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1247 __ip_vs_unlink_dest(svc, dest, 0);
1248 __ip_vs_del_dest(dest);
1249 }
1250
1251 /*
1252 * Update the virtual service counters
1253 */
1254 if (svc->port == FTPPORT)
1255 atomic_dec(&ip_vs_ftpsvc_counter);
1256 else if (svc->port == 0)
1257 atomic_dec(&ip_vs_nullsvc_counter);
1258
1259 /*
1260 * Free the service if nobody refers to it
1261 */
1262 if (atomic_read(&svc->refcnt) == 0)
1263 kfree(svc);
1264
1265 /* decrease the module use count */
1266 ip_vs_use_count_dec();
1267}
1268
1269/*
1270 * Delete a service from the service list
1271 */
1272static int ip_vs_del_service(struct ip_vs_service *svc)
1273{
1274 if (svc == NULL)
1275 return -EEXIST;
1276
1277 /*
1278 * Unhash it from the service table
1279 */
1280 write_lock_bh(&__ip_vs_svc_lock);
1281
1282 ip_vs_svc_unhash(svc);
1283
1284 /*
1285 * Wait until all the svc users go away.
1286 */
1287 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1288
1289 __ip_vs_del_service(svc);
1290
1291 write_unlock_bh(&__ip_vs_svc_lock);
1292
1293 return 0;
1294}
1295
1296
1297/*
1298 * Flush all the virtual services
1299 */
1300static int ip_vs_flush(void)
1301{
1302 int idx;
1303 struct ip_vs_service *svc, *nxt;
1304
1305 /*
1306 * Flush the service table hashed by <protocol,addr,port>
1307 */
1308 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1309 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1310 write_lock_bh(&__ip_vs_svc_lock);
1311 ip_vs_svc_unhash(svc);
1312 /*
1313 * Wait until all the svc users go away.
1314 */
1315 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1316 __ip_vs_del_service(svc);
1317 write_unlock_bh(&__ip_vs_svc_lock);
1318 }
1319 }
1320
1321 /*
1322 * Flush the service table hashed by fwmark
1323 */
1324 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1325 list_for_each_entry_safe(svc, nxt,
1326 &ip_vs_svc_fwm_table[idx], f_list) {
1327 write_lock_bh(&__ip_vs_svc_lock);
1328 ip_vs_svc_unhash(svc);
1329 /*
1330 * Wait until all the svc users go away.
1331 */
1332 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1333 __ip_vs_del_service(svc);
1334 write_unlock_bh(&__ip_vs_svc_lock);
1335 }
1336 }
1337
1338 return 0;
1339}
1340
1341
1342/*
1343 * Zero counters in a service or all services
1344 */
1345static int ip_vs_zero_service(struct ip_vs_service *svc)
1346{
1347 struct ip_vs_dest *dest;
1348
1349 write_lock_bh(&__ip_vs_svc_lock);
1350 list_for_each_entry(dest, &svc->destinations, n_list) {
1351 ip_vs_zero_stats(&dest->stats);
1352 }
1353 ip_vs_zero_stats(&svc->stats);
1354 write_unlock_bh(&__ip_vs_svc_lock);
1355 return 0;
1356}
1357
1358static int ip_vs_zero_all(void)
1359{
1360 int idx;
1361 struct ip_vs_service *svc;
1362
1363 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1364 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1365 ip_vs_zero_service(svc);
1366 }
1367 }
1368
1369 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1370 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1371 ip_vs_zero_service(svc);
1372 }
1373 }
1374
1375 ip_vs_zero_stats(&ip_vs_stats);
1376 return 0;
1377}
1378
1379
1380static int
1381proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1382 void __user *buffer, size_t *lenp, loff_t *ppos)
1383{
1384 int *valp = table->data;
1385 int val = *valp;
1386 int rc;
1387
1388 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1389 if (write && (*valp != val)) {
1390 if ((*valp < 0) || (*valp > 3)) {
1391 /* Restore the correct value */
1392 *valp = val;
1393 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001394 update_defense_level();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001395 }
1396 }
1397 return rc;
1398}
1399
1400
1401static int
1402proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1403 void __user *buffer, size_t *lenp, loff_t *ppos)
1404{
1405 int *valp = table->data;
1406 int val[2];
1407 int rc;
1408
1409 /* backup the value first */
1410 memcpy(val, valp, sizeof(val));
1411
1412 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1413 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1414 /* Restore the correct value */
1415 memcpy(valp, val, sizeof(val));
1416 }
1417 return rc;
1418}
1419
1420
1421/*
1422 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1423 */
1424
1425static struct ctl_table vs_vars[] = {
1426 {
1427 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1428 .procname = "amemthresh",
1429 .data = &sysctl_ip_vs_amemthresh,
1430 .maxlen = sizeof(int),
1431 .mode = 0644,
1432 .proc_handler = &proc_dointvec,
1433 },
1434#ifdef CONFIG_IP_VS_DEBUG
1435 {
1436 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1437 .procname = "debug_level",
1438 .data = &sysctl_ip_vs_debug_level,
1439 .maxlen = sizeof(int),
1440 .mode = 0644,
1441 .proc_handler = &proc_dointvec,
1442 },
1443#endif
1444 {
1445 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1446 .procname = "am_droprate",
1447 .data = &sysctl_ip_vs_am_droprate,
1448 .maxlen = sizeof(int),
1449 .mode = 0644,
1450 .proc_handler = &proc_dointvec,
1451 },
1452 {
1453 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1454 .procname = "drop_entry",
1455 .data = &sysctl_ip_vs_drop_entry,
1456 .maxlen = sizeof(int),
1457 .mode = 0644,
1458 .proc_handler = &proc_do_defense_mode,
1459 },
1460 {
1461 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1462 .procname = "drop_packet",
1463 .data = &sysctl_ip_vs_drop_packet,
1464 .maxlen = sizeof(int),
1465 .mode = 0644,
1466 .proc_handler = &proc_do_defense_mode,
1467 },
1468 {
1469 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1470 .procname = "secure_tcp",
1471 .data = &sysctl_ip_vs_secure_tcp,
1472 .maxlen = sizeof(int),
1473 .mode = 0644,
1474 .proc_handler = &proc_do_defense_mode,
1475 },
1476#if 0
1477 {
1478 .ctl_name = NET_IPV4_VS_TO_ES,
1479 .procname = "timeout_established",
1480 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1481 .maxlen = sizeof(int),
1482 .mode = 0644,
1483 .proc_handler = &proc_dointvec_jiffies,
1484 },
1485 {
1486 .ctl_name = NET_IPV4_VS_TO_SS,
1487 .procname = "timeout_synsent",
1488 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1489 .maxlen = sizeof(int),
1490 .mode = 0644,
1491 .proc_handler = &proc_dointvec_jiffies,
1492 },
1493 {
1494 .ctl_name = NET_IPV4_VS_TO_SR,
1495 .procname = "timeout_synrecv",
1496 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1497 .maxlen = sizeof(int),
1498 .mode = 0644,
1499 .proc_handler = &proc_dointvec_jiffies,
1500 },
1501 {
1502 .ctl_name = NET_IPV4_VS_TO_FW,
1503 .procname = "timeout_finwait",
1504 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1505 .maxlen = sizeof(int),
1506 .mode = 0644,
1507 .proc_handler = &proc_dointvec_jiffies,
1508 },
1509 {
1510 .ctl_name = NET_IPV4_VS_TO_TW,
1511 .procname = "timeout_timewait",
1512 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1513 .maxlen = sizeof(int),
1514 .mode = 0644,
1515 .proc_handler = &proc_dointvec_jiffies,
1516 },
1517 {
1518 .ctl_name = NET_IPV4_VS_TO_CL,
1519 .procname = "timeout_close",
1520 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1521 .maxlen = sizeof(int),
1522 .mode = 0644,
1523 .proc_handler = &proc_dointvec_jiffies,
1524 },
1525 {
1526 .ctl_name = NET_IPV4_VS_TO_CW,
1527 .procname = "timeout_closewait",
1528 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1529 .maxlen = sizeof(int),
1530 .mode = 0644,
1531 .proc_handler = &proc_dointvec_jiffies,
1532 },
1533 {
1534 .ctl_name = NET_IPV4_VS_TO_LA,
1535 .procname = "timeout_lastack",
1536 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1537 .maxlen = sizeof(int),
1538 .mode = 0644,
1539 .proc_handler = &proc_dointvec_jiffies,
1540 },
1541 {
1542 .ctl_name = NET_IPV4_VS_TO_LI,
1543 .procname = "timeout_listen",
1544 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1545 .maxlen = sizeof(int),
1546 .mode = 0644,
1547 .proc_handler = &proc_dointvec_jiffies,
1548 },
1549 {
1550 .ctl_name = NET_IPV4_VS_TO_SA,
1551 .procname = "timeout_synack",
1552 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1553 .maxlen = sizeof(int),
1554 .mode = 0644,
1555 .proc_handler = &proc_dointvec_jiffies,
1556 },
1557 {
1558 .ctl_name = NET_IPV4_VS_TO_UDP,
1559 .procname = "timeout_udp",
1560 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1561 .maxlen = sizeof(int),
1562 .mode = 0644,
1563 .proc_handler = &proc_dointvec_jiffies,
1564 },
1565 {
1566 .ctl_name = NET_IPV4_VS_TO_ICMP,
1567 .procname = "timeout_icmp",
1568 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1569 .maxlen = sizeof(int),
1570 .mode = 0644,
1571 .proc_handler = &proc_dointvec_jiffies,
1572 },
1573#endif
1574 {
1575 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1576 .procname = "cache_bypass",
1577 .data = &sysctl_ip_vs_cache_bypass,
1578 .maxlen = sizeof(int),
1579 .mode = 0644,
1580 .proc_handler = &proc_dointvec,
1581 },
1582 {
1583 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1584 .procname = "expire_nodest_conn",
1585 .data = &sysctl_ip_vs_expire_nodest_conn,
1586 .maxlen = sizeof(int),
1587 .mode = 0644,
1588 .proc_handler = &proc_dointvec,
1589 },
1590 {
1591 .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1592 .procname = "expire_quiescent_template",
1593 .data = &sysctl_ip_vs_expire_quiescent_template,
1594 .maxlen = sizeof(int),
1595 .mode = 0644,
1596 .proc_handler = &proc_dointvec,
1597 },
1598 {
1599 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1600 .procname = "sync_threshold",
1601 .data = &sysctl_ip_vs_sync_threshold,
1602 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1603 .mode = 0644,
1604 .proc_handler = &proc_do_sync_threshold,
1605 },
1606 {
1607 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1608 .procname = "nat_icmp_send",
1609 .data = &sysctl_ip_vs_nat_icmp_send,
1610 .maxlen = sizeof(int),
1611 .mode = 0644,
1612 .proc_handler = &proc_dointvec,
1613 },
1614 { .ctl_name = 0 }
1615};
1616
1617static ctl_table vs_table[] = {
1618 {
1619 .ctl_name = NET_IPV4_VS,
1620 .procname = "vs",
1621 .mode = 0555,
1622 .child = vs_vars
1623 },
1624 { .ctl_name = 0 }
1625};
1626
David S. Millerbf0ff9e2005-08-19 16:37:30 -07001627static ctl_table ipvs_ipv4_table[] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001628 {
1629 .ctl_name = NET_IPV4,
1630 .procname = "ipv4",
1631 .mode = 0555,
1632 .child = vs_table,
1633 },
1634 { .ctl_name = 0 }
1635};
1636
1637static ctl_table vs_root_table[] = {
1638 {
1639 .ctl_name = CTL_NET,
1640 .procname = "net",
1641 .mode = 0555,
David S. Millerbf0ff9e2005-08-19 16:37:30 -07001642 .child = ipvs_ipv4_table,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001643 },
1644 { .ctl_name = 0 }
1645};
1646
1647static struct ctl_table_header * sysctl_header;
1648
1649#ifdef CONFIG_PROC_FS
1650
/*
 *	Private seq_file cursor for the service listing: which of the two
 *	service hash tables is being walked and the current bucket in it.
 */
struct ip_vs_iter {
	struct list_head *table;	/* ip_vs_svc_table or ip_vs_svc_fwm_table */
	int bucket;			/* index of the bucket being walked */
};
1655
1656/*
1657 * Write the contents of the VS rule table to a PROCfs file.
1658 * (It is kept just for backward compatibility)
1659 */
1660static inline const char *ip_vs_fwd_name(unsigned flags)
1661{
1662 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1663 case IP_VS_CONN_F_LOCALNODE:
1664 return "Local";
1665 case IP_VS_CONN_F_TUNNEL:
1666 return "Tunnel";
1667 case IP_VS_CONN_F_DROUTE:
1668 return "Route";
1669 default:
1670 return "Masq";
1671 }
1672}
1673
1674
1675/* Get the Nth entry in the two lists */
1676static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1677{
1678 struct ip_vs_iter *iter = seq->private;
1679 int idx;
1680 struct ip_vs_service *svc;
1681
1682 /* look in hash by protocol */
1683 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1684 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1685 if (pos-- == 0){
1686 iter->table = ip_vs_svc_table;
1687 iter->bucket = idx;
1688 return svc;
1689 }
1690 }
1691 }
1692
1693 /* keep looking in fwmark */
1694 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1695 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1696 if (pos-- == 0) {
1697 iter->table = ip_vs_svc_fwm_table;
1698 iter->bucket = idx;
1699 return svc;
1700 }
1701 }
1702 }
1703
1704 return NULL;
1705}
1706
1707static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1708{
1709
1710 read_lock_bh(&__ip_vs_svc_lock);
1711 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1712}
1713
1714
1715static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1716{
1717 struct list_head *e;
1718 struct ip_vs_iter *iter;
1719 struct ip_vs_service *svc;
1720
1721 ++*pos;
1722 if (v == SEQ_START_TOKEN)
1723 return ip_vs_info_array(seq,0);
1724
1725 svc = v;
1726 iter = seq->private;
1727
1728 if (iter->table == ip_vs_svc_table) {
1729 /* next service in table hashed by protocol */
1730 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1731 return list_entry(e, struct ip_vs_service, s_list);
1732
1733
1734 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1735 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1736 s_list) {
1737 return svc;
1738 }
1739 }
1740
1741 iter->table = ip_vs_svc_fwm_table;
1742 iter->bucket = -1;
1743 goto scan_fwmark;
1744 }
1745
1746 /* next service in hashed by fwmark */
1747 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1748 return list_entry(e, struct ip_vs_service, f_list);
1749
1750 scan_fwmark:
1751 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1752 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1753 f_list)
1754 return svc;
1755 }
1756
1757 return NULL;
1758}
1759
1760static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1761{
1762 read_unlock_bh(&__ip_vs_svc_lock);
1763}
1764
1765
1766static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1767{
1768 if (v == SEQ_START_TOKEN) {
1769 seq_printf(seq,
1770 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1771 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1772 seq_puts(seq,
1773 "Prot LocalAddress:Port Scheduler Flags\n");
1774 seq_puts(seq,
1775 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1776 } else {
1777 const struct ip_vs_service *svc = v;
1778 const struct ip_vs_iter *iter = seq->private;
1779 const struct ip_vs_dest *dest;
1780
1781 if (iter->table == ip_vs_svc_table)
1782 seq_printf(seq, "%s %08X:%04X %s ",
1783 ip_vs_proto_name(svc->protocol),
1784 ntohl(svc->addr),
1785 ntohs(svc->port),
1786 svc->scheduler->name);
1787 else
1788 seq_printf(seq, "FWM %08X %s ",
1789 svc->fwmark, svc->scheduler->name);
1790
1791 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1792 seq_printf(seq, "persistent %d %08X\n",
1793 svc->timeout,
1794 ntohl(svc->netmask));
1795 else
1796 seq_putc(seq, '\n');
1797
1798 list_for_each_entry(dest, &svc->destinations, n_list) {
1799 seq_printf(seq,
1800 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1801 ntohl(dest->addr), ntohs(dest->port),
1802 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1803 atomic_read(&dest->weight),
1804 atomic_read(&dest->activeconns),
1805 atomic_read(&dest->inactconns));
1806 }
1807 }
1808 return 0;
1809}
1810
Philippe De Muyter56b3d972007-07-10 23:07:31 -07001811static const struct seq_operations ip_vs_info_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001812 .start = ip_vs_info_seq_start,
1813 .next = ip_vs_info_seq_next,
1814 .stop = ip_vs_info_seq_stop,
1815 .show = ip_vs_info_seq_show,
1816};
1817
1818static int ip_vs_info_open(struct inode *inode, struct file *file)
1819{
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -07001820 return seq_open_private(file, &ip_vs_info_seq_ops,
1821 sizeof(struct ip_vs_iter));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822}
1823
Arjan van de Ven9a321442007-02-12 00:55:35 -08001824static const struct file_operations ip_vs_info_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001825 .owner = THIS_MODULE,
1826 .open = ip_vs_info_open,
1827 .read = seq_read,
1828 .llseek = seq_lseek,
1829 .release = seq_release_private,
1830};
1831
1832#endif
1833
1834struct ip_vs_stats ip_vs_stats;
1835
1836#ifdef CONFIG_PROC_FS
1837static int ip_vs_stats_show(struct seq_file *seq, void *v)
1838{
1839
1840/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1841 seq_puts(seq,
1842 " Total Incoming Outgoing Incoming Outgoing\n");
1843 seq_printf(seq,
1844 " Conns Packets Packets Bytes Bytes\n");
1845
1846 spin_lock_bh(&ip_vs_stats.lock);
1847 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1848 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1849 (unsigned long long) ip_vs_stats.inbytes,
1850 (unsigned long long) ip_vs_stats.outbytes);
1851
1852/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1853 seq_puts(seq,
1854 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1855 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1856 ip_vs_stats.cps,
1857 ip_vs_stats.inpps,
1858 ip_vs_stats.outpps,
1859 ip_vs_stats.inbps,
1860 ip_vs_stats.outbps);
1861 spin_unlock_bh(&ip_vs_stats.lock);
1862
1863 return 0;
1864}
1865
1866static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1867{
1868 return single_open(file, ip_vs_stats_show, NULL);
1869}
1870
Arjan van de Ven9a321442007-02-12 00:55:35 -08001871static const struct file_operations ip_vs_stats_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001872 .owner = THIS_MODULE,
1873 .open = ip_vs_stats_seq_open,
1874 .read = seq_read,
1875 .llseek = seq_lseek,
1876 .release = single_release,
1877};
1878
1879#endif
1880
1881/*
1882 * Set timeout values for tcp tcpfin udp in the timeout_table.
1883 */
1884static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1885{
1886 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1887 u->tcp_timeout,
1888 u->tcp_fin_timeout,
1889 u->udp_timeout);
1890
1891#ifdef CONFIG_IP_VS_PROTO_TCP
1892 if (u->tcp_timeout) {
1893 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1894 = u->tcp_timeout * HZ;
1895 }
1896
1897 if (u->tcp_fin_timeout) {
1898 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1899 = u->tcp_fin_timeout * HZ;
1900 }
1901#endif
1902
1903#ifdef CONFIG_IP_VS_PROTO_UDP
1904 if (u->udp_timeout) {
1905 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1906 = u->udp_timeout * HZ;
1907 }
1908#endif
1909 return 0;
1910}
1911
1912
1913#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
1914#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1915#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1916 sizeof(struct ip_vs_dest_user))
1917#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1918#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
1919#define MAX_ARG_LEN SVCDEST_ARG_LEN
1920
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001921static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001922 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1923 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1924 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1925 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1926 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1927 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1928 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1929 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1930 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1931 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1932 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1933};
1934
1935static int
1936do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1937{
1938 int ret;
1939 unsigned char arg[MAX_ARG_LEN];
1940 struct ip_vs_service_user *usvc;
1941 struct ip_vs_service *svc;
1942 struct ip_vs_dest_user *udest;
1943
1944 if (!capable(CAP_NET_ADMIN))
1945 return -EPERM;
1946
1947 if (len != set_arglen[SET_CMDID(cmd)]) {
1948 IP_VS_ERR("set_ctl: len %u != %u\n",
1949 len, set_arglen[SET_CMDID(cmd)]);
1950 return -EINVAL;
1951 }
1952
1953 if (copy_from_user(arg, user, len) != 0)
1954 return -EFAULT;
1955
1956 /* increase the module use count */
1957 ip_vs_use_count_inc();
1958
Ingo Molnar14cc3e22006-03-26 01:37:14 -08001959 if (mutex_lock_interruptible(&__ip_vs_mutex)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001960 ret = -ERESTARTSYS;
1961 goto out_dec;
1962 }
1963
1964 if (cmd == IP_VS_SO_SET_FLUSH) {
1965 /* Flush the virtual service */
1966 ret = ip_vs_flush();
1967 goto out_unlock;
1968 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1969 /* Set timeout values for (tcp tcpfin udp) */
1970 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1971 goto out_unlock;
1972 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1973 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1974 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1975 goto out_unlock;
1976 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1977 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1978 ret = stop_sync_thread(dm->state);
1979 goto out_unlock;
1980 }
1981
1982 usvc = (struct ip_vs_service_user *)arg;
1983 udest = (struct ip_vs_dest_user *)(usvc + 1);
1984
1985 if (cmd == IP_VS_SO_SET_ZERO) {
1986 /* if no service address is set, zero counters in all */
1987 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1988 ret = ip_vs_zero_all();
1989 goto out_unlock;
1990 }
1991 }
1992
1993 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1994 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1995 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1996 usvc->protocol, NIPQUAD(usvc->addr),
1997 ntohs(usvc->port), usvc->sched_name);
1998 ret = -EFAULT;
1999 goto out_unlock;
2000 }
2001
2002 /* Lookup the exact service by <protocol, addr, port> or fwmark */
2003 if (usvc->fwmark == 0)
2004 svc = __ip_vs_service_get(usvc->protocol,
2005 usvc->addr, usvc->port);
2006 else
2007 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
2008
2009 if (cmd != IP_VS_SO_SET_ADD
2010 && (svc == NULL || svc->protocol != usvc->protocol)) {
2011 ret = -ESRCH;
2012 goto out_unlock;
2013 }
2014
2015 switch (cmd) {
2016 case IP_VS_SO_SET_ADD:
2017 if (svc != NULL)
2018 ret = -EEXIST;
2019 else
2020 ret = ip_vs_add_service(usvc, &svc);
2021 break;
2022 case IP_VS_SO_SET_EDIT:
2023 ret = ip_vs_edit_service(svc, usvc);
2024 break;
2025 case IP_VS_SO_SET_DEL:
2026 ret = ip_vs_del_service(svc);
2027 if (!ret)
2028 goto out_unlock;
2029 break;
2030 case IP_VS_SO_SET_ZERO:
2031 ret = ip_vs_zero_service(svc);
2032 break;
2033 case IP_VS_SO_SET_ADDDEST:
2034 ret = ip_vs_add_dest(svc, udest);
2035 break;
2036 case IP_VS_SO_SET_EDITDEST:
2037 ret = ip_vs_edit_dest(svc, udest);
2038 break;
2039 case IP_VS_SO_SET_DELDEST:
2040 ret = ip_vs_del_dest(svc, udest);
2041 break;
2042 default:
2043 ret = -EINVAL;
2044 }
2045
2046 if (svc)
2047 ip_vs_service_put(svc);
2048
2049 out_unlock:
Ingo Molnar14cc3e22006-03-26 01:37:14 -08002050 mutex_unlock(&__ip_vs_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002051 out_dec:
2052 /* decrease the module use count */
2053 ip_vs_use_count_dec();
2054
2055 return ret;
2056}
2057
2058
2059static void
2060ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2061{
2062 spin_lock_bh(&src->lock);
2063 memcpy(dst, src, (char*)&src->lock - (char*)src);
2064 spin_unlock_bh(&src->lock);
2065}
2066
2067static void
2068ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2069{
2070 dst->protocol = src->protocol;
2071 dst->addr = src->addr;
2072 dst->port = src->port;
2073 dst->fwmark = src->fwmark;
pageexec4da62fc2005-06-26 16:00:19 -07002074 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002075 dst->flags = src->flags;
2076 dst->timeout = src->timeout / HZ;
2077 dst->netmask = src->netmask;
2078 dst->num_dests = src->num_dests;
2079 ip_vs_copy_stats(&dst->stats, &src->stats);
2080}
2081
2082static inline int
2083__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2084 struct ip_vs_get_services __user *uptr)
2085{
2086 int idx, count=0;
2087 struct ip_vs_service *svc;
2088 struct ip_vs_service_entry entry;
2089 int ret = 0;
2090
2091 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2092 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2093 if (count >= get->num_services)
2094 goto out;
pageexec4da62fc2005-06-26 16:00:19 -07002095 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002096 ip_vs_copy_service(&entry, svc);
2097 if (copy_to_user(&uptr->entrytable[count],
2098 &entry, sizeof(entry))) {
2099 ret = -EFAULT;
2100 goto out;
2101 }
2102 count++;
2103 }
2104 }
2105
2106 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2107 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2108 if (count >= get->num_services)
2109 goto out;
pageexec4da62fc2005-06-26 16:00:19 -07002110 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002111 ip_vs_copy_service(&entry, svc);
2112 if (copy_to_user(&uptr->entrytable[count],
2113 &entry, sizeof(entry))) {
2114 ret = -EFAULT;
2115 goto out;
2116 }
2117 count++;
2118 }
2119 }
2120 out:
2121 return ret;
2122}
2123
2124static inline int
2125__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2126 struct ip_vs_get_dests __user *uptr)
2127{
2128 struct ip_vs_service *svc;
2129 int ret = 0;
2130
2131 if (get->fwmark)
2132 svc = __ip_vs_svc_fwm_get(get->fwmark);
2133 else
2134 svc = __ip_vs_service_get(get->protocol,
2135 get->addr, get->port);
2136 if (svc) {
2137 int count = 0;
2138 struct ip_vs_dest *dest;
2139 struct ip_vs_dest_entry entry;
2140
2141 list_for_each_entry(dest, &svc->destinations, n_list) {
2142 if (count >= get->num_dests)
2143 break;
2144
2145 entry.addr = dest->addr;
2146 entry.port = dest->port;
2147 entry.conn_flags = atomic_read(&dest->conn_flags);
2148 entry.weight = atomic_read(&dest->weight);
2149 entry.u_threshold = dest->u_threshold;
2150 entry.l_threshold = dest->l_threshold;
2151 entry.activeconns = atomic_read(&dest->activeconns);
2152 entry.inactconns = atomic_read(&dest->inactconns);
2153 entry.persistconns = atomic_read(&dest->persistconns);
2154 ip_vs_copy_stats(&entry.stats, &dest->stats);
2155 if (copy_to_user(&uptr->entrytable[count],
2156 &entry, sizeof(entry))) {
2157 ret = -EFAULT;
2158 break;
2159 }
2160 count++;
2161 }
2162 ip_vs_service_put(svc);
2163 } else
2164 ret = -ESRCH;
2165 return ret;
2166}
2167
2168static inline void
2169__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2170{
2171#ifdef CONFIG_IP_VS_PROTO_TCP
2172 u->tcp_timeout =
2173 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2174 u->tcp_fin_timeout =
2175 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2176#endif
2177#ifdef CONFIG_IP_VS_PROTO_UDP
2178 u->udp_timeout =
2179 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2180#endif
2181}
2182
2183
2184#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2185#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2186#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2187#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2188#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2189#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2190#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2191
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08002192static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002193 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2194 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2195 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2196 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2197 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2198 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2199 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2200};
2201
2202static int
2203do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2204{
2205 unsigned char arg[128];
2206 int ret = 0;
2207
2208 if (!capable(CAP_NET_ADMIN))
2209 return -EPERM;
2210
2211 if (*len < get_arglen[GET_CMDID(cmd)]) {
2212 IP_VS_ERR("get_ctl: len %u < %u\n",
2213 *len, get_arglen[GET_CMDID(cmd)]);
2214 return -EINVAL;
2215 }
2216
2217 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2218 return -EFAULT;
2219
Ingo Molnar14cc3e22006-03-26 01:37:14 -08002220 if (mutex_lock_interruptible(&__ip_vs_mutex))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002221 return -ERESTARTSYS;
2222
2223 switch (cmd) {
2224 case IP_VS_SO_GET_VERSION:
2225 {
2226 char buf[64];
2227
2228 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2229 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2230 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2231 ret = -EFAULT;
2232 goto out;
2233 }
2234 *len = strlen(buf)+1;
2235 }
2236 break;
2237
2238 case IP_VS_SO_GET_INFO:
2239 {
2240 struct ip_vs_getinfo info;
2241 info.version = IP_VS_VERSION_CODE;
2242 info.size = IP_VS_CONN_TAB_SIZE;
2243 info.num_services = ip_vs_num_services;
2244 if (copy_to_user(user, &info, sizeof(info)) != 0)
2245 ret = -EFAULT;
2246 }
2247 break;
2248
2249 case IP_VS_SO_GET_SERVICES:
2250 {
2251 struct ip_vs_get_services *get;
2252 int size;
2253
2254 get = (struct ip_vs_get_services *)arg;
2255 size = sizeof(*get) +
2256 sizeof(struct ip_vs_service_entry) * get->num_services;
2257 if (*len != size) {
2258 IP_VS_ERR("length: %u != %u\n", *len, size);
2259 ret = -EINVAL;
2260 goto out;
2261 }
2262 ret = __ip_vs_get_service_entries(get, user);
2263 }
2264 break;
2265
2266 case IP_VS_SO_GET_SERVICE:
2267 {
2268 struct ip_vs_service_entry *entry;
2269 struct ip_vs_service *svc;
2270
2271 entry = (struct ip_vs_service_entry *)arg;
2272 if (entry->fwmark)
2273 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2274 else
2275 svc = __ip_vs_service_get(entry->protocol,
2276 entry->addr, entry->port);
2277 if (svc) {
2278 ip_vs_copy_service(entry, svc);
2279 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2280 ret = -EFAULT;
2281 ip_vs_service_put(svc);
2282 } else
2283 ret = -ESRCH;
2284 }
2285 break;
2286
2287 case IP_VS_SO_GET_DESTS:
2288 {
2289 struct ip_vs_get_dests *get;
2290 int size;
2291
2292 get = (struct ip_vs_get_dests *)arg;
2293 size = sizeof(*get) +
2294 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2295 if (*len != size) {
2296 IP_VS_ERR("length: %u != %u\n", *len, size);
2297 ret = -EINVAL;
2298 goto out;
2299 }
2300 ret = __ip_vs_get_dest_entries(get, user);
2301 }
2302 break;
2303
2304 case IP_VS_SO_GET_TIMEOUT:
2305 {
2306 struct ip_vs_timeout_user t;
2307
2308 __ip_vs_get_timeouts(&t);
2309 if (copy_to_user(user, &t, sizeof(t)) != 0)
2310 ret = -EFAULT;
2311 }
2312 break;
2313
2314 case IP_VS_SO_GET_DAEMON:
2315 {
2316 struct ip_vs_daemon_user d[2];
2317
2318 memset(&d, 0, sizeof(d));
2319 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2320 d[0].state = IP_VS_STATE_MASTER;
pageexec4da62fc2005-06-26 16:00:19 -07002321 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002322 d[0].syncid = ip_vs_master_syncid;
2323 }
2324 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2325 d[1].state = IP_VS_STATE_BACKUP;
pageexec4da62fc2005-06-26 16:00:19 -07002326 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002327 d[1].syncid = ip_vs_backup_syncid;
2328 }
2329 if (copy_to_user(user, &d, sizeof(d)) != 0)
2330 ret = -EFAULT;
2331 }
2332 break;
2333
2334 default:
2335 ret = -EINVAL;
2336 }
2337
2338 out:
Ingo Molnar14cc3e22006-03-26 01:37:14 -08002339 mutex_unlock(&__ip_vs_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340 return ret;
2341}
2342
2343
2344static struct nf_sockopt_ops ip_vs_sockopts = {
2345 .pf = PF_INET,
2346 .set_optmin = IP_VS_BASE_CTL,
2347 .set_optmax = IP_VS_SO_SET_MAX+1,
2348 .set = do_ip_vs_set_ctl,
2349 .get_optmin = IP_VS_BASE_CTL,
2350 .get_optmax = IP_VS_SO_GET_MAX+1,
2351 .get = do_ip_vs_get_ctl,
Neil Horman16fcec32007-09-11 11:28:26 +02002352 .owner = THIS_MODULE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002353};
2354
2355
2356int ip_vs_control_init(void)
2357{
2358 int ret;
2359 int idx;
2360
2361 EnterFunction(2);
2362
2363 ret = nf_register_sockopt(&ip_vs_sockopts);
2364 if (ret) {
2365 IP_VS_ERR("cannot register sockopt.\n");
2366 return ret;
2367 }
2368
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02002369 proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
2370 proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371
Eric W. Biederman0b4d4142007-02-14 00:34:09 -08002372 sysctl_header = register_sysctl_table(vs_root_table);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002373
2374 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2375 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2376 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2377 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2378 }
2379 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2380 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2381 }
2382
2383 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2384 spin_lock_init(&ip_vs_stats.lock);
2385 ip_vs_new_estimator(&ip_vs_stats);
2386
2387 /* Hook the defense timer */
2388 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2389
2390 LeaveFunction(2);
2391 return 0;
2392}
2393
2394
/*
 * Tear down the IPVS control interface.
 *
 * Undoes ip_vs_control_init() in the reverse order of its setup steps:
 * the defense work and the statistics estimator are stopped first, then
 * the sysctl table and proc entries are removed, and the sockopt
 * handlers are unregistered last.
 */
void ip_vs_control_cleanup(void)
{
	EnterFunction(2);
	/* NOTE(review): presumably releases destinations parked in the trash -- confirm. */
	ip_vs_trash_cleanup();
	/* Stop the defense work scheduled by ip_vs_control_init(). */
	cancel_rearming_delayed_work(&defense_work);
	cancel_work_sync(&defense_work.work);
	ip_vs_kill_estimator(&ip_vs_stats);
	unregister_sysctl_table(sysctl_header);
	/* Proc entries go away in the opposite order of their creation. */
	proc_net_remove(&init_net, "ip_vs_stats");
	proc_net_remove(&init_net, "ip_vs");
	nf_unregister_sockopt(&ip_vs_sockopts);
	LeaveFunction(2);
}