blob: 6379705a8dcb2a98505af97ee4fa0f23045d61f8 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07008 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * Changes:
18 *
19 */
20
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/types.h>
Randy Dunlap4fc268d2006-01-11 12:17:47 -080024#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070025#include <linux/fs.h>
26#include <linux/sysctl.h>
27#include <linux/proc_fs.h>
28#include <linux/workqueue.h>
29#include <linux/swap.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070030#include <linux/seq_file.h>
31
32#include <linux/netfilter.h>
33#include <linux/netfilter_ipv4.h>
Ingo Molnar14cc3e22006-03-26 01:37:14 -080034#include <linux/mutex.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070035
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020036#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070037#include <net/ip.h>
Arnaldo Carvalho de Melo14c85022005-12-27 02:43:12 -020038#include <net/route.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070039#include <net/sock.h>
40
41#include <asm/uaccess.h>
42
43#include <net/ip_vs.h>
44
45/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
Ingo Molnar14cc3e22006-03-26 01:37:14 -080046static DEFINE_MUTEX(__ip_vs_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -070047
48/* lock for service table */
49static DEFINE_RWLOCK(__ip_vs_svc_lock);
50
51/* lock for table with the real services */
52static DEFINE_RWLOCK(__ip_vs_rs_lock);
53
54/* lock for state and timeout tables */
55static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
56
57/* lock for drop entry handling */
58static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
59
60/* lock for drop packet handling */
61static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
62
63/* 1/rate drop and drop-entry variables */
64int ip_vs_drop_rate = 0;
65int ip_vs_drop_counter = 0;
66static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
67
68/* number of virtual services */
69static int ip_vs_num_services = 0;
70
71/* sysctl variables */
72static int sysctl_ip_vs_drop_entry = 0;
73static int sysctl_ip_vs_drop_packet = 0;
74static int sysctl_ip_vs_secure_tcp = 0;
75static int sysctl_ip_vs_amemthresh = 1024;
76static int sysctl_ip_vs_am_droprate = 10;
77int sysctl_ip_vs_cache_bypass = 0;
78int sysctl_ip_vs_expire_nodest_conn = 0;
79int sysctl_ip_vs_expire_quiescent_template = 0;
80int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
81int sysctl_ip_vs_nat_icmp_send = 0;
82
83
#ifdef CONFIG_IP_VS_DEBUG
/* debug verbosity, only present in CONFIG_IP_VS_DEBUG builds */
static int sysctl_ip_vs_debug_level = 0;

/*
 *	Returns the current value of sysctl_ip_vs_debug_level
 */
int ip_vs_get_debug_level(void)
{
	int level = sysctl_ip_vs_debug_level;

	return level;
}
#endif /* CONFIG_IP_VS_DEBUG */
92
93/*
Julian Anastasovaf9debd2005-07-11 20:59:57 -070094 * update_defense_level is called from keventd and from sysctl,
95 * so it needs to protect itself from softirqs
Linus Torvalds1da177e2005-04-16 15:20:36 -070096 */
97static void update_defense_level(void)
98{
99 struct sysinfo i;
100 static int old_secure_tcp = 0;
101 int availmem;
102 int nomem;
103 int to_change = -1;
104
105 /* we only count free and buffered memory (in pages) */
106 si_meminfo(&i);
107 availmem = i.freeram + i.bufferram;
108 /* however in linux 2.5 the i.bufferram is total page cache size,
109 we need adjust it */
110 /* si_swapinfo(&i); */
111 /* availmem = availmem - (i.totalswap - i.freeswap); */
112
113 nomem = (availmem < sysctl_ip_vs_amemthresh);
114
Julian Anastasovaf9debd2005-07-11 20:59:57 -0700115 local_bh_disable();
116
Linus Torvalds1da177e2005-04-16 15:20:36 -0700117 /* drop_entry */
118 spin_lock(&__ip_vs_dropentry_lock);
119 switch (sysctl_ip_vs_drop_entry) {
120 case 0:
121 atomic_set(&ip_vs_dropentry, 0);
122 break;
123 case 1:
124 if (nomem) {
125 atomic_set(&ip_vs_dropentry, 1);
126 sysctl_ip_vs_drop_entry = 2;
127 } else {
128 atomic_set(&ip_vs_dropentry, 0);
129 }
130 break;
131 case 2:
132 if (nomem) {
133 atomic_set(&ip_vs_dropentry, 1);
134 } else {
135 atomic_set(&ip_vs_dropentry, 0);
136 sysctl_ip_vs_drop_entry = 1;
137 };
138 break;
139 case 3:
140 atomic_set(&ip_vs_dropentry, 1);
141 break;
142 }
143 spin_unlock(&__ip_vs_dropentry_lock);
144
145 /* drop_packet */
146 spin_lock(&__ip_vs_droppacket_lock);
147 switch (sysctl_ip_vs_drop_packet) {
148 case 0:
149 ip_vs_drop_rate = 0;
150 break;
151 case 1:
152 if (nomem) {
153 ip_vs_drop_rate = ip_vs_drop_counter
154 = sysctl_ip_vs_amemthresh /
155 (sysctl_ip_vs_amemthresh-availmem);
156 sysctl_ip_vs_drop_packet = 2;
157 } else {
158 ip_vs_drop_rate = 0;
159 }
160 break;
161 case 2:
162 if (nomem) {
163 ip_vs_drop_rate = ip_vs_drop_counter
164 = sysctl_ip_vs_amemthresh /
165 (sysctl_ip_vs_amemthresh-availmem);
166 } else {
167 ip_vs_drop_rate = 0;
168 sysctl_ip_vs_drop_packet = 1;
169 }
170 break;
171 case 3:
172 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
173 break;
174 }
175 spin_unlock(&__ip_vs_droppacket_lock);
176
177 /* secure_tcp */
178 write_lock(&__ip_vs_securetcp_lock);
179 switch (sysctl_ip_vs_secure_tcp) {
180 case 0:
181 if (old_secure_tcp >= 2)
182 to_change = 0;
183 break;
184 case 1:
185 if (nomem) {
186 if (old_secure_tcp < 2)
187 to_change = 1;
188 sysctl_ip_vs_secure_tcp = 2;
189 } else {
190 if (old_secure_tcp >= 2)
191 to_change = 0;
192 }
193 break;
194 case 2:
195 if (nomem) {
196 if (old_secure_tcp < 2)
197 to_change = 1;
198 } else {
199 if (old_secure_tcp >= 2)
200 to_change = 0;
201 sysctl_ip_vs_secure_tcp = 1;
202 }
203 break;
204 case 3:
205 if (old_secure_tcp < 2)
206 to_change = 1;
207 break;
208 }
209 old_secure_tcp = sysctl_ip_vs_secure_tcp;
210 if (to_change >= 0)
211 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
212 write_unlock(&__ip_vs_securetcp_lock);
Julian Anastasovaf9debd2005-07-11 20:59:57 -0700213
214 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700215}
216
217
218/*
219 * Timer for checking the defense
220 */
221#define DEFENSE_TIMER_PERIOD 1*HZ
David Howellsc4028952006-11-22 14:57:56 +0000222static void defense_work_handler(struct work_struct *work);
223static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700224
David Howellsc4028952006-11-22 14:57:56 +0000225static void defense_work_handler(struct work_struct *work)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700226{
227 update_defense_level();
228 if (atomic_read(&ip_vs_dropentry))
229 ip_vs_random_dropentry();
230
231 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
232}
233
234int
235ip_vs_use_count_inc(void)
236{
237 return try_module_get(THIS_MODULE);
238}
239
240void
241ip_vs_use_count_dec(void)
242{
243 module_put(THIS_MODULE);
244}
245
246
247/*
248 * Hash table: for virtual service lookups
249 */
250#define IP_VS_SVC_TAB_BITS 8
251#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
252#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
253
254/* the service table hashed by <protocol, addr, port> */
255static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
256/* the service table hashed by fwmark */
257static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
258
259/*
260 * Hash table: for real service lookups
261 */
262#define IP_VS_RTAB_BITS 4
263#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
264#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
265
266static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
267
268/*
269 * Trash for destinations
270 */
271static LIST_HEAD(ip_vs_dest_trash);
272
273/*
274 * FTP & NULL virtual service counters
275 */
276static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
277static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
278
279
280/*
281 * Returns hash value for virtual service
282 */
283static __inline__ unsigned
Al Viro014d7302006-09-28 14:29:52 -0700284ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700285{
286 register unsigned porth = ntohs(port);
287
288 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
289 & IP_VS_SVC_TAB_MASK;
290}
291
292/*
293 * Returns hash value of fwmark for virtual service lookup
294 */
295static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
296{
297 return fwmark & IP_VS_SVC_TAB_MASK;
298}
299
300/*
301 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
302 * or in the ip_vs_svc_fwm_table by fwmark.
303 * Should be called with locked tables.
304 */
305static int ip_vs_svc_hash(struct ip_vs_service *svc)
306{
307 unsigned hash;
308
309 if (svc->flags & IP_VS_SVC_F_HASHED) {
310 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
311 "called from %p\n", __builtin_return_address(0));
312 return 0;
313 }
314
315 if (svc->fwmark == 0) {
316 /*
317 * Hash it by <protocol,addr,port> in ip_vs_svc_table
318 */
319 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
320 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
321 } else {
322 /*
323 * Hash it by fwmark in ip_vs_svc_fwm_table
324 */
325 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
326 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
327 }
328
329 svc->flags |= IP_VS_SVC_F_HASHED;
330 /* increase its refcnt because it is referenced by the svc table */
331 atomic_inc(&svc->refcnt);
332 return 1;
333}
334
335
336/*
337 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
338 * Should be called with locked tables.
339 */
340static int ip_vs_svc_unhash(struct ip_vs_service *svc)
341{
342 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
343 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
344 "called from %p\n", __builtin_return_address(0));
345 return 0;
346 }
347
348 if (svc->fwmark == 0) {
349 /* Remove it from the ip_vs_svc_table table */
350 list_del(&svc->s_list);
351 } else {
352 /* Remove it from the ip_vs_svc_fwm_table table */
353 list_del(&svc->f_list);
354 }
355
356 svc->flags &= ~IP_VS_SVC_F_HASHED;
357 atomic_dec(&svc->refcnt);
358 return 1;
359}
360
361
362/*
363 * Get service by {proto,addr,port} in the service table.
364 */
365static __inline__ struct ip_vs_service *
Al Viro014d7302006-09-28 14:29:52 -0700366__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700367{
368 unsigned hash;
369 struct ip_vs_service *svc;
370
371 /* Check for "full" addressed entries */
372 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
373
374 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
375 if ((svc->addr == vaddr)
376 && (svc->port == vport)
377 && (svc->protocol == protocol)) {
378 /* HIT */
379 atomic_inc(&svc->usecnt);
380 return svc;
381 }
382 }
383
384 return NULL;
385}
386
387
388/*
389 * Get service by {fwmark} in the service table.
390 */
391static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
392{
393 unsigned hash;
394 struct ip_vs_service *svc;
395
396 /* Check for fwmark addressed entries */
397 hash = ip_vs_svc_fwm_hashkey(fwmark);
398
399 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400 if (svc->fwmark == fwmark) {
401 /* HIT */
402 atomic_inc(&svc->usecnt);
403 return svc;
404 }
405 }
406
407 return NULL;
408}
409
410struct ip_vs_service *
Al Viro014d7302006-09-28 14:29:52 -0700411ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700412{
413 struct ip_vs_service *svc;
414
415 read_lock(&__ip_vs_svc_lock);
416
417 /*
418 * Check the table hashed by fwmark first
419 */
420 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
421 goto out;
422
423 /*
424 * Check the table hashed by <protocol,addr,port>
425 * for "full" addressed entries
426 */
427 svc = __ip_vs_service_get(protocol, vaddr, vport);
428
429 if (svc == NULL
430 && protocol == IPPROTO_TCP
431 && atomic_read(&ip_vs_ftpsvc_counter)
432 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
433 /*
434 * Check if ftp service entry exists, the packet
435 * might belong to FTP data connections.
436 */
437 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
438 }
439
440 if (svc == NULL
441 && atomic_read(&ip_vs_nullsvc_counter)) {
442 /*
443 * Check if the catch-all port (port zero) exists
444 */
445 svc = __ip_vs_service_get(protocol, vaddr, 0);
446 }
447
448 out:
449 read_unlock(&__ip_vs_svc_lock);
450
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800451 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700452 fwmark, ip_vs_proto_name(protocol),
453 NIPQUAD(vaddr), ntohs(vport),
454 svc?"hit":"not hit");
455
456 return svc;
457}
458
459
460static inline void
461__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
462{
463 atomic_inc(&svc->refcnt);
464 dest->svc = svc;
465}
466
467static inline void
468__ip_vs_unbind_svc(struct ip_vs_dest *dest)
469{
470 struct ip_vs_service *svc = dest->svc;
471
472 dest->svc = NULL;
473 if (atomic_dec_and_test(&svc->refcnt))
474 kfree(svc);
475}
476
477
478/*
479 * Returns hash value for real service
480 */
Al Viro014d7302006-09-28 14:29:52 -0700481static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700482{
483 register unsigned porth = ntohs(port);
484
485 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
486 & IP_VS_RTAB_MASK;
487}
488
489/*
490 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
491 * should be called with locked tables.
492 */
493static int ip_vs_rs_hash(struct ip_vs_dest *dest)
494{
495 unsigned hash;
496
497 if (!list_empty(&dest->d_list)) {
498 return 0;
499 }
500
501 /*
502 * Hash by proto,addr,port,
503 * which are the parameters of the real service.
504 */
505 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
506 list_add(&dest->d_list, &ip_vs_rtable[hash]);
507
508 return 1;
509}
510
511/*
512 * UNhashes ip_vs_dest from ip_vs_rtable.
513 * should be called with locked tables.
514 */
515static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
516{
517 /*
518 * Remove it from the ip_vs_rtable table.
519 */
520 if (!list_empty(&dest->d_list)) {
521 list_del(&dest->d_list);
522 INIT_LIST_HEAD(&dest->d_list);
523 }
524
525 return 1;
526}
527
528/*
529 * Lookup real service by <proto,addr,port> in the real service table.
530 */
531struct ip_vs_dest *
Al Viro014d7302006-09-28 14:29:52 -0700532ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700533{
534 unsigned hash;
535 struct ip_vs_dest *dest;
536
537 /*
538 * Check for "full" addressed entries
539 * Return the first found entry
540 */
541 hash = ip_vs_rs_hashkey(daddr, dport);
542
543 read_lock(&__ip_vs_rs_lock);
544 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
545 if ((dest->addr == daddr)
546 && (dest->port == dport)
547 && ((dest->protocol == protocol) ||
548 dest->vfwmark)) {
549 /* HIT */
550 read_unlock(&__ip_vs_rs_lock);
551 return dest;
552 }
553 }
554 read_unlock(&__ip_vs_rs_lock);
555
556 return NULL;
557}
558
559/*
560 * Lookup destination by {addr,port} in the given service
561 */
562static struct ip_vs_dest *
Al Viro014d7302006-09-28 14:29:52 -0700563ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700564{
565 struct ip_vs_dest *dest;
566
567 /*
568 * Find the destination for the given service
569 */
570 list_for_each_entry(dest, &svc->destinations, n_list) {
571 if ((dest->addr == daddr) && (dest->port == dport)) {
572 /* HIT */
573 return dest;
574 }
575 }
576
577 return NULL;
578}
579
Rumen G. Bogdanovski1e356f92007-11-07 02:35:54 -0800580/*
581 * Find destination by {daddr,dport,vaddr,protocol}
582 * Cretaed to be used in ip_vs_process_message() in
583 * the backup synchronization daemon. It finds the
584 * destination to be bound to the received connection
585 * on the backup.
586 *
587 * ip_vs_lookup_real_service() looked promissing, but
588 * seems not working as expected.
589 */
590struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
591 __be32 vaddr, __be16 vport, __u16 protocol)
592{
593 struct ip_vs_dest *dest;
594 struct ip_vs_service *svc;
595
596 svc = ip_vs_service_get(0, protocol, vaddr, vport);
597 if (!svc)
598 return NULL;
599 dest = ip_vs_lookup_dest(svc, daddr, dport);
600 if (dest)
601 atomic_inc(&dest->refcnt);
602 ip_vs_service_put(svc);
603 return dest;
604}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700605
606/*
607 * Lookup dest by {svc,addr,port} in the destination trash.
608 * The destination trash is used to hold the destinations that are removed
609 * from the service table but are still referenced by some conn entries.
610 * The reason to add the destination trash is when the dest is temporary
611 * down (either by administrator or by monitor program), the dest can be
612 * picked back from the trash, the remaining connections to the dest can
613 * continue, and the counting information of the dest is also useful for
614 * scheduling.
615 */
616static struct ip_vs_dest *
Al Viro014d7302006-09-28 14:29:52 -0700617ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700618{
619 struct ip_vs_dest *dest, *nxt;
620
621 /*
622 * Find the destination in trash
623 */
624 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
625 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800626 "dest->refcnt=%d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700627 dest->vfwmark,
628 NIPQUAD(dest->addr), ntohs(dest->port),
629 atomic_read(&dest->refcnt));
630 if (dest->addr == daddr &&
631 dest->port == dport &&
632 dest->vfwmark == svc->fwmark &&
633 dest->protocol == svc->protocol &&
634 (svc->fwmark ||
635 (dest->vaddr == svc->addr &&
636 dest->vport == svc->port))) {
637 /* HIT */
638 return dest;
639 }
640
641 /*
642 * Try to purge the destination from trash if not referenced
643 */
644 if (atomic_read(&dest->refcnt) == 1) {
645 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
646 "from trash\n",
647 dest->vfwmark,
648 NIPQUAD(dest->addr), ntohs(dest->port));
649 list_del(&dest->n_list);
650 ip_vs_dst_reset(dest);
651 __ip_vs_unbind_svc(dest);
652 kfree(dest);
653 }
654 }
655
656 return NULL;
657}
658
659
660/*
661 * Clean up all the destinations in the trash
662 * Called by the ip_vs_control_cleanup()
663 *
664 * When the ip_vs_control_clearup is activated by ipvs module exit,
665 * the service tables must have been flushed and all the connections
666 * are expired, and the refcnt of each destination in the trash must
667 * be 1, so we simply release them here.
668 */
669static void ip_vs_trash_cleanup(void)
670{
671 struct ip_vs_dest *dest, *nxt;
672
673 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
674 list_del(&dest->n_list);
675 ip_vs_dst_reset(dest);
676 __ip_vs_unbind_svc(dest);
677 kfree(dest);
678 }
679}
680
681
682static void
683ip_vs_zero_stats(struct ip_vs_stats *stats)
684{
685 spin_lock_bh(&stats->lock);
Simon Hormane93615d2008-08-11 17:19:14 +1000686
687 stats->conns = 0;
688 stats->inpkts = 0;
689 stats->outpkts = 0;
690 stats->inbytes = 0;
691 stats->outbytes = 0;
692
693 stats->cps = 0;
694 stats->inpps = 0;
695 stats->outpps = 0;
696 stats->inbps = 0;
697 stats->outbps = 0;
698
Linus Torvalds1da177e2005-04-16 15:20:36 -0700699 ip_vs_zero_estimator(stats);
Simon Hormane93615d2008-08-11 17:19:14 +1000700
Sven Wegener3a14a312008-08-10 18:24:41 +0000701 spin_unlock_bh(&stats->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700702}
703
704/*
705 * Update a destination in the given service
706 */
707static void
708__ip_vs_update_dest(struct ip_vs_service *svc,
709 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
710{
711 int conn_flags;
712
713 /* set the weight and the flags */
714 atomic_set(&dest->weight, udest->weight);
715 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
716
717 /* check if local node and update the flags */
Eric W. Biederman6b175b22008-01-10 03:25:28 -0800718 if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700719 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
720 | IP_VS_CONN_F_LOCALNODE;
721 }
722
723 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
724 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
725 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
726 } else {
727 /*
728 * Put the real service in ip_vs_rtable if not present.
729 * For now only for NAT!
730 */
731 write_lock_bh(&__ip_vs_rs_lock);
732 ip_vs_rs_hash(dest);
733 write_unlock_bh(&__ip_vs_rs_lock);
734 }
735 atomic_set(&dest->conn_flags, conn_flags);
736
737 /* bind the service */
738 if (!dest->svc) {
739 __ip_vs_bind_svc(dest, svc);
740 } else {
741 if (dest->svc != svc) {
742 __ip_vs_unbind_svc(dest);
743 ip_vs_zero_stats(&dest->stats);
744 __ip_vs_bind_svc(dest, svc);
745 }
746 }
747
748 /* set the dest status flags */
749 dest->flags |= IP_VS_DEST_F_AVAILABLE;
750
751 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
752 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
753 dest->u_threshold = udest->u_threshold;
754 dest->l_threshold = udest->l_threshold;
755}
756
757
758/*
759 * Create a destination for the given service
760 */
761static int
762ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
763 struct ip_vs_dest **dest_p)
764{
765 struct ip_vs_dest *dest;
766 unsigned atype;
767
768 EnterFunction(2);
769
Eric W. Biederman6b175b22008-01-10 03:25:28 -0800770 atype = inet_addr_type(&init_net, udest->addr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700771 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
772 return -EINVAL;
773
Panagiotis Issaris0da974f2006-07-21 14:51:30 -0700774 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700775 if (dest == NULL) {
776 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
777 return -ENOMEM;
778 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700779
780 dest->protocol = svc->protocol;
781 dest->vaddr = svc->addr;
782 dest->vport = svc->port;
783 dest->vfwmark = svc->fwmark;
784 dest->addr = udest->addr;
785 dest->port = udest->port;
786
787 atomic_set(&dest->activeconns, 0);
788 atomic_set(&dest->inactconns, 0);
789 atomic_set(&dest->persistconns, 0);
790 atomic_set(&dest->refcnt, 0);
791
792 INIT_LIST_HEAD(&dest->d_list);
793 spin_lock_init(&dest->dst_lock);
794 spin_lock_init(&dest->stats.lock);
795 __ip_vs_update_dest(svc, dest, udest);
796 ip_vs_new_estimator(&dest->stats);
797
798 *dest_p = dest;
799
800 LeaveFunction(2);
801 return 0;
802}
803
804
805/*
806 * Add a destination into an existing service
807 */
808static int
809ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
810{
811 struct ip_vs_dest *dest;
Al Viro014d7302006-09-28 14:29:52 -0700812 __be32 daddr = udest->addr;
813 __be16 dport = udest->port;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700814 int ret;
815
816 EnterFunction(2);
817
818 if (udest->weight < 0) {
819 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
820 return -ERANGE;
821 }
822
823 if (udest->l_threshold > udest->u_threshold) {
824 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
825 "upper threshold\n");
826 return -ERANGE;
827 }
828
829 /*
830 * Check if the dest already exists in the list
831 */
832 dest = ip_vs_lookup_dest(svc, daddr, dport);
833 if (dest != NULL) {
834 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
835 return -EEXIST;
836 }
837
838 /*
839 * Check if the dest already exists in the trash and
840 * is from the same service
841 */
842 dest = ip_vs_trash_get_dest(svc, daddr, dport);
843 if (dest != NULL) {
844 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800845 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700846 NIPQUAD(daddr), ntohs(dport),
847 atomic_read(&dest->refcnt),
848 dest->vfwmark,
849 NIPQUAD(dest->vaddr),
850 ntohs(dest->vport));
851 __ip_vs_update_dest(svc, dest, udest);
852
853 /*
854 * Get the destination from the trash
855 */
856 list_del(&dest->n_list);
857
858 ip_vs_new_estimator(&dest->stats);
859
860 write_lock_bh(&__ip_vs_svc_lock);
861
862 /*
863 * Wait until all other svc users go away.
864 */
865 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
866
867 list_add(&dest->n_list, &svc->destinations);
868 svc->num_dests++;
869
870 /* call the update_service function of its scheduler */
871 svc->scheduler->update_service(svc);
872
873 write_unlock_bh(&__ip_vs_svc_lock);
874 return 0;
875 }
876
877 /*
878 * Allocate and initialize the dest structure
879 */
880 ret = ip_vs_new_dest(svc, udest, &dest);
881 if (ret) {
882 return ret;
883 }
884
885 /*
886 * Add the dest entry into the list
887 */
888 atomic_inc(&dest->refcnt);
889
890 write_lock_bh(&__ip_vs_svc_lock);
891
892 /*
893 * Wait until all other svc users go away.
894 */
895 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
896
897 list_add(&dest->n_list, &svc->destinations);
898 svc->num_dests++;
899
900 /* call the update_service function of its scheduler */
901 svc->scheduler->update_service(svc);
902
903 write_unlock_bh(&__ip_vs_svc_lock);
904
905 LeaveFunction(2);
906
907 return 0;
908}
909
910
911/*
912 * Edit a destination in the given service
913 */
914static int
915ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
916{
917 struct ip_vs_dest *dest;
Al Viro014d7302006-09-28 14:29:52 -0700918 __be32 daddr = udest->addr;
919 __be16 dport = udest->port;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700920
921 EnterFunction(2);
922
923 if (udest->weight < 0) {
924 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
925 return -ERANGE;
926 }
927
928 if (udest->l_threshold > udest->u_threshold) {
929 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
930 "upper threshold\n");
931 return -ERANGE;
932 }
933
934 /*
935 * Lookup the destination list
936 */
937 dest = ip_vs_lookup_dest(svc, daddr, dport);
938 if (dest == NULL) {
939 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
940 return -ENOENT;
941 }
942
943 __ip_vs_update_dest(svc, dest, udest);
944
945 write_lock_bh(&__ip_vs_svc_lock);
946
947 /* Wait until all other svc users go away */
Heiko Carstenscae7ca32007-08-10 15:50:30 -0700948 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700949
950 /* call the update_service, because server weight may be changed */
951 svc->scheduler->update_service(svc);
952
953 write_unlock_bh(&__ip_vs_svc_lock);
954
955 LeaveFunction(2);
956
957 return 0;
958}
959
960
961/*
962 * Delete a destination (must be already unlinked from the service)
963 */
964static void __ip_vs_del_dest(struct ip_vs_dest *dest)
965{
966 ip_vs_kill_estimator(&dest->stats);
967
968 /*
969 * Remove it from the d-linked list with the real services.
970 */
971 write_lock_bh(&__ip_vs_rs_lock);
972 ip_vs_rs_unhash(dest);
973 write_unlock_bh(&__ip_vs_rs_lock);
974
975 /*
976 * Decrease the refcnt of the dest, and free the dest
977 * if nobody refers to it (refcnt=0). Otherwise, throw
978 * the destination into the trash.
979 */
980 if (atomic_dec_and_test(&dest->refcnt)) {
981 ip_vs_dst_reset(dest);
982 /* simply decrease svc->refcnt here, let the caller check
983 and release the service if nobody refers to it.
984 Only user context can release destination and service,
985 and only one user context can update virtual service at a
986 time, so the operation here is OK */
987 atomic_dec(&dest->svc->refcnt);
988 kfree(dest);
989 } else {
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800990 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
991 "dest->refcnt=%d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700992 NIPQUAD(dest->addr), ntohs(dest->port),
993 atomic_read(&dest->refcnt));
994 list_add(&dest->n_list, &ip_vs_dest_trash);
995 atomic_inc(&dest->refcnt);
996 }
997}
998
999
1000/*
1001 * Unlink a destination from the given service
1002 */
1003static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1004 struct ip_vs_dest *dest,
1005 int svcupd)
1006{
1007 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1008
1009 /*
1010 * Remove it from the d-linked destination list.
1011 */
1012 list_del(&dest->n_list);
1013 svc->num_dests--;
1014 if (svcupd) {
1015 /*
1016 * Call the update_service function of its scheduler
1017 */
1018 svc->scheduler->update_service(svc);
1019 }
1020}
1021
1022
1023/*
1024 * Delete a destination server in the given service
1025 */
1026static int
1027ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
1028{
1029 struct ip_vs_dest *dest;
Al Viro014d7302006-09-28 14:29:52 -07001030 __be32 daddr = udest->addr;
1031 __be16 dport = udest->port;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001032
1033 EnterFunction(2);
1034
1035 dest = ip_vs_lookup_dest(svc, daddr, dport);
1036 if (dest == NULL) {
1037 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1038 return -ENOENT;
1039 }
1040
1041 write_lock_bh(&__ip_vs_svc_lock);
1042
1043 /*
1044 * Wait until all other svc users go away.
1045 */
1046 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1047
1048 /*
1049 * Unlink dest from the service
1050 */
1051 __ip_vs_unlink_dest(svc, dest, 1);
1052
1053 write_unlock_bh(&__ip_vs_svc_lock);
1054
1055 /*
1056 * Delete the destination
1057 */
1058 __ip_vs_del_dest(dest);
1059
1060 LeaveFunction(2);
1061
1062 return 0;
1063}
1064
1065
1066/*
1067 * Add a service into the service hash table
1068 */
1069static int
1070ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1071{
1072 int ret = 0;
1073 struct ip_vs_scheduler *sched = NULL;
1074 struct ip_vs_service *svc = NULL;
1075
1076 /* increase the module use count */
1077 ip_vs_use_count_inc();
1078
1079 /* Lookup the scheduler by 'u->sched_name' */
1080 sched = ip_vs_scheduler_get(u->sched_name);
1081 if (sched == NULL) {
1082 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1083 u->sched_name);
1084 ret = -ENOENT;
1085 goto out_mod_dec;
1086 }
1087
Panagiotis Issaris0da974f2006-07-21 14:51:30 -07001088 svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001089 if (svc == NULL) {
1090 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1091 ret = -ENOMEM;
1092 goto out_err;
1093 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001094
1095 /* I'm the first user of the service */
1096 atomic_set(&svc->usecnt, 1);
1097 atomic_set(&svc->refcnt, 0);
1098
1099 svc->protocol = u->protocol;
1100 svc->addr = u->addr;
1101 svc->port = u->port;
1102 svc->fwmark = u->fwmark;
1103 svc->flags = u->flags;
1104 svc->timeout = u->timeout * HZ;
1105 svc->netmask = u->netmask;
1106
1107 INIT_LIST_HEAD(&svc->destinations);
1108 rwlock_init(&svc->sched_lock);
1109 spin_lock_init(&svc->stats.lock);
1110
1111 /* Bind the scheduler */
1112 ret = ip_vs_bind_scheduler(svc, sched);
1113 if (ret)
1114 goto out_err;
1115 sched = NULL;
1116
1117 /* Update the virtual service counters */
1118 if (svc->port == FTPPORT)
1119 atomic_inc(&ip_vs_ftpsvc_counter);
1120 else if (svc->port == 0)
1121 atomic_inc(&ip_vs_nullsvc_counter);
1122
1123 ip_vs_new_estimator(&svc->stats);
1124 ip_vs_num_services++;
1125
1126 /* Hash the service into the service table */
1127 write_lock_bh(&__ip_vs_svc_lock);
1128 ip_vs_svc_hash(svc);
1129 write_unlock_bh(&__ip_vs_svc_lock);
1130
1131 *svc_p = svc;
1132 return 0;
1133
1134 out_err:
1135 if (svc != NULL) {
1136 if (svc->scheduler)
1137 ip_vs_unbind_scheduler(svc);
1138 if (svc->inc) {
1139 local_bh_disable();
1140 ip_vs_app_inc_put(svc->inc);
1141 local_bh_enable();
1142 }
1143 kfree(svc);
1144 }
1145 ip_vs_scheduler_put(sched);
1146
1147 out_mod_dec:
1148 /* decrease the module use count */
1149 ip_vs_use_count_dec();
1150
1151 return ret;
1152}
1153
1154
1155/*
1156 * Edit a service and bind it with a new scheduler
1157 */
1158static int
1159ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1160{
1161 struct ip_vs_scheduler *sched, *old_sched;
1162 int ret = 0;
1163
1164 /*
1165 * Lookup the scheduler, by 'u->sched_name'
1166 */
1167 sched = ip_vs_scheduler_get(u->sched_name);
1168 if (sched == NULL) {
1169 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1170 u->sched_name);
1171 return -ENOENT;
1172 }
1173 old_sched = sched;
1174
1175 write_lock_bh(&__ip_vs_svc_lock);
1176
1177 /*
1178 * Wait until all other svc users go away.
1179 */
1180 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1181
1182 /*
1183 * Set the flags and timeout value
1184 */
1185 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1186 svc->timeout = u->timeout * HZ;
1187 svc->netmask = u->netmask;
1188
1189 old_sched = svc->scheduler;
1190 if (sched != old_sched) {
1191 /*
1192 * Unbind the old scheduler
1193 */
1194 if ((ret = ip_vs_unbind_scheduler(svc))) {
1195 old_sched = sched;
1196 goto out;
1197 }
1198
1199 /*
1200 * Bind the new scheduler
1201 */
1202 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1203 /*
1204 * If ip_vs_bind_scheduler fails, restore the old
1205 * scheduler.
1206 * The main reason of failure is out of memory.
1207 *
1208 * The question is if the old scheduler can be
1209 * restored all the time. TODO: if it cannot be
1210 * restored some time, we must delete the service,
1211 * otherwise the system may crash.
1212 */
1213 ip_vs_bind_scheduler(svc, old_sched);
1214 old_sched = sched;
1215 goto out;
1216 }
1217 }
1218
1219 out:
1220 write_unlock_bh(&__ip_vs_svc_lock);
1221
1222 if (old_sched)
1223 ip_vs_scheduler_put(old_sched);
1224
1225 return ret;
1226}
1227
1228
1229/*
1230 * Delete a service from the service list
1231 * - The service must be unlinked, unlocked and not referenced!
1232 * - We are called under _bh lock
1233 */
1234static void __ip_vs_del_service(struct ip_vs_service *svc)
1235{
1236 struct ip_vs_dest *dest, *nxt;
1237 struct ip_vs_scheduler *old_sched;
1238
1239 ip_vs_num_services--;
1240 ip_vs_kill_estimator(&svc->stats);
1241
1242 /* Unbind scheduler */
1243 old_sched = svc->scheduler;
1244 ip_vs_unbind_scheduler(svc);
1245 if (old_sched)
1246 ip_vs_scheduler_put(old_sched);
1247
1248 /* Unbind app inc */
1249 if (svc->inc) {
1250 ip_vs_app_inc_put(svc->inc);
1251 svc->inc = NULL;
1252 }
1253
1254 /*
1255 * Unlink the whole destination list
1256 */
1257 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1258 __ip_vs_unlink_dest(svc, dest, 0);
1259 __ip_vs_del_dest(dest);
1260 }
1261
1262 /*
1263 * Update the virtual service counters
1264 */
1265 if (svc->port == FTPPORT)
1266 atomic_dec(&ip_vs_ftpsvc_counter);
1267 else if (svc->port == 0)
1268 atomic_dec(&ip_vs_nullsvc_counter);
1269
1270 /*
1271 * Free the service if nobody refers to it
1272 */
1273 if (atomic_read(&svc->refcnt) == 0)
1274 kfree(svc);
1275
1276 /* decrease the module use count */
1277 ip_vs_use_count_dec();
1278}
1279
1280/*
1281 * Delete a service from the service list
1282 */
1283static int ip_vs_del_service(struct ip_vs_service *svc)
1284{
1285 if (svc == NULL)
1286 return -EEXIST;
1287
1288 /*
1289 * Unhash it from the service table
1290 */
1291 write_lock_bh(&__ip_vs_svc_lock);
1292
1293 ip_vs_svc_unhash(svc);
1294
1295 /*
1296 * Wait until all the svc users go away.
1297 */
1298 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1299
1300 __ip_vs_del_service(svc);
1301
1302 write_unlock_bh(&__ip_vs_svc_lock);
1303
1304 return 0;
1305}
1306
1307
1308/*
1309 * Flush all the virtual services
1310 */
1311static int ip_vs_flush(void)
1312{
1313 int idx;
1314 struct ip_vs_service *svc, *nxt;
1315
1316 /*
1317 * Flush the service table hashed by <protocol,addr,port>
1318 */
1319 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1320 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1321 write_lock_bh(&__ip_vs_svc_lock);
1322 ip_vs_svc_unhash(svc);
1323 /*
1324 * Wait until all the svc users go away.
1325 */
1326 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1327 __ip_vs_del_service(svc);
1328 write_unlock_bh(&__ip_vs_svc_lock);
1329 }
1330 }
1331
1332 /*
1333 * Flush the service table hashed by fwmark
1334 */
1335 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1336 list_for_each_entry_safe(svc, nxt,
1337 &ip_vs_svc_fwm_table[idx], f_list) {
1338 write_lock_bh(&__ip_vs_svc_lock);
1339 ip_vs_svc_unhash(svc);
1340 /*
1341 * Wait until all the svc users go away.
1342 */
1343 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1344 __ip_vs_del_service(svc);
1345 write_unlock_bh(&__ip_vs_svc_lock);
1346 }
1347 }
1348
1349 return 0;
1350}
1351
1352
1353/*
1354 * Zero counters in a service or all services
1355 */
1356static int ip_vs_zero_service(struct ip_vs_service *svc)
1357{
1358 struct ip_vs_dest *dest;
1359
1360 write_lock_bh(&__ip_vs_svc_lock);
1361 list_for_each_entry(dest, &svc->destinations, n_list) {
1362 ip_vs_zero_stats(&dest->stats);
1363 }
1364 ip_vs_zero_stats(&svc->stats);
1365 write_unlock_bh(&__ip_vs_svc_lock);
1366 return 0;
1367}
1368
1369static int ip_vs_zero_all(void)
1370{
1371 int idx;
1372 struct ip_vs_service *svc;
1373
1374 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1375 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1376 ip_vs_zero_service(svc);
1377 }
1378 }
1379
1380 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1381 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1382 ip_vs_zero_service(svc);
1383 }
1384 }
1385
1386 ip_vs_zero_stats(&ip_vs_stats);
1387 return 0;
1388}
1389
1390
1391static int
1392proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1393 void __user *buffer, size_t *lenp, loff_t *ppos)
1394{
1395 int *valp = table->data;
1396 int val = *valp;
1397 int rc;
1398
1399 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1400 if (write && (*valp != val)) {
1401 if ((*valp < 0) || (*valp > 3)) {
1402 /* Restore the correct value */
1403 *valp = val;
1404 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001405 update_defense_level();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001406 }
1407 }
1408 return rc;
1409}
1410
1411
1412static int
1413proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1414 void __user *buffer, size_t *lenp, loff_t *ppos)
1415{
1416 int *valp = table->data;
1417 int val[2];
1418 int rc;
1419
1420 /* backup the value first */
1421 memcpy(val, valp, sizeof(val));
1422
1423 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1424 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1425 /* Restore the correct value */
1426 memcpy(valp, val, sizeof(val));
1427 }
1428 return rc;
1429}
1430
1431
1432/*
1433 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1434 */
1435
1436static struct ctl_table vs_vars[] = {
1437 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001438 .procname = "amemthresh",
1439 .data = &sysctl_ip_vs_amemthresh,
1440 .maxlen = sizeof(int),
1441 .mode = 0644,
1442 .proc_handler = &proc_dointvec,
1443 },
1444#ifdef CONFIG_IP_VS_DEBUG
1445 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001446 .procname = "debug_level",
1447 .data = &sysctl_ip_vs_debug_level,
1448 .maxlen = sizeof(int),
1449 .mode = 0644,
1450 .proc_handler = &proc_dointvec,
1451 },
1452#endif
1453 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454 .procname = "am_droprate",
1455 .data = &sysctl_ip_vs_am_droprate,
1456 .maxlen = sizeof(int),
1457 .mode = 0644,
1458 .proc_handler = &proc_dointvec,
1459 },
1460 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001461 .procname = "drop_entry",
1462 .data = &sysctl_ip_vs_drop_entry,
1463 .maxlen = sizeof(int),
1464 .mode = 0644,
1465 .proc_handler = &proc_do_defense_mode,
1466 },
1467 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001468 .procname = "drop_packet",
1469 .data = &sysctl_ip_vs_drop_packet,
1470 .maxlen = sizeof(int),
1471 .mode = 0644,
1472 .proc_handler = &proc_do_defense_mode,
1473 },
1474 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001475 .procname = "secure_tcp",
1476 .data = &sysctl_ip_vs_secure_tcp,
1477 .maxlen = sizeof(int),
1478 .mode = 0644,
1479 .proc_handler = &proc_do_defense_mode,
1480 },
1481#if 0
1482 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001483 .procname = "timeout_established",
1484 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1485 .maxlen = sizeof(int),
1486 .mode = 0644,
1487 .proc_handler = &proc_dointvec_jiffies,
1488 },
1489 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001490 .procname = "timeout_synsent",
1491 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1492 .maxlen = sizeof(int),
1493 .mode = 0644,
1494 .proc_handler = &proc_dointvec_jiffies,
1495 },
1496 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001497 .procname = "timeout_synrecv",
1498 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1499 .maxlen = sizeof(int),
1500 .mode = 0644,
1501 .proc_handler = &proc_dointvec_jiffies,
1502 },
1503 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001504 .procname = "timeout_finwait",
1505 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1506 .maxlen = sizeof(int),
1507 .mode = 0644,
1508 .proc_handler = &proc_dointvec_jiffies,
1509 },
1510 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511 .procname = "timeout_timewait",
1512 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1513 .maxlen = sizeof(int),
1514 .mode = 0644,
1515 .proc_handler = &proc_dointvec_jiffies,
1516 },
1517 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001518 .procname = "timeout_close",
1519 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1520 .maxlen = sizeof(int),
1521 .mode = 0644,
1522 .proc_handler = &proc_dointvec_jiffies,
1523 },
1524 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525 .procname = "timeout_closewait",
1526 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1527 .maxlen = sizeof(int),
1528 .mode = 0644,
1529 .proc_handler = &proc_dointvec_jiffies,
1530 },
1531 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001532 .procname = "timeout_lastack",
1533 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1534 .maxlen = sizeof(int),
1535 .mode = 0644,
1536 .proc_handler = &proc_dointvec_jiffies,
1537 },
1538 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539 .procname = "timeout_listen",
1540 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1541 .maxlen = sizeof(int),
1542 .mode = 0644,
1543 .proc_handler = &proc_dointvec_jiffies,
1544 },
1545 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001546 .procname = "timeout_synack",
1547 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1548 .maxlen = sizeof(int),
1549 .mode = 0644,
1550 .proc_handler = &proc_dointvec_jiffies,
1551 },
1552 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001553 .procname = "timeout_udp",
1554 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1555 .maxlen = sizeof(int),
1556 .mode = 0644,
1557 .proc_handler = &proc_dointvec_jiffies,
1558 },
1559 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001560 .procname = "timeout_icmp",
1561 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1562 .maxlen = sizeof(int),
1563 .mode = 0644,
1564 .proc_handler = &proc_dointvec_jiffies,
1565 },
1566#endif
1567 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001568 .procname = "cache_bypass",
1569 .data = &sysctl_ip_vs_cache_bypass,
1570 .maxlen = sizeof(int),
1571 .mode = 0644,
1572 .proc_handler = &proc_dointvec,
1573 },
1574 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001575 .procname = "expire_nodest_conn",
1576 .data = &sysctl_ip_vs_expire_nodest_conn,
1577 .maxlen = sizeof(int),
1578 .mode = 0644,
1579 .proc_handler = &proc_dointvec,
1580 },
1581 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001582 .procname = "expire_quiescent_template",
1583 .data = &sysctl_ip_vs_expire_quiescent_template,
1584 .maxlen = sizeof(int),
1585 .mode = 0644,
1586 .proc_handler = &proc_dointvec,
1587 },
1588 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589 .procname = "sync_threshold",
1590 .data = &sysctl_ip_vs_sync_threshold,
1591 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1592 .mode = 0644,
1593 .proc_handler = &proc_do_sync_threshold,
1594 },
1595 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001596 .procname = "nat_icmp_send",
1597 .data = &sysctl_ip_vs_nat_icmp_send,
1598 .maxlen = sizeof(int),
1599 .mode = 0644,
1600 .proc_handler = &proc_dointvec,
1601 },
1602 { .ctl_name = 0 }
1603};
1604
Sven Wegener5587da52008-08-10 18:24:40 +00001605const struct ctl_path net_vs_ctl_path[] = {
Pavel Emelyanov90754f82008-01-12 02:33:50 -08001606 { .procname = "net", .ctl_name = CTL_NET, },
1607 { .procname = "ipv4", .ctl_name = NET_IPV4, },
1608 { .procname = "vs", },
1609 { }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001610};
Pavel Emelyanov90754f82008-01-12 02:33:50 -08001611EXPORT_SYMBOL_GPL(net_vs_ctl_path);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001612
/* Header for the IPVS sysctl table (presumably set when it is registered). */
static struct ctl_table_header *sysctl_header;
1614
1615#ifdef CONFIG_PROC_FS
1616
/*
 *	Per-open iteration state of the service listing seq_file.
 */
struct ip_vs_iter {
	/* hash table being walked: ip_vs_svc_table or ip_vs_svc_fwm_table */
	struct list_head *table;
	/* index of the bucket in 'table' that holds the current service */
	int bucket;
};
1621
1622/*
1623 * Write the contents of the VS rule table to a PROCfs file.
1624 * (It is kept just for backward compatibility)
1625 */
1626static inline const char *ip_vs_fwd_name(unsigned flags)
1627{
1628 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1629 case IP_VS_CONN_F_LOCALNODE:
1630 return "Local";
1631 case IP_VS_CONN_F_TUNNEL:
1632 return "Tunnel";
1633 case IP_VS_CONN_F_DROUTE:
1634 return "Route";
1635 default:
1636 return "Masq";
1637 }
1638}
1639
1640
1641/* Get the Nth entry in the two lists */
1642static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1643{
1644 struct ip_vs_iter *iter = seq->private;
1645 int idx;
1646 struct ip_vs_service *svc;
1647
1648 /* look in hash by protocol */
1649 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1650 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1651 if (pos-- == 0){
1652 iter->table = ip_vs_svc_table;
1653 iter->bucket = idx;
1654 return svc;
1655 }
1656 }
1657 }
1658
1659 /* keep looking in fwmark */
1660 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1661 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1662 if (pos-- == 0) {
1663 iter->table = ip_vs_svc_fwm_table;
1664 iter->bucket = idx;
1665 return svc;
1666 }
1667 }
1668 }
1669
1670 return NULL;
1671}
1672
1673static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1674{
1675
1676 read_lock_bh(&__ip_vs_svc_lock);
1677 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1678}
1679
1680
1681static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1682{
1683 struct list_head *e;
1684 struct ip_vs_iter *iter;
1685 struct ip_vs_service *svc;
1686
1687 ++*pos;
1688 if (v == SEQ_START_TOKEN)
1689 return ip_vs_info_array(seq,0);
1690
1691 svc = v;
1692 iter = seq->private;
1693
1694 if (iter->table == ip_vs_svc_table) {
1695 /* next service in table hashed by protocol */
1696 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1697 return list_entry(e, struct ip_vs_service, s_list);
1698
1699
1700 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1701 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1702 s_list) {
1703 return svc;
1704 }
1705 }
1706
1707 iter->table = ip_vs_svc_fwm_table;
1708 iter->bucket = -1;
1709 goto scan_fwmark;
1710 }
1711
1712 /* next service in hashed by fwmark */
1713 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1714 return list_entry(e, struct ip_vs_service, f_list);
1715
1716 scan_fwmark:
1717 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1718 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1719 f_list)
1720 return svc;
1721 }
1722
1723 return NULL;
1724}
1725
1726static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1727{
1728 read_unlock_bh(&__ip_vs_svc_lock);
1729}
1730
1731
1732static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1733{
1734 if (v == SEQ_START_TOKEN) {
1735 seq_printf(seq,
1736 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1737 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1738 seq_puts(seq,
1739 "Prot LocalAddress:Port Scheduler Flags\n");
1740 seq_puts(seq,
1741 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1742 } else {
1743 const struct ip_vs_service *svc = v;
1744 const struct ip_vs_iter *iter = seq->private;
1745 const struct ip_vs_dest *dest;
1746
1747 if (iter->table == ip_vs_svc_table)
1748 seq_printf(seq, "%s %08X:%04X %s ",
1749 ip_vs_proto_name(svc->protocol),
1750 ntohl(svc->addr),
1751 ntohs(svc->port),
1752 svc->scheduler->name);
1753 else
1754 seq_printf(seq, "FWM %08X %s ",
1755 svc->fwmark, svc->scheduler->name);
1756
1757 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1758 seq_printf(seq, "persistent %d %08X\n",
1759 svc->timeout,
1760 ntohl(svc->netmask));
1761 else
1762 seq_putc(seq, '\n');
1763
1764 list_for_each_entry(dest, &svc->destinations, n_list) {
1765 seq_printf(seq,
1766 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1767 ntohl(dest->addr), ntohs(dest->port),
1768 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1769 atomic_read(&dest->weight),
1770 atomic_read(&dest->activeconns),
1771 atomic_read(&dest->inactconns));
1772 }
1773 }
1774 return 0;
1775}
1776
Philippe De Muyter56b3d972007-07-10 23:07:31 -07001777static const struct seq_operations ip_vs_info_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001778 .start = ip_vs_info_seq_start,
1779 .next = ip_vs_info_seq_next,
1780 .stop = ip_vs_info_seq_stop,
1781 .show = ip_vs_info_seq_show,
1782};
1783
1784static int ip_vs_info_open(struct inode *inode, struct file *file)
1785{
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -07001786 return seq_open_private(file, &ip_vs_info_seq_ops,
1787 sizeof(struct ip_vs_iter));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001788}
1789
Arjan van de Ven9a321442007-02-12 00:55:35 -08001790static const struct file_operations ip_vs_info_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001791 .owner = THIS_MODULE,
1792 .open = ip_vs_info_open,
1793 .read = seq_read,
1794 .llseek = seq_lseek,
1795 .release = seq_release_private,
1796};
1797
1798#endif
1799
Sven Wegener519e49e2008-08-10 18:24:41 +00001800struct ip_vs_stats ip_vs_stats = {
1801 .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
1802};
Linus Torvalds1da177e2005-04-16 15:20:36 -07001803
1804#ifdef CONFIG_PROC_FS
1805static int ip_vs_stats_show(struct seq_file *seq, void *v)
1806{
1807
1808/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1809 seq_puts(seq,
1810 " Total Incoming Outgoing Incoming Outgoing\n");
1811 seq_printf(seq,
1812 " Conns Packets Packets Bytes Bytes\n");
1813
1814 spin_lock_bh(&ip_vs_stats.lock);
1815 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1816 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1817 (unsigned long long) ip_vs_stats.inbytes,
1818 (unsigned long long) ip_vs_stats.outbytes);
1819
1820/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1821 seq_puts(seq,
1822 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1823 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1824 ip_vs_stats.cps,
1825 ip_vs_stats.inpps,
1826 ip_vs_stats.outpps,
1827 ip_vs_stats.inbps,
1828 ip_vs_stats.outbps);
1829 spin_unlock_bh(&ip_vs_stats.lock);
1830
1831 return 0;
1832}
1833
1834static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1835{
1836 return single_open(file, ip_vs_stats_show, NULL);
1837}
1838
Arjan van de Ven9a321442007-02-12 00:55:35 -08001839static const struct file_operations ip_vs_stats_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001840 .owner = THIS_MODULE,
1841 .open = ip_vs_stats_seq_open,
1842 .read = seq_read,
1843 .llseek = seq_lseek,
1844 .release = single_release,
1845};
1846
1847#endif
1848
1849/*
1850 * Set timeout values for tcp tcpfin udp in the timeout_table.
1851 */
1852static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1853{
1854 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1855 u->tcp_timeout,
1856 u->tcp_fin_timeout,
1857 u->udp_timeout);
1858
1859#ifdef CONFIG_IP_VS_PROTO_TCP
1860 if (u->tcp_timeout) {
1861 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1862 = u->tcp_timeout * HZ;
1863 }
1864
1865 if (u->tcp_fin_timeout) {
1866 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1867 = u->tcp_fin_timeout * HZ;
1868 }
1869#endif
1870
1871#ifdef CONFIG_IP_VS_PROTO_UDP
1872 if (u->udp_timeout) {
1873 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1874 = u->udp_timeout * HZ;
1875 }
1876#endif
1877 return 0;
1878}
1879
1880
1881#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
1882#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1883#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1884 sizeof(struct ip_vs_dest_user))
1885#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1886#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
1887#define MAX_ARG_LEN SVCDEST_ARG_LEN
1888
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001889static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001890 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1891 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1892 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1893 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1894 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1895 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1896 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1897 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1898 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1899 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1900 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1901};
1902
1903static int
1904do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1905{
1906 int ret;
1907 unsigned char arg[MAX_ARG_LEN];
1908 struct ip_vs_service_user *usvc;
1909 struct ip_vs_service *svc;
1910 struct ip_vs_dest_user *udest;
1911
1912 if (!capable(CAP_NET_ADMIN))
1913 return -EPERM;
1914
1915 if (len != set_arglen[SET_CMDID(cmd)]) {
1916 IP_VS_ERR("set_ctl: len %u != %u\n",
1917 len, set_arglen[SET_CMDID(cmd)]);
1918 return -EINVAL;
1919 }
1920
1921 if (copy_from_user(arg, user, len) != 0)
1922 return -EFAULT;
1923
1924 /* increase the module use count */
1925 ip_vs_use_count_inc();
1926
Ingo Molnar14cc3e22006-03-26 01:37:14 -08001927 if (mutex_lock_interruptible(&__ip_vs_mutex)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001928 ret = -ERESTARTSYS;
1929 goto out_dec;
1930 }
1931
1932 if (cmd == IP_VS_SO_SET_FLUSH) {
1933 /* Flush the virtual service */
1934 ret = ip_vs_flush();
1935 goto out_unlock;
1936 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1937 /* Set timeout values for (tcp tcpfin udp) */
1938 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1939 goto out_unlock;
1940 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1941 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1942 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1943 goto out_unlock;
1944 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1945 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1946 ret = stop_sync_thread(dm->state);
1947 goto out_unlock;
1948 }
1949
1950 usvc = (struct ip_vs_service_user *)arg;
1951 udest = (struct ip_vs_dest_user *)(usvc + 1);
1952
1953 if (cmd == IP_VS_SO_SET_ZERO) {
1954 /* if no service address is set, zero counters in all */
1955 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1956 ret = ip_vs_zero_all();
1957 goto out_unlock;
1958 }
1959 }
1960
1961 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1962 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1963 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1964 usvc->protocol, NIPQUAD(usvc->addr),
1965 ntohs(usvc->port), usvc->sched_name);
1966 ret = -EFAULT;
1967 goto out_unlock;
1968 }
1969
1970 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1971 if (usvc->fwmark == 0)
1972 svc = __ip_vs_service_get(usvc->protocol,
1973 usvc->addr, usvc->port);
1974 else
1975 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1976
1977 if (cmd != IP_VS_SO_SET_ADD
1978 && (svc == NULL || svc->protocol != usvc->protocol)) {
1979 ret = -ESRCH;
1980 goto out_unlock;
1981 }
1982
1983 switch (cmd) {
1984 case IP_VS_SO_SET_ADD:
1985 if (svc != NULL)
1986 ret = -EEXIST;
1987 else
1988 ret = ip_vs_add_service(usvc, &svc);
1989 break;
1990 case IP_VS_SO_SET_EDIT:
1991 ret = ip_vs_edit_service(svc, usvc);
1992 break;
1993 case IP_VS_SO_SET_DEL:
1994 ret = ip_vs_del_service(svc);
1995 if (!ret)
1996 goto out_unlock;
1997 break;
1998 case IP_VS_SO_SET_ZERO:
1999 ret = ip_vs_zero_service(svc);
2000 break;
2001 case IP_VS_SO_SET_ADDDEST:
2002 ret = ip_vs_add_dest(svc, udest);
2003 break;
2004 case IP_VS_SO_SET_EDITDEST:
2005 ret = ip_vs_edit_dest(svc, udest);
2006 break;
2007 case IP_VS_SO_SET_DELDEST:
2008 ret = ip_vs_del_dest(svc, udest);
2009 break;
2010 default:
2011 ret = -EINVAL;
2012 }
2013
2014 if (svc)
2015 ip_vs_service_put(svc);
2016
2017 out_unlock:
Ingo Molnar14cc3e22006-03-26 01:37:14 -08002018 mutex_unlock(&__ip_vs_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002019 out_dec:
2020 /* decrease the module use count */
2021 ip_vs_use_count_dec();
2022
2023 return ret;
2024}
2025
2026
2027static void
2028ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2029{
2030 spin_lock_bh(&src->lock);
2031 memcpy(dst, src, (char*)&src->lock - (char*)src);
2032 spin_unlock_bh(&src->lock);
2033}
2034
2035static void
2036ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2037{
2038 dst->protocol = src->protocol;
2039 dst->addr = src->addr;
2040 dst->port = src->port;
2041 dst->fwmark = src->fwmark;
pageexec4da62fc2005-06-26 16:00:19 -07002042 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002043 dst->flags = src->flags;
2044 dst->timeout = src->timeout / HZ;
2045 dst->netmask = src->netmask;
2046 dst->num_dests = src->num_dests;
2047 ip_vs_copy_stats(&dst->stats, &src->stats);
2048}
2049
2050static inline int
2051__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2052 struct ip_vs_get_services __user *uptr)
2053{
2054 int idx, count=0;
2055 struct ip_vs_service *svc;
2056 struct ip_vs_service_entry entry;
2057 int ret = 0;
2058
2059 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2060 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2061 if (count >= get->num_services)
2062 goto out;
pageexec4da62fc2005-06-26 16:00:19 -07002063 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002064 ip_vs_copy_service(&entry, svc);
2065 if (copy_to_user(&uptr->entrytable[count],
2066 &entry, sizeof(entry))) {
2067 ret = -EFAULT;
2068 goto out;
2069 }
2070 count++;
2071 }
2072 }
2073
2074 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2075 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2076 if (count >= get->num_services)
2077 goto out;
pageexec4da62fc2005-06-26 16:00:19 -07002078 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079 ip_vs_copy_service(&entry, svc);
2080 if (copy_to_user(&uptr->entrytable[count],
2081 &entry, sizeof(entry))) {
2082 ret = -EFAULT;
2083 goto out;
2084 }
2085 count++;
2086 }
2087 }
2088 out:
2089 return ret;
2090}
2091
2092static inline int
2093__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2094 struct ip_vs_get_dests __user *uptr)
2095{
2096 struct ip_vs_service *svc;
2097 int ret = 0;
2098
2099 if (get->fwmark)
2100 svc = __ip_vs_svc_fwm_get(get->fwmark);
2101 else
2102 svc = __ip_vs_service_get(get->protocol,
2103 get->addr, get->port);
2104 if (svc) {
2105 int count = 0;
2106 struct ip_vs_dest *dest;
2107 struct ip_vs_dest_entry entry;
2108
2109 list_for_each_entry(dest, &svc->destinations, n_list) {
2110 if (count >= get->num_dests)
2111 break;
2112
2113 entry.addr = dest->addr;
2114 entry.port = dest->port;
2115 entry.conn_flags = atomic_read(&dest->conn_flags);
2116 entry.weight = atomic_read(&dest->weight);
2117 entry.u_threshold = dest->u_threshold;
2118 entry.l_threshold = dest->l_threshold;
2119 entry.activeconns = atomic_read(&dest->activeconns);
2120 entry.inactconns = atomic_read(&dest->inactconns);
2121 entry.persistconns = atomic_read(&dest->persistconns);
2122 ip_vs_copy_stats(&entry.stats, &dest->stats);
2123 if (copy_to_user(&uptr->entrytable[count],
2124 &entry, sizeof(entry))) {
2125 ret = -EFAULT;
2126 break;
2127 }
2128 count++;
2129 }
2130 ip_vs_service_put(svc);
2131 } else
2132 ret = -ESRCH;
2133 return ret;
2134}
2135
2136static inline void
2137__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2138{
2139#ifdef CONFIG_IP_VS_PROTO_TCP
2140 u->tcp_timeout =
2141 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2142 u->tcp_fin_timeout =
2143 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2144#endif
2145#ifdef CONFIG_IP_VS_PROTO_UDP
2146 u->udp_timeout =
2147 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2148#endif
2149}
2150
2151
2152#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2153#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2154#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2155#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2156#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2157#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2158#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2159
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08002160static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002161 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2162 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2163 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2164 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2165 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2166 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2167 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2168};
2169
2170static int
2171do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2172{
2173 unsigned char arg[128];
2174 int ret = 0;
2175
2176 if (!capable(CAP_NET_ADMIN))
2177 return -EPERM;
2178
2179 if (*len < get_arglen[GET_CMDID(cmd)]) {
2180 IP_VS_ERR("get_ctl: len %u < %u\n",
2181 *len, get_arglen[GET_CMDID(cmd)]);
2182 return -EINVAL;
2183 }
2184
2185 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2186 return -EFAULT;
2187
Ingo Molnar14cc3e22006-03-26 01:37:14 -08002188 if (mutex_lock_interruptible(&__ip_vs_mutex))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002189 return -ERESTARTSYS;
2190
2191 switch (cmd) {
2192 case IP_VS_SO_GET_VERSION:
2193 {
2194 char buf[64];
2195
2196 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2197 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2198 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2199 ret = -EFAULT;
2200 goto out;
2201 }
2202 *len = strlen(buf)+1;
2203 }
2204 break;
2205
2206 case IP_VS_SO_GET_INFO:
2207 {
2208 struct ip_vs_getinfo info;
2209 info.version = IP_VS_VERSION_CODE;
2210 info.size = IP_VS_CONN_TAB_SIZE;
2211 info.num_services = ip_vs_num_services;
2212 if (copy_to_user(user, &info, sizeof(info)) != 0)
2213 ret = -EFAULT;
2214 }
2215 break;
2216
2217 case IP_VS_SO_GET_SERVICES:
2218 {
2219 struct ip_vs_get_services *get;
2220 int size;
2221
2222 get = (struct ip_vs_get_services *)arg;
2223 size = sizeof(*get) +
2224 sizeof(struct ip_vs_service_entry) * get->num_services;
2225 if (*len != size) {
2226 IP_VS_ERR("length: %u != %u\n", *len, size);
2227 ret = -EINVAL;
2228 goto out;
2229 }
2230 ret = __ip_vs_get_service_entries(get, user);
2231 }
2232 break;
2233
2234 case IP_VS_SO_GET_SERVICE:
2235 {
2236 struct ip_vs_service_entry *entry;
2237 struct ip_vs_service *svc;
2238
2239 entry = (struct ip_vs_service_entry *)arg;
2240 if (entry->fwmark)
2241 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2242 else
2243 svc = __ip_vs_service_get(entry->protocol,
2244 entry->addr, entry->port);
2245 if (svc) {
2246 ip_vs_copy_service(entry, svc);
2247 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2248 ret = -EFAULT;
2249 ip_vs_service_put(svc);
2250 } else
2251 ret = -ESRCH;
2252 }
2253 break;
2254
2255 case IP_VS_SO_GET_DESTS:
2256 {
2257 struct ip_vs_get_dests *get;
2258 int size;
2259
2260 get = (struct ip_vs_get_dests *)arg;
2261 size = sizeof(*get) +
2262 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2263 if (*len != size) {
2264 IP_VS_ERR("length: %u != %u\n", *len, size);
2265 ret = -EINVAL;
2266 goto out;
2267 }
2268 ret = __ip_vs_get_dest_entries(get, user);
2269 }
2270 break;
2271
2272 case IP_VS_SO_GET_TIMEOUT:
2273 {
2274 struct ip_vs_timeout_user t;
2275
2276 __ip_vs_get_timeouts(&t);
2277 if (copy_to_user(user, &t, sizeof(t)) != 0)
2278 ret = -EFAULT;
2279 }
2280 break;
2281
2282 case IP_VS_SO_GET_DAEMON:
2283 {
2284 struct ip_vs_daemon_user d[2];
2285
2286 memset(&d, 0, sizeof(d));
2287 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2288 d[0].state = IP_VS_STATE_MASTER;
pageexec4da62fc2005-06-26 16:00:19 -07002289 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002290 d[0].syncid = ip_vs_master_syncid;
2291 }
2292 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2293 d[1].state = IP_VS_STATE_BACKUP;
pageexec4da62fc2005-06-26 16:00:19 -07002294 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002295 d[1].syncid = ip_vs_backup_syncid;
2296 }
2297 if (copy_to_user(user, &d, sizeof(d)) != 0)
2298 ret = -EFAULT;
2299 }
2300 break;
2301
2302 default:
2303 ret = -EINVAL;
2304 }
2305
2306 out:
Ingo Molnar14cc3e22006-03-26 01:37:14 -08002307 mutex_unlock(&__ip_vs_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002308 return ret;
2309}
2310
2311
2312static struct nf_sockopt_ops ip_vs_sockopts = {
2313 .pf = PF_INET,
2314 .set_optmin = IP_VS_BASE_CTL,
2315 .set_optmax = IP_VS_SO_SET_MAX+1,
2316 .set = do_ip_vs_set_ctl,
2317 .get_optmin = IP_VS_BASE_CTL,
2318 .get_optmax = IP_VS_SO_GET_MAX+1,
2319 .get = do_ip_vs_get_ctl,
Neil Horman16fcec32007-09-11 11:28:26 +02002320 .owner = THIS_MODULE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002321};
2322
2323
Sven Wegener048cf482008-08-10 18:24:35 +00002324int __init ip_vs_control_init(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002325{
2326 int ret;
2327 int idx;
2328
2329 EnterFunction(2);
2330
2331 ret = nf_register_sockopt(&ip_vs_sockopts);
2332 if (ret) {
2333 IP_VS_ERR("cannot register sockopt.\n");
2334 return ret;
2335 }
2336
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02002337 proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
2338 proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002339
Pavel Emelyanov90754f82008-01-12 02:33:50 -08002340 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002341
2342 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2343 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2344 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2345 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2346 }
2347 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2348 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2349 }
2350
Linus Torvalds1da177e2005-04-16 15:20:36 -07002351 ip_vs_new_estimator(&ip_vs_stats);
2352
2353 /* Hook the defense timer */
2354 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2355
2356 LeaveFunction(2);
2357 return 0;
2358}
2359
2360
2361void ip_vs_control_cleanup(void)
2362{
2363 EnterFunction(2);
2364 ip_vs_trash_cleanup();
2365 cancel_rearming_delayed_work(&defense_work);
Oleg Nesterov28e53bd2007-05-09 02:34:22 -07002366 cancel_work_sync(&defense_work.work);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002367 ip_vs_kill_estimator(&ip_vs_stats);
2368 unregister_sysctl_table(sysctl_header);
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02002369 proc_net_remove(&init_net, "ip_vs_stats");
2370 proc_net_remove(&init_net, "ip_vs");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371 nf_unregister_sockopt(&ip_vs_sockopts);
2372 LeaveFunction(2);
2373}