 1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * Changes:
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/init.h>
25#include <linux/types.h>
 26#include <linux/capability.h>
 27#include <linux/fs.h>
28#include <linux/sysctl.h>
29#include <linux/proc_fs.h>
30#include <linux/workqueue.h>
31#include <linux/swap.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34
35#include <linux/netfilter.h>
36#include <linux/netfilter_ipv4.h>
 37#include <linux/mutex.h>
 38
39#include <net/ip.h>
 40#include <net/route.h>
 41#include <net/sock.h>
42
43#include <asm/uaccess.h>
44
45#include <net/ip_vs.h>
46
 47/* mutex for IPVS sockopts; [gs]etsockopt may sleep. */
 48static DEFINE_MUTEX(__ip_vs_mutex);
 49
50/* lock for service table */
51static DEFINE_RWLOCK(__ip_vs_svc_lock);
52
53/* lock for table with the real services */
54static DEFINE_RWLOCK(__ip_vs_rs_lock);
55
56/* lock for state and timeout tables */
57static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
58
59/* lock for drop entry handling */
60static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
61
62/* lock for drop packet handling */
63static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
64
65/* 1/rate drop and drop-entry variables */
66int ip_vs_drop_rate = 0;
67int ip_vs_drop_counter = 0;
68static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
69
70/* number of virtual services */
71static int ip_vs_num_services = 0;
72
73/* sysctl variables */
74static int sysctl_ip_vs_drop_entry = 0;
75static int sysctl_ip_vs_drop_packet = 0;
76static int sysctl_ip_vs_secure_tcp = 0;
77static int sysctl_ip_vs_amemthresh = 1024;
78static int sysctl_ip_vs_am_droprate = 10;
79int sysctl_ip_vs_cache_bypass = 0;
80int sysctl_ip_vs_expire_nodest_conn = 0;
81int sysctl_ip_vs_expire_quiescent_template = 0;
82int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
83int sysctl_ip_vs_nat_icmp_send = 0;
84
85
86#ifdef CONFIG_IP_VS_DEBUG
87static int sysctl_ip_vs_debug_level = 0;
88
89int ip_vs_get_debug_level(void)
90{
91 return sysctl_ip_vs_debug_level;
92}
93#endif
94
95/*
 96 * update_defense_level is called from keventd and from sysctl,
 97 * so it needs to protect itself from softirqs
 98 */
99static void update_defense_level(void)
100{
101 struct sysinfo i;
102 static int old_secure_tcp = 0;
103 int availmem;
104 int nomem;
105 int to_change = -1;
106
107 /* we only count free and buffered memory (in pages) */
108 si_meminfo(&i);
109 availmem = i.freeram + i.bufferram;
110 /* however in linux 2.5 the i.bufferram is total page cache size,
111 we need adjust it */
112 /* si_swapinfo(&i); */
113 /* availmem = availmem - (i.totalswap - i.freeswap); */
114
115 nomem = (availmem < sysctl_ip_vs_amemthresh);
116
117 local_bh_disable();
118
119 /* drop_entry */
120 spin_lock(&__ip_vs_dropentry_lock);
121 switch (sysctl_ip_vs_drop_entry) {
122 case 0:
123 atomic_set(&ip_vs_dropentry, 0);
124 break;
125 case 1:
126 if (nomem) {
127 atomic_set(&ip_vs_dropentry, 1);
128 sysctl_ip_vs_drop_entry = 2;
129 } else {
130 atomic_set(&ip_vs_dropentry, 0);
131 }
132 break;
133 case 2:
134 if (nomem) {
135 atomic_set(&ip_vs_dropentry, 1);
136 } else {
137 atomic_set(&ip_vs_dropentry, 0);
138 sysctl_ip_vs_drop_entry = 1;
139 };
140 break;
141 case 3:
142 atomic_set(&ip_vs_dropentry, 1);
143 break;
144 }
145 spin_unlock(&__ip_vs_dropentry_lock);
146
147 /* drop_packet */
148 spin_lock(&__ip_vs_droppacket_lock);
149 switch (sysctl_ip_vs_drop_packet) {
150 case 0:
151 ip_vs_drop_rate = 0;
152 break;
153 case 1:
154 if (nomem) {
155 ip_vs_drop_rate = ip_vs_drop_counter
156 = sysctl_ip_vs_amemthresh /
157 (sysctl_ip_vs_amemthresh-availmem);
158 sysctl_ip_vs_drop_packet = 2;
159 } else {
160 ip_vs_drop_rate = 0;
161 }
162 break;
163 case 2:
164 if (nomem) {
165 ip_vs_drop_rate = ip_vs_drop_counter
166 = sysctl_ip_vs_amemthresh /
167 (sysctl_ip_vs_amemthresh-availmem);
168 } else {
169 ip_vs_drop_rate = 0;
170 sysctl_ip_vs_drop_packet = 1;
171 }
172 break;
173 case 3:
174 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
175 break;
176 }
177 spin_unlock(&__ip_vs_droppacket_lock);
178
179 /* secure_tcp */
180 write_lock(&__ip_vs_securetcp_lock);
181 switch (sysctl_ip_vs_secure_tcp) {
182 case 0:
183 if (old_secure_tcp >= 2)
184 to_change = 0;
185 break;
186 case 1:
187 if (nomem) {
188 if (old_secure_tcp < 2)
189 to_change = 1;
190 sysctl_ip_vs_secure_tcp = 2;
191 } else {
192 if (old_secure_tcp >= 2)
193 to_change = 0;
194 }
195 break;
196 case 2:
197 if (nomem) {
198 if (old_secure_tcp < 2)
199 to_change = 1;
200 } else {
201 if (old_secure_tcp >= 2)
202 to_change = 0;
203 sysctl_ip_vs_secure_tcp = 1;
204 }
205 break;
206 case 3:
207 if (old_secure_tcp < 2)
208 to_change = 1;
209 break;
210 }
211 old_secure_tcp = sysctl_ip_vs_secure_tcp;
212 if (to_change >= 0)
213 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
214 write_unlock(&__ip_vs_securetcp_lock);
215
216 local_bh_enable();
217}
218
219
220/*
221 * Timer for checking the defense
222 */
223#define DEFENSE_TIMER_PERIOD 1*HZ
224static void defense_work_handler(void *data);
225static DECLARE_WORK(defense_work, defense_work_handler, NULL);
226
227static void defense_work_handler(void *data)
228{
229 update_defense_level();
230 if (atomic_read(&ip_vs_dropentry))
231 ip_vs_random_dropentry();
232
233 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
234}
235
236int
237ip_vs_use_count_inc(void)
238{
239 return try_module_get(THIS_MODULE);
240}
241
242void
243ip_vs_use_count_dec(void)
244{
245 module_put(THIS_MODULE);
246}
247
248
249/*
250 * Hash table: for virtual service lookups
251 */
252#define IP_VS_SVC_TAB_BITS 8
253#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
254#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
255
256/* the service table hashed by <protocol, addr, port> */
257static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
258/* the service table hashed by fwmark */
259static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
260
261/*
262 * Hash table: for real service lookups
263 */
264#define IP_VS_RTAB_BITS 4
265#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
266#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
267
268static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
269
270/*
271 * Trash for destinations
272 */
273static LIST_HEAD(ip_vs_dest_trash);
274
275/*
276 * FTP & NULL virtual service counters
277 */
278static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
279static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
280
281
282/*
283 * Returns hash value for virtual service
284 */
285static __inline__ unsigned
286ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
287{
288 register unsigned porth = ntohs(port);
289
290 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
291 & IP_VS_SVC_TAB_MASK;
292}
293
294/*
295 * Returns hash value of fwmark for virtual service lookup
296 */
297static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
298{
299 return fwmark & IP_VS_SVC_TAB_MASK;
300}
301
302/*
303 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
304 * or in the ip_vs_svc_fwm_table by fwmark.
305 * Should be called with locked tables.
306 */
307static int ip_vs_svc_hash(struct ip_vs_service *svc)
308{
309 unsigned hash;
310
311 if (svc->flags & IP_VS_SVC_F_HASHED) {
312 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
313 "called from %p\n", __builtin_return_address(0));
314 return 0;
315 }
316
317 if (svc->fwmark == 0) {
318 /*
319 * Hash it by <protocol,addr,port> in ip_vs_svc_table
320 */
321 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
322 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
323 } else {
324 /*
325 * Hash it by fwmark in ip_vs_svc_fwm_table
326 */
327 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
328 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
329 }
330
331 svc->flags |= IP_VS_SVC_F_HASHED;
332 /* increase its refcnt because it is referenced by the svc table */
333 atomic_inc(&svc->refcnt);
334 return 1;
335}
336
337
338/*
339 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
340 * Should be called with locked tables.
341 */
342static int ip_vs_svc_unhash(struct ip_vs_service *svc)
343{
344 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
345 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
346 "called from %p\n", __builtin_return_address(0));
347 return 0;
348 }
349
350 if (svc->fwmark == 0) {
351 /* Remove it from the ip_vs_svc_table table */
352 list_del(&svc->s_list);
353 } else {
354 /* Remove it from the ip_vs_svc_fwm_table table */
355 list_del(&svc->f_list);
356 }
357
358 svc->flags &= ~IP_VS_SVC_F_HASHED;
359 atomic_dec(&svc->refcnt);
360 return 1;
361}
362
363
364/*
365 * Get service by {proto,addr,port} in the service table.
366 */
367static __inline__ struct ip_vs_service *
368__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
369{
370 unsigned hash;
371 struct ip_vs_service *svc;
372
373 /* Check for "full" addressed entries */
374 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
375
376 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
377 if ((svc->addr == vaddr)
378 && (svc->port == vport)
379 && (svc->protocol == protocol)) {
380 /* HIT */
381 atomic_inc(&svc->usecnt);
382 return svc;
383 }
384 }
385
386 return NULL;
387}
388
389
390/*
391 * Get service by {fwmark} in the service table.
392 */
393static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
394{
395 unsigned hash;
396 struct ip_vs_service *svc;
397
398 /* Check for fwmark addressed entries */
399 hash = ip_vs_svc_fwm_hashkey(fwmark);
400
401 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
402 if (svc->fwmark == fwmark) {
403 /* HIT */
404 atomic_inc(&svc->usecnt);
405 return svc;
406 }
407 }
408
409 return NULL;
410}
411
412struct ip_vs_service *
413ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
414{
415 struct ip_vs_service *svc;
416
417 read_lock(&__ip_vs_svc_lock);
418
419 /*
420 * Check the table hashed by fwmark first
421 */
422 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
423 goto out;
424
425 /*
426 * Check the table hashed by <protocol,addr,port>
427 * for "full" addressed entries
428 */
429 svc = __ip_vs_service_get(protocol, vaddr, vport);
430
431 if (svc == NULL
432 && protocol == IPPROTO_TCP
433 && atomic_read(&ip_vs_ftpsvc_counter)
434 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
435 /*
436 * Check if ftp service entry exists, the packet
437 * might belong to FTP data connections.
438 */
439 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
440 }
441
442 if (svc == NULL
443 && atomic_read(&ip_vs_nullsvc_counter)) {
444 /*
445 * Check if the catch-all port (port zero) exists
446 */
447 svc = __ip_vs_service_get(protocol, vaddr, 0);
448 }
449
450 out:
451 read_unlock(&__ip_vs_svc_lock);
452
453 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
454 fwmark, ip_vs_proto_name(protocol),
455 NIPQUAD(vaddr), ntohs(vport),
456 svc?"hit":"not hit");
457
458 return svc;
459}
460
461
462static inline void
463__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
464{
465 atomic_inc(&svc->refcnt);
466 dest->svc = svc;
467}
468
469static inline void
470__ip_vs_unbind_svc(struct ip_vs_dest *dest)
471{
472 struct ip_vs_service *svc = dest->svc;
473
474 dest->svc = NULL;
475 if (atomic_dec_and_test(&svc->refcnt))
476 kfree(svc);
477}
478
479
480/*
481 * Returns hash value for real service
482 */
483static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
484{
485 register unsigned porth = ntohs(port);
486
487 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
488 & IP_VS_RTAB_MASK;
489}
490
491/*
492 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
493 * should be called with locked tables.
494 */
495static int ip_vs_rs_hash(struct ip_vs_dest *dest)
496{
497 unsigned hash;
498
499 if (!list_empty(&dest->d_list)) {
500 return 0;
501 }
502
503 /*
504 * Hash by proto,addr,port,
505 * which are the parameters of the real service.
506 */
507 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
508 list_add(&dest->d_list, &ip_vs_rtable[hash]);
509
510 return 1;
511}
512
513/*
514 * UNhashes ip_vs_dest from ip_vs_rtable.
515 * should be called with locked tables.
516 */
517static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
518{
519 /*
520 * Remove it from the ip_vs_rtable table.
521 */
522 if (!list_empty(&dest->d_list)) {
523 list_del(&dest->d_list);
524 INIT_LIST_HEAD(&dest->d_list);
525 }
526
527 return 1;
528}
529
530/*
531 * Lookup real service by <proto,addr,port> in the real service table.
532 */
533struct ip_vs_dest *
534ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
535{
536 unsigned hash;
537 struct ip_vs_dest *dest;
538
539 /*
540 * Check for "full" addressed entries
541 * Return the first found entry
542 */
543 hash = ip_vs_rs_hashkey(daddr, dport);
544
545 read_lock(&__ip_vs_rs_lock);
546 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
547 if ((dest->addr == daddr)
548 && (dest->port == dport)
549 && ((dest->protocol == protocol) ||
550 dest->vfwmark)) {
551 /* HIT */
552 read_unlock(&__ip_vs_rs_lock);
553 return dest;
554 }
555 }
556 read_unlock(&__ip_vs_rs_lock);
557
558 return NULL;
559}
560
561/*
562 * Lookup destination by {addr,port} in the given service
563 */
564static struct ip_vs_dest *
565ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
566{
567 struct ip_vs_dest *dest;
568
569 /*
570 * Find the destination for the given service
571 */
572 list_for_each_entry(dest, &svc->destinations, n_list) {
573 if ((dest->addr == daddr) && (dest->port == dport)) {
574 /* HIT */
575 return dest;
576 }
577 }
578
579 return NULL;
580}
581
582
583/*
584 * Lookup dest by {svc,addr,port} in the destination trash.
585 * The destination trash is used to hold the destinations that are removed
586 * from the service table but are still referenced by some conn entries.
 587 * The reason to add the destination trash is that when the dest is temporarily
 588 * down (either by the administrator or by a monitor program), the dest can be
589 * picked back from the trash, the remaining connections to the dest can
590 * continue, and the counting information of the dest is also useful for
591 * scheduling.
592 */
593static struct ip_vs_dest *
594ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
595{
596 struct ip_vs_dest *dest, *nxt;
597
598 /*
599 * Find the destination in trash
600 */
601 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
602 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
603 "dest->refcnt=%d\n",
604 dest->vfwmark,
605 NIPQUAD(dest->addr), ntohs(dest->port),
606 atomic_read(&dest->refcnt));
607 if (dest->addr == daddr &&
608 dest->port == dport &&
609 dest->vfwmark == svc->fwmark &&
610 dest->protocol == svc->protocol &&
611 (svc->fwmark ||
612 (dest->vaddr == svc->addr &&
613 dest->vport == svc->port))) {
614 /* HIT */
615 return dest;
616 }
617
618 /*
619 * Try to purge the destination from trash if not referenced
620 */
621 if (atomic_read(&dest->refcnt) == 1) {
622 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
623 "from trash\n",
624 dest->vfwmark,
625 NIPQUAD(dest->addr), ntohs(dest->port));
626 list_del(&dest->n_list);
627 ip_vs_dst_reset(dest);
628 __ip_vs_unbind_svc(dest);
629 kfree(dest);
630 }
631 }
632
633 return NULL;
634}
635
636
637/*
638 * Clean up all the destinations in the trash
639 * Called by the ip_vs_control_cleanup()
640 *
 641 * When the ip_vs_control_cleanup is activated by ipvs module exit,
642 * the service tables must have been flushed and all the connections
643 * are expired, and the refcnt of each destination in the trash must
644 * be 1, so we simply release them here.
645 */
646static void ip_vs_trash_cleanup(void)
647{
648 struct ip_vs_dest *dest, *nxt;
649
650 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
651 list_del(&dest->n_list);
652 ip_vs_dst_reset(dest);
653 __ip_vs_unbind_svc(dest);
654 kfree(dest);
655 }
656}
657
658
659static void
660ip_vs_zero_stats(struct ip_vs_stats *stats)
661{
662 spin_lock_bh(&stats->lock);
663 memset(stats, 0, (char *)&stats->lock - (char *)stats);
664 spin_unlock_bh(&stats->lock);
665 ip_vs_zero_estimator(stats);
666}
667
668/*
669 * Update a destination in the given service
670 */
671static void
672__ip_vs_update_dest(struct ip_vs_service *svc,
673 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
674{
675 int conn_flags;
676
677 /* set the weight and the flags */
678 atomic_set(&dest->weight, udest->weight);
679 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
680
681 /* check if local node and update the flags */
682 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
683 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
684 | IP_VS_CONN_F_LOCALNODE;
685 }
686
687 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
688 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
689 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
690 } else {
691 /*
692 * Put the real service in ip_vs_rtable if not present.
693 * For now only for NAT!
694 */
695 write_lock_bh(&__ip_vs_rs_lock);
696 ip_vs_rs_hash(dest);
697 write_unlock_bh(&__ip_vs_rs_lock);
698 }
699 atomic_set(&dest->conn_flags, conn_flags);
700
701 /* bind the service */
702 if (!dest->svc) {
703 __ip_vs_bind_svc(dest, svc);
704 } else {
705 if (dest->svc != svc) {
706 __ip_vs_unbind_svc(dest);
707 ip_vs_zero_stats(&dest->stats);
708 __ip_vs_bind_svc(dest, svc);
709 }
710 }
711
712 /* set the dest status flags */
713 dest->flags |= IP_VS_DEST_F_AVAILABLE;
714
715 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
716 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
717 dest->u_threshold = udest->u_threshold;
718 dest->l_threshold = udest->l_threshold;
719}
720
721
722/*
723 * Create a destination for the given service
724 */
725static int
726ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
727 struct ip_vs_dest **dest_p)
728{
729 struct ip_vs_dest *dest;
730 unsigned atype;
731
732 EnterFunction(2);
733
734 atype = inet_addr_type(udest->addr);
735 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
736 return -EINVAL;
737
738 dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
739 if (dest == NULL) {
740 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
741 return -ENOMEM;
742 }
743 memset(dest, 0, sizeof(struct ip_vs_dest));
744
745 dest->protocol = svc->protocol;
746 dest->vaddr = svc->addr;
747 dest->vport = svc->port;
748 dest->vfwmark = svc->fwmark;
749 dest->addr = udest->addr;
750 dest->port = udest->port;
751
752 atomic_set(&dest->activeconns, 0);
753 atomic_set(&dest->inactconns, 0);
754 atomic_set(&dest->persistconns, 0);
755 atomic_set(&dest->refcnt, 0);
756
757 INIT_LIST_HEAD(&dest->d_list);
758 spin_lock_init(&dest->dst_lock);
759 spin_lock_init(&dest->stats.lock);
760 __ip_vs_update_dest(svc, dest, udest);
761 ip_vs_new_estimator(&dest->stats);
762
763 *dest_p = dest;
764
765 LeaveFunction(2);
766 return 0;
767}
768
769
770/*
771 * Add a destination into an existing service
772 */
773static int
774ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
775{
776 struct ip_vs_dest *dest;
777 __u32 daddr = udest->addr;
778 __u16 dport = udest->port;
779 int ret;
780
781 EnterFunction(2);
782
783 if (udest->weight < 0) {
784 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
785 return -ERANGE;
786 }
787
788 if (udest->l_threshold > udest->u_threshold) {
789 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
790 "upper threshold\n");
791 return -ERANGE;
792 }
793
794 /*
795 * Check if the dest already exists in the list
796 */
797 dest = ip_vs_lookup_dest(svc, daddr, dport);
798 if (dest != NULL) {
799 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
800 return -EEXIST;
801 }
802
803 /*
804 * Check if the dest already exists in the trash and
805 * is from the same service
806 */
807 dest = ip_vs_trash_get_dest(svc, daddr, dport);
808 if (dest != NULL) {
809 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
810 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
811 NIPQUAD(daddr), ntohs(dport),
812 atomic_read(&dest->refcnt),
813 dest->vfwmark,
814 NIPQUAD(dest->vaddr),
815 ntohs(dest->vport));
816 __ip_vs_update_dest(svc, dest, udest);
817
818 /*
819 * Get the destination from the trash
820 */
821 list_del(&dest->n_list);
822
823 ip_vs_new_estimator(&dest->stats);
824
825 write_lock_bh(&__ip_vs_svc_lock);
826
827 /*
828 * Wait until all other svc users go away.
829 */
830 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
831
832 list_add(&dest->n_list, &svc->destinations);
833 svc->num_dests++;
834
835 /* call the update_service function of its scheduler */
836 svc->scheduler->update_service(svc);
837
838 write_unlock_bh(&__ip_vs_svc_lock);
839 return 0;
840 }
841
842 /*
843 * Allocate and initialize the dest structure
844 */
845 ret = ip_vs_new_dest(svc, udest, &dest);
846 if (ret) {
847 return ret;
848 }
849
850 /*
851 * Add the dest entry into the list
852 */
853 atomic_inc(&dest->refcnt);
854
855 write_lock_bh(&__ip_vs_svc_lock);
856
857 /*
858 * Wait until all other svc users go away.
859 */
860 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
861
862 list_add(&dest->n_list, &svc->destinations);
863 svc->num_dests++;
864
865 /* call the update_service function of its scheduler */
866 svc->scheduler->update_service(svc);
867
868 write_unlock_bh(&__ip_vs_svc_lock);
869
870 LeaveFunction(2);
871
872 return 0;
873}
874
875
876/*
877 * Edit a destination in the given service
878 */
879static int
880ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
881{
882 struct ip_vs_dest *dest;
883 __u32 daddr = udest->addr;
884 __u16 dport = udest->port;
885
886 EnterFunction(2);
887
888 if (udest->weight < 0) {
889 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
890 return -ERANGE;
891 }
892
893 if (udest->l_threshold > udest->u_threshold) {
894 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
895 "upper threshold\n");
896 return -ERANGE;
897 }
898
899 /*
900 * Lookup the destination list
901 */
902 dest = ip_vs_lookup_dest(svc, daddr, dport);
903 if (dest == NULL) {
904 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
905 return -ENOENT;
906 }
907
908 __ip_vs_update_dest(svc, dest, udest);
909
910 write_lock_bh(&__ip_vs_svc_lock);
911
912 /* Wait until all other svc users go away */
913 while (atomic_read(&svc->usecnt) > 1) {};
914
915 /* call the update_service, because server weight may be changed */
916 svc->scheduler->update_service(svc);
917
918 write_unlock_bh(&__ip_vs_svc_lock);
919
920 LeaveFunction(2);
921
922 return 0;
923}
924
925
926/*
927 * Delete a destination (must be already unlinked from the service)
928 */
929static void __ip_vs_del_dest(struct ip_vs_dest *dest)
930{
931 ip_vs_kill_estimator(&dest->stats);
932
933 /*
934 * Remove it from the d-linked list with the real services.
935 */
936 write_lock_bh(&__ip_vs_rs_lock);
937 ip_vs_rs_unhash(dest);
938 write_unlock_bh(&__ip_vs_rs_lock);
939
940 /*
941 * Decrease the refcnt of the dest, and free the dest
942 * if nobody refers to it (refcnt=0). Otherwise, throw
943 * the destination into the trash.
944 */
945 if (atomic_dec_and_test(&dest->refcnt)) {
946 ip_vs_dst_reset(dest);
947 /* simply decrease svc->refcnt here, let the caller check
948 and release the service if nobody refers to it.
949 Only user context can release destination and service,
950 and only one user context can update virtual service at a
951 time, so the operation here is OK */
952 atomic_dec(&dest->svc->refcnt);
953 kfree(dest);
954 } else {
955 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
956 "dest->refcnt=%d\n",
957 NIPQUAD(dest->addr), ntohs(dest->port),
958 atomic_read(&dest->refcnt));
959 list_add(&dest->n_list, &ip_vs_dest_trash);
960 atomic_inc(&dest->refcnt);
961 }
962}
963
964
965/*
966 * Unlink a destination from the given service
967 */
968static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
969 struct ip_vs_dest *dest,
970 int svcupd)
971{
972 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
973
974 /*
975 * Remove it from the d-linked destination list.
976 */
977 list_del(&dest->n_list);
978 svc->num_dests--;
979 if (svcupd) {
980 /*
981 * Call the update_service function of its scheduler
982 */
983 svc->scheduler->update_service(svc);
984 }
985}
986
987
988/*
989 * Delete a destination server in the given service
990 */
991static int
992ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
993{
994 struct ip_vs_dest *dest;
995 __u32 daddr = udest->addr;
996 __u16 dport = udest->port;
997
998 EnterFunction(2);
999
1000 dest = ip_vs_lookup_dest(svc, daddr, dport);
1001 if (dest == NULL) {
1002 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1003 return -ENOENT;
1004 }
1005
1006 write_lock_bh(&__ip_vs_svc_lock);
1007
1008 /*
1009 * Wait until all other svc users go away.
1010 */
1011 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1012
1013 /*
1014 * Unlink dest from the service
1015 */
1016 __ip_vs_unlink_dest(svc, dest, 1);
1017
1018 write_unlock_bh(&__ip_vs_svc_lock);
1019
1020 /*
1021 * Delete the destination
1022 */
1023 __ip_vs_del_dest(dest);
1024
1025 LeaveFunction(2);
1026
1027 return 0;
1028}
1029
1030
1031/*
1032 * Add a service into the service hash table
1033 */
1034static int
1035ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1036{
1037 int ret = 0;
1038 struct ip_vs_scheduler *sched = NULL;
1039 struct ip_vs_service *svc = NULL;
1040
1041 /* increase the module use count */
1042 ip_vs_use_count_inc();
1043
1044 /* Lookup the scheduler by 'u->sched_name' */
1045 sched = ip_vs_scheduler_get(u->sched_name);
1046 if (sched == NULL) {
1047 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1048 u->sched_name);
1049 ret = -ENOENT;
1050 goto out_mod_dec;
1051 }
1052
1053 svc = (struct ip_vs_service *)
1054 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1055 if (svc == NULL) {
1056 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1057 ret = -ENOMEM;
1058 goto out_err;
1059 }
1060 memset(svc, 0, sizeof(struct ip_vs_service));
1061
1062 /* I'm the first user of the service */
1063 atomic_set(&svc->usecnt, 1);
1064 atomic_set(&svc->refcnt, 0);
1065
1066 svc->protocol = u->protocol;
1067 svc->addr = u->addr;
1068 svc->port = u->port;
1069 svc->fwmark = u->fwmark;
1070 svc->flags = u->flags;
1071 svc->timeout = u->timeout * HZ;
1072 svc->netmask = u->netmask;
1073
1074 INIT_LIST_HEAD(&svc->destinations);
1075 rwlock_init(&svc->sched_lock);
1076 spin_lock_init(&svc->stats.lock);
1077
1078 /* Bind the scheduler */
1079 ret = ip_vs_bind_scheduler(svc, sched);
1080 if (ret)
1081 goto out_err;
1082 sched = NULL;
1083
1084 /* Update the virtual service counters */
1085 if (svc->port == FTPPORT)
1086 atomic_inc(&ip_vs_ftpsvc_counter);
1087 else if (svc->port == 0)
1088 atomic_inc(&ip_vs_nullsvc_counter);
1089
1090 ip_vs_new_estimator(&svc->stats);
1091 ip_vs_num_services++;
1092
1093 /* Hash the service into the service table */
1094 write_lock_bh(&__ip_vs_svc_lock);
1095 ip_vs_svc_hash(svc);
1096 write_unlock_bh(&__ip_vs_svc_lock);
1097
1098 *svc_p = svc;
1099 return 0;
1100
1101 out_err:
1102 if (svc != NULL) {
1103 if (svc->scheduler)
1104 ip_vs_unbind_scheduler(svc);
1105 if (svc->inc) {
1106 local_bh_disable();
1107 ip_vs_app_inc_put(svc->inc);
1108 local_bh_enable();
1109 }
1110 kfree(svc);
1111 }
1112 ip_vs_scheduler_put(sched);
1113
1114 out_mod_dec:
1115 /* decrease the module use count */
1116 ip_vs_use_count_dec();
1117
1118 return ret;
1119}
1120
1121
1122/*
1123 * Edit a service and bind it with a new scheduler
1124 */
1125static int
1126ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1127{
1128 struct ip_vs_scheduler *sched, *old_sched;
1129 int ret = 0;
1130
1131 /*
1132 * Lookup the scheduler, by 'u->sched_name'
1133 */
1134 sched = ip_vs_scheduler_get(u->sched_name);
1135 if (sched == NULL) {
1136 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1137 u->sched_name);
1138 return -ENOENT;
1139 }
1140 old_sched = sched;
1141
1142 write_lock_bh(&__ip_vs_svc_lock);
1143
1144 /*
1145 * Wait until all other svc users go away.
1146 */
1147 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1148
1149 /*
1150 * Set the flags and timeout value
1151 */
1152 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1153 svc->timeout = u->timeout * HZ;
1154 svc->netmask = u->netmask;
1155
1156 old_sched = svc->scheduler;
1157 if (sched != old_sched) {
1158 /*
1159 * Unbind the old scheduler
1160 */
1161 if ((ret = ip_vs_unbind_scheduler(svc))) {
1162 old_sched = sched;
1163 goto out;
1164 }
1165
1166 /*
1167 * Bind the new scheduler
1168 */
1169 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1170 /*
1171 * If ip_vs_bind_scheduler fails, restore the old
1172 * scheduler.
1173 * The main reason of failure is out of memory.
1174 *
1175 * The question is if the old scheduler can be
1176 * restored all the time. TODO: if it cannot be
1177 * restored some time, we must delete the service,
1178 * otherwise the system may crash.
1179 */
1180 ip_vs_bind_scheduler(svc, old_sched);
1181 old_sched = sched;
1182 goto out;
1183 }
1184 }
1185
1186 out:
1187 write_unlock_bh(&__ip_vs_svc_lock);
1188
1189 if (old_sched)
1190 ip_vs_scheduler_put(old_sched);
1191
1192 return ret;
1193}
1194
1195
1196/*
1197 * Delete a service from the service list
1198 * - The service must be unlinked, unlocked and not referenced!
1199 * - We are called under _bh lock
1200 */
1201static void __ip_vs_del_service(struct ip_vs_service *svc)
1202{
1203 struct ip_vs_dest *dest, *nxt;
1204 struct ip_vs_scheduler *old_sched;
1205
1206 ip_vs_num_services--;
1207 ip_vs_kill_estimator(&svc->stats);
1208
1209 /* Unbind scheduler */
1210 old_sched = svc->scheduler;
1211 ip_vs_unbind_scheduler(svc);
1212 if (old_sched)
1213 ip_vs_scheduler_put(old_sched);
1214
1215 /* Unbind app inc */
1216 if (svc->inc) {
1217 ip_vs_app_inc_put(svc->inc);
1218 svc->inc = NULL;
1219 }
1220
1221 /*
1222 * Unlink the whole destination list
1223 */
1224 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1225 __ip_vs_unlink_dest(svc, dest, 0);
1226 __ip_vs_del_dest(dest);
1227 }
1228
1229 /*
1230 * Update the virtual service counters
1231 */
1232 if (svc->port == FTPPORT)
1233 atomic_dec(&ip_vs_ftpsvc_counter);
1234 else if (svc->port == 0)
1235 atomic_dec(&ip_vs_nullsvc_counter);
1236
1237 /*
1238 * Free the service if nobody refers to it
1239 */
1240 if (atomic_read(&svc->refcnt) == 0)
1241 kfree(svc);
1242
1243 /* decrease the module use count */
1244 ip_vs_use_count_dec();
1245}
1246
1247/*
1248 * Delete a service from the service list
1249 */
1250static int ip_vs_del_service(struct ip_vs_service *svc)
1251{
1252 if (svc == NULL)
1253 return -EEXIST;
1254
1255 /*
1256 * Unhash it from the service table
1257 */
1258 write_lock_bh(&__ip_vs_svc_lock);
1259
1260 ip_vs_svc_unhash(svc);
1261
1262 /*
1263 * Wait until all the svc users go away.
1264 */
1265 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1266
1267 __ip_vs_del_service(svc);
1268
1269 write_unlock_bh(&__ip_vs_svc_lock);
1270
1271 return 0;
1272}
1273
1274
1275/*
1276 * Flush all the virtual services
1277 */
1278static int ip_vs_flush(void)
1279{
1280 int idx;
1281 struct ip_vs_service *svc, *nxt;
1282
1283 /*
1284 * Flush the service table hashed by <protocol,addr,port>
1285 */
1286 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1287 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1288 write_lock_bh(&__ip_vs_svc_lock);
1289 ip_vs_svc_unhash(svc);
1290 /*
1291 * Wait until all the svc users go away.
1292 */
1293 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1294 __ip_vs_del_service(svc);
1295 write_unlock_bh(&__ip_vs_svc_lock);
1296 }
1297 }
1298
1299 /*
1300 * Flush the service table hashed by fwmark
1301 */
1302 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1303 list_for_each_entry_safe(svc, nxt,
1304 &ip_vs_svc_fwm_table[idx], f_list) {
1305 write_lock_bh(&__ip_vs_svc_lock);
1306 ip_vs_svc_unhash(svc);
1307 /*
1308 * Wait until all the svc users go away.
1309 */
1310 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1311 __ip_vs_del_service(svc);
1312 write_unlock_bh(&__ip_vs_svc_lock);
1313 }
1314 }
1315
1316 return 0;
1317}
1318
1319
1320/*
1321 * Zero counters in a service or all services
1322 */
1323static int ip_vs_zero_service(struct ip_vs_service *svc)
1324{
1325 struct ip_vs_dest *dest;
1326
1327 write_lock_bh(&__ip_vs_svc_lock);
1328 list_for_each_entry(dest, &svc->destinations, n_list) {
1329 ip_vs_zero_stats(&dest->stats);
1330 }
1331 ip_vs_zero_stats(&svc->stats);
1332 write_unlock_bh(&__ip_vs_svc_lock);
1333 return 0;
1334}
1335
1336static int ip_vs_zero_all(void)
1337{
1338 int idx;
1339 struct ip_vs_service *svc;
1340
1341 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1342 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1343 ip_vs_zero_service(svc);
1344 }
1345 }
1346
1347 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1348 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1349 ip_vs_zero_service(svc);
1350 }
1351 }
1352
1353 ip_vs_zero_stats(&ip_vs_stats);
1354 return 0;
1355}
1356
1357
1358static int
1359proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1360 void __user *buffer, size_t *lenp, loff_t *ppos)
1361{
1362 int *valp = table->data;
1363 int val = *valp;
1364 int rc;
1365
1366 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1367 if (write && (*valp != val)) {
1368 if ((*valp < 0) || (*valp > 3)) {
1369 /* Restore the correct value */
1370 *valp = val;
1371 } else {
1372 update_defense_level();
1373 }
1374 }
1375 return rc;
1376}
1377
1378
1379static int
1380proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1381 void __user *buffer, size_t *lenp, loff_t *ppos)
1382{
1383 int *valp = table->data;
1384 int val[2];
1385 int rc;
1386
1387 /* backup the value first */
1388 memcpy(val, valp, sizeof(val));
1389
1390 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1391 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1392 /* Restore the correct value */
1393 memcpy(valp, val, sizeof(val));
1394 }
1395 return rc;
1396}
1397
1398
1399/*
1400 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1401 */
1402
1403static struct ctl_table vs_vars[] = {
1404 {
1405 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1406 .procname = "amemthresh",
1407 .data = &sysctl_ip_vs_amemthresh,
1408 .maxlen = sizeof(int),
1409 .mode = 0644,
1410 .proc_handler = &proc_dointvec,
1411 },
1412#ifdef CONFIG_IP_VS_DEBUG
1413 {
1414 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1415 .procname = "debug_level",
1416 .data = &sysctl_ip_vs_debug_level,
1417 .maxlen = sizeof(int),
1418 .mode = 0644,
1419 .proc_handler = &proc_dointvec,
1420 },
1421#endif
1422 {
1423 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1424 .procname = "am_droprate",
1425 .data = &sysctl_ip_vs_am_droprate,
1426 .maxlen = sizeof(int),
1427 .mode = 0644,
1428 .proc_handler = &proc_dointvec,
1429 },
1430 {
1431 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1432 .procname = "drop_entry",
1433 .data = &sysctl_ip_vs_drop_entry,
1434 .maxlen = sizeof(int),
1435 .mode = 0644,
1436 .proc_handler = &proc_do_defense_mode,
1437 },
1438 {
1439 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1440 .procname = "drop_packet",
1441 .data = &sysctl_ip_vs_drop_packet,
1442 .maxlen = sizeof(int),
1443 .mode = 0644,
1444 .proc_handler = &proc_do_defense_mode,
1445 },
1446 {
1447 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1448 .procname = "secure_tcp",
1449 .data = &sysctl_ip_vs_secure_tcp,
1450 .maxlen = sizeof(int),
1451 .mode = 0644,
1452 .proc_handler = &proc_do_defense_mode,
1453 },
1454#if 0
1455 {
1456 .ctl_name = NET_IPV4_VS_TO_ES,
1457 .procname = "timeout_established",
1458 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1459 .maxlen = sizeof(int),
1460 .mode = 0644,
1461 .proc_handler = &proc_dointvec_jiffies,
1462 },
1463 {
1464 .ctl_name = NET_IPV4_VS_TO_SS,
1465 .procname = "timeout_synsent",
1466 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1467 .maxlen = sizeof(int),
1468 .mode = 0644,
1469 .proc_handler = &proc_dointvec_jiffies,
1470 },
1471 {
1472 .ctl_name = NET_IPV4_VS_TO_SR,
1473 .procname = "timeout_synrecv",
1474 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1475 .maxlen = sizeof(int),
1476 .mode = 0644,
1477 .proc_handler = &proc_dointvec_jiffies,
1478 },
1479 {
1480 .ctl_name = NET_IPV4_VS_TO_FW,
1481 .procname = "timeout_finwait",
1482 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1483 .maxlen = sizeof(int),
1484 .mode = 0644,
1485 .proc_handler = &proc_dointvec_jiffies,
1486 },
1487 {
1488 .ctl_name = NET_IPV4_VS_TO_TW,
1489 .procname = "timeout_timewait",
1490 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1491 .maxlen = sizeof(int),
1492 .mode = 0644,
1493 .proc_handler = &proc_dointvec_jiffies,
1494 },
1495 {
1496 .ctl_name = NET_IPV4_VS_TO_CL,
1497 .procname = "timeout_close",
1498 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1499 .maxlen = sizeof(int),
1500 .mode = 0644,
1501 .proc_handler = &proc_dointvec_jiffies,
1502 },
1503 {
1504 .ctl_name = NET_IPV4_VS_TO_CW,
1505 .procname = "timeout_closewait",
1506 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1507 .maxlen = sizeof(int),
1508 .mode = 0644,
1509 .proc_handler = &proc_dointvec_jiffies,
1510 },
1511 {
1512 .ctl_name = NET_IPV4_VS_TO_LA,
1513 .procname = "timeout_lastack",
1514 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1515 .maxlen = sizeof(int),
1516 .mode = 0644,
1517 .proc_handler = &proc_dointvec_jiffies,
1518 },
1519 {
1520 .ctl_name = NET_IPV4_VS_TO_LI,
1521 .procname = "timeout_listen",
1522 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1523 .maxlen = sizeof(int),
1524 .mode = 0644,
1525 .proc_handler = &proc_dointvec_jiffies,
1526 },
1527 {
1528 .ctl_name = NET_IPV4_VS_TO_SA,
1529 .procname = "timeout_synack",
1530 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1531 .maxlen = sizeof(int),
1532 .mode = 0644,
1533 .proc_handler = &proc_dointvec_jiffies,
1534 },
1535 {
1536 .ctl_name = NET_IPV4_VS_TO_UDP,
1537 .procname = "timeout_udp",
1538 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1539 .maxlen = sizeof(int),
1540 .mode = 0644,
1541 .proc_handler = &proc_dointvec_jiffies,
1542 },
1543 {
1544 .ctl_name = NET_IPV4_VS_TO_ICMP,
1545 .procname = "timeout_icmp",
1546 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1547 .maxlen = sizeof(int),
1548 .mode = 0644,
1549 .proc_handler = &proc_dointvec_jiffies,
1550 },
1551#endif
1552 {
1553 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1554 .procname = "cache_bypass",
1555 .data = &sysctl_ip_vs_cache_bypass,
1556 .maxlen = sizeof(int),
1557 .mode = 0644,
1558 .proc_handler = &proc_dointvec,
1559 },
1560 {
1561 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1562 .procname = "expire_nodest_conn",
1563 .data = &sysctl_ip_vs_expire_nodest_conn,
1564 .maxlen = sizeof(int),
1565 .mode = 0644,
1566 .proc_handler = &proc_dointvec,
1567 },
1568 {
1569 .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1570 .procname = "expire_quiescent_template",
1571 .data = &sysctl_ip_vs_expire_quiescent_template,
1572 .maxlen = sizeof(int),
1573 .mode = 0644,
1574 .proc_handler = &proc_dointvec,
1575 },
1576 {
1577 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1578 .procname = "sync_threshold",
1579 .data = &sysctl_ip_vs_sync_threshold,
1580 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1581 .mode = 0644,
1582 .proc_handler = &proc_do_sync_threshold,
1583 },
1584 {
1585 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1586 .procname = "nat_icmp_send",
1587 .data = &sysctl_ip_vs_nat_icmp_send,
1588 .maxlen = sizeof(int),
1589 .mode = 0644,
1590 .proc_handler = &proc_dointvec,
1591 },
1592 { .ctl_name = 0 }
1593};
1594
1595static ctl_table vs_table[] = {
1596 {
1597 .ctl_name = NET_IPV4_VS,
1598 .procname = "vs",
1599 .mode = 0555,
1600 .child = vs_vars
1601 },
1602 { .ctl_name = 0 }
1603};
1604
1605static ctl_table ipvs_ipv4_table[] = {
1606 {
1607 .ctl_name = NET_IPV4,
1608 .procname = "ipv4",
1609 .mode = 0555,
1610 .child = vs_table,
1611 },
1612 { .ctl_name = 0 }
1613};
1614
1615static ctl_table vs_root_table[] = {
1616 {
1617 .ctl_name = CTL_NET,
1618 .procname = "net",
1619 .mode = 0555,
1620 .child = ipvs_ipv4_table,
1621 },
1622 { .ctl_name = 0 }
1623};
1624
1625static struct ctl_table_header * sysctl_header;
1626
1627#ifdef CONFIG_PROC_FS
1628
1629struct ip_vs_iter {
1630 struct list_head *table;
1631 int bucket;
1632};
1633
1634/*
1635 * Write the contents of the VS rule table to a PROCfs file.
1636 * (It is kept just for backward compatibility)
1637 */
1638static inline const char *ip_vs_fwd_name(unsigned flags)
1639{
1640 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1641 case IP_VS_CONN_F_LOCALNODE:
1642 return "Local";
1643 case IP_VS_CONN_F_TUNNEL:
1644 return "Tunnel";
1645 case IP_VS_CONN_F_DROUTE:
1646 return "Route";
1647 default:
1648 return "Masq";
1649 }
1650}
1651
1652
1653/* Get the Nth entry in the two lists */
1654static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1655{
1656 struct ip_vs_iter *iter = seq->private;
1657 int idx;
1658 struct ip_vs_service *svc;
1659
1660 /* look in hash by protocol */
1661 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1662 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1663 if (pos-- == 0){
1664 iter->table = ip_vs_svc_table;
1665 iter->bucket = idx;
1666 return svc;
1667 }
1668 }
1669 }
1670
1671 /* keep looking in fwmark */
1672 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1673 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1674 if (pos-- == 0) {
1675 iter->table = ip_vs_svc_fwm_table;
1676 iter->bucket = idx;
1677 return svc;
1678 }
1679 }
1680 }
1681
1682 return NULL;
1683}
1684
1685static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1686{
1687
1688 read_lock_bh(&__ip_vs_svc_lock);
1689 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1690}
1691
1692
1693static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1694{
1695 struct list_head *e;
1696 struct ip_vs_iter *iter;
1697 struct ip_vs_service *svc;
1698
1699 ++*pos;
1700 if (v == SEQ_START_TOKEN)
1701 return ip_vs_info_array(seq,0);
1702
1703 svc = v;
1704 iter = seq->private;
1705
1706 if (iter->table == ip_vs_svc_table) {
1707 /* next service in table hashed by protocol */
1708 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1709 return list_entry(e, struct ip_vs_service, s_list);
1710
1711
1712 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1713 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1714 s_list) {
1715 return svc;
1716 }
1717 }
1718
1719 iter->table = ip_vs_svc_fwm_table;
1720 iter->bucket = -1;
1721 goto scan_fwmark;
1722 }
1723
1724 /* next service in hashed by fwmark */
1725 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1726 return list_entry(e, struct ip_vs_service, f_list);
1727
1728 scan_fwmark:
1729 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1730 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1731 f_list)
1732 return svc;
1733 }
1734
1735 return NULL;
1736}
1737
1738static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1739{
1740 read_unlock_bh(&__ip_vs_svc_lock);
1741}
1742
1743
1744static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1745{
1746 if (v == SEQ_START_TOKEN) {
1747 seq_printf(seq,
1748 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1749 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1750 seq_puts(seq,
1751 "Prot LocalAddress:Port Scheduler Flags\n");
1752 seq_puts(seq,
1753 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1754 } else {
1755 const struct ip_vs_service *svc = v;
1756 const struct ip_vs_iter *iter = seq->private;
1757 const struct ip_vs_dest *dest;
1758
1759 if (iter->table == ip_vs_svc_table)
1760 seq_printf(seq, "%s %08X:%04X %s ",
1761 ip_vs_proto_name(svc->protocol),
1762 ntohl(svc->addr),
1763 ntohs(svc->port),
1764 svc->scheduler->name);
1765 else
1766 seq_printf(seq, "FWM %08X %s ",
1767 svc->fwmark, svc->scheduler->name);
1768
1769 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1770 seq_printf(seq, "persistent %d %08X\n",
1771 svc->timeout,
1772 ntohl(svc->netmask));
1773 else
1774 seq_putc(seq, '\n');
1775
1776 list_for_each_entry(dest, &svc->destinations, n_list) {
1777 seq_printf(seq,
1778 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1779 ntohl(dest->addr), ntohs(dest->port),
1780 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1781 atomic_read(&dest->weight),
1782 atomic_read(&dest->activeconns),
1783 atomic_read(&dest->inactconns));
1784 }
1785 }
1786 return 0;
1787}
1788
1789static struct seq_operations ip_vs_info_seq_ops = {
1790 .start = ip_vs_info_seq_start,
1791 .next = ip_vs_info_seq_next,
1792 .stop = ip_vs_info_seq_stop,
1793 .show = ip_vs_info_seq_show,
1794};
1795
1796static int ip_vs_info_open(struct inode *inode, struct file *file)
1797{
1798 struct seq_file *seq;
1799 int rc = -ENOMEM;
1800 struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1801
1802 if (!s)
1803 goto out;
1804
1805 rc = seq_open(file, &ip_vs_info_seq_ops);
1806 if (rc)
1807 goto out_kfree;
1808
1809 seq = file->private_data;
1810 seq->private = s;
1811 memset(s, 0, sizeof(*s));
1812out:
1813 return rc;
1814out_kfree:
1815 kfree(s);
1816 goto out;
1817}
1818
1819static struct file_operations ip_vs_info_fops = {
1820 .owner = THIS_MODULE,
1821 .open = ip_vs_info_open,
1822 .read = seq_read,
1823 .llseek = seq_lseek,
1824 .release = seq_release_private,
1825};
1826
1827#endif
1828
1829struct ip_vs_stats ip_vs_stats;
1830
1831#ifdef CONFIG_PROC_FS
1832static int ip_vs_stats_show(struct seq_file *seq, void *v)
1833{
1834
1835/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1836 seq_puts(seq,
1837 " Total Incoming Outgoing Incoming Outgoing\n");
1838 seq_printf(seq,
1839 " Conns Packets Packets Bytes Bytes\n");
1840
1841 spin_lock_bh(&ip_vs_stats.lock);
1842 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1843 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1844 (unsigned long long) ip_vs_stats.inbytes,
1845 (unsigned long long) ip_vs_stats.outbytes);
1846
1847/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1848 seq_puts(seq,
1849 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1850 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1851 ip_vs_stats.cps,
1852 ip_vs_stats.inpps,
1853 ip_vs_stats.outpps,
1854 ip_vs_stats.inbps,
1855 ip_vs_stats.outbps);
1856 spin_unlock_bh(&ip_vs_stats.lock);
1857
1858 return 0;
1859}
1860
1861static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1862{
1863 return single_open(file, ip_vs_stats_show, NULL);
1864}
1865
1866static struct file_operations ip_vs_stats_fops = {
1867 .owner = THIS_MODULE,
1868 .open = ip_vs_stats_seq_open,
1869 .read = seq_read,
1870 .llseek = seq_lseek,
1871 .release = single_release,
1872};
1873
1874#endif
1875
1876/*
1877 * Set timeout values for tcp tcpfin udp in the timeout_table.
1878 */
1879static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1880{
1881 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1882 u->tcp_timeout,
1883 u->tcp_fin_timeout,
1884 u->udp_timeout);
1885
1886#ifdef CONFIG_IP_VS_PROTO_TCP
1887 if (u->tcp_timeout) {
1888 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1889 = u->tcp_timeout * HZ;
1890 }
1891
1892 if (u->tcp_fin_timeout) {
1893 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1894 = u->tcp_fin_timeout * HZ;
1895 }
1896#endif
1897
1898#ifdef CONFIG_IP_VS_PROTO_UDP
1899 if (u->udp_timeout) {
1900 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1901 = u->udp_timeout * HZ;
1902 }
1903#endif
1904 return 0;
1905}
1906
1907
1908#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
1909#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1910#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1911 sizeof(struct ip_vs_dest_user))
1912#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1913#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
1914#define MAX_ARG_LEN SVCDEST_ARG_LEN
1915
1916static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1917 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1918 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1919 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1920 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1921 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1922 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1923 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1924 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1925 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1926 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1927 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1928};
1929
1930static int
1931do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1932{
1933 int ret;
1934 unsigned char arg[MAX_ARG_LEN];
1935 struct ip_vs_service_user *usvc;
1936 struct ip_vs_service *svc;
1937 struct ip_vs_dest_user *udest;
1938
1939 if (!capable(CAP_NET_ADMIN))
1940 return -EPERM;
1941
1942 if (len != set_arglen[SET_CMDID(cmd)]) {
1943 IP_VS_ERR("set_ctl: len %u != %u\n",
1944 len, set_arglen[SET_CMDID(cmd)]);
1945 return -EINVAL;
1946 }
1947
1948 if (copy_from_user(arg, user, len) != 0)
1949 return -EFAULT;
1950
1951 /* increase the module use count */
1952 ip_vs_use_count_inc();
1953
1954 if (mutex_lock_interruptible(&__ip_vs_mutex)) {
1955 ret = -ERESTARTSYS;
1956 goto out_dec;
1957 }
1958
1959 if (cmd == IP_VS_SO_SET_FLUSH) {
1960 /* Flush the virtual service */
1961 ret = ip_vs_flush();
1962 goto out_unlock;
1963 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1964 /* Set timeout values for (tcp tcpfin udp) */
1965 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1966 goto out_unlock;
1967 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1968 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1969 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1970 goto out_unlock;
1971 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1972 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1973 ret = stop_sync_thread(dm->state);
1974 goto out_unlock;
1975 }
1976
1977 usvc = (struct ip_vs_service_user *)arg;
1978 udest = (struct ip_vs_dest_user *)(usvc + 1);
1979
1980 if (cmd == IP_VS_SO_SET_ZERO) {
1981 /* if no service address is set, zero counters in all */
1982 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1983 ret = ip_vs_zero_all();
1984 goto out_unlock;
1985 }
1986 }
1987
1988 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1989 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1990 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1991 usvc->protocol, NIPQUAD(usvc->addr),
1992 ntohs(usvc->port), usvc->sched_name);
1993 ret = -EFAULT;
1994 goto out_unlock;
1995 }
1996
1997 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1998 if (usvc->fwmark == 0)
1999 svc = __ip_vs_service_get(usvc->protocol,
2000 usvc->addr, usvc->port);
2001 else
2002 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
2003
2004 if (cmd != IP_VS_SO_SET_ADD
2005 && (svc == NULL || svc->protocol != usvc->protocol)) {
2006 ret = -ESRCH;
2007 goto out_unlock;
2008 }
2009
2010 switch (cmd) {
2011 case IP_VS_SO_SET_ADD:
2012 if (svc != NULL)
2013 ret = -EEXIST;
2014 else
2015 ret = ip_vs_add_service(usvc, &svc);
2016 break;
2017 case IP_VS_SO_SET_EDIT:
2018 ret = ip_vs_edit_service(svc, usvc);
2019 break;
2020 case IP_VS_SO_SET_DEL:
2021 ret = ip_vs_del_service(svc);
2022 if (!ret)
2023 goto out_unlock;
2024 break;
2025 case IP_VS_SO_SET_ZERO:
2026 ret = ip_vs_zero_service(svc);
2027 break;
2028 case IP_VS_SO_SET_ADDDEST:
2029 ret = ip_vs_add_dest(svc, udest);
2030 break;
2031 case IP_VS_SO_SET_EDITDEST:
2032 ret = ip_vs_edit_dest(svc, udest);
2033 break;
2034 case IP_VS_SO_SET_DELDEST:
2035 ret = ip_vs_del_dest(svc, udest);
2036 break;
2037 default:
2038 ret = -EINVAL;
2039 }
2040
2041 if (svc)
2042 ip_vs_service_put(svc);
2043
2044 out_unlock:
2045 mutex_unlock(&__ip_vs_mutex);
2046 out_dec:
2047 /* decrease the module use count */
2048 ip_vs_use_count_dec();
2049
2050 return ret;
2051}
2052
2053
2054static void
2055ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2056{
2057 spin_lock_bh(&src->lock);
2058 memcpy(dst, src, (char*)&src->lock - (char*)src);
2059 spin_unlock_bh(&src->lock);
2060}
2061
2062static void
2063ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2064{
2065 dst->protocol = src->protocol;
2066 dst->addr = src->addr;
2067 dst->port = src->port;
2068 dst->fwmark = src->fwmark;
2069 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2070 dst->flags = src->flags;
2071 dst->timeout = src->timeout / HZ;
2072 dst->netmask = src->netmask;
2073 dst->num_dests = src->num_dests;
2074 ip_vs_copy_stats(&dst->stats, &src->stats);
2075}
2076
2077static inline int
2078__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2079 struct ip_vs_get_services __user *uptr)
2080{
2081 int idx, count=0;
2082 struct ip_vs_service *svc;
2083 struct ip_vs_service_entry entry;
2084 int ret = 0;
2085
2086 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2087 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2088 if (count >= get->num_services)
2089 goto out;
2090 memset(&entry, 0, sizeof(entry));
2091 ip_vs_copy_service(&entry, svc);
2092 if (copy_to_user(&uptr->entrytable[count],
2093 &entry, sizeof(entry))) {
2094 ret = -EFAULT;
2095 goto out;
2096 }
2097 count++;
2098 }
2099 }
2100
2101 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2102 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2103 if (count >= get->num_services)
2104 goto out;
2105 memset(&entry, 0, sizeof(entry));
2106 ip_vs_copy_service(&entry, svc);
2107 if (copy_to_user(&uptr->entrytable[count],
2108 &entry, sizeof(entry))) {
2109 ret = -EFAULT;
2110 goto out;
2111 }
2112 count++;
2113 }
2114 }
2115 out:
2116 return ret;
2117}
2118
2119static inline int
2120__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2121 struct ip_vs_get_dests __user *uptr)
2122{
2123 struct ip_vs_service *svc;
2124 int ret = 0;
2125
2126 if (get->fwmark)
2127 svc = __ip_vs_svc_fwm_get(get->fwmark);
2128 else
2129 svc = __ip_vs_service_get(get->protocol,
2130 get->addr, get->port);
2131 if (svc) {
2132 int count = 0;
2133 struct ip_vs_dest *dest;
2134 struct ip_vs_dest_entry entry;
2135
		/* Clear once up front: the assignments below do not cover
		 * possible padding in the entry that is copied to userspace. */
		memset(&entry, 0, sizeof(entry));
2136 list_for_each_entry(dest, &svc->destinations, n_list) {
2137 if (count >= get->num_dests)
2138 break;
2139
2140 entry.addr = dest->addr;
2141 entry.port = dest->port;
2142 entry.conn_flags = atomic_read(&dest->conn_flags);
2143 entry.weight = atomic_read(&dest->weight);
2144 entry.u_threshold = dest->u_threshold;
2145 entry.l_threshold = dest->l_threshold;
2146 entry.activeconns = atomic_read(&dest->activeconns);
2147 entry.inactconns = atomic_read(&dest->inactconns);
2148 entry.persistconns = atomic_read(&dest->persistconns);
2149 ip_vs_copy_stats(&entry.stats, &dest->stats);
2150 if (copy_to_user(&uptr->entrytable[count],
2151 &entry, sizeof(entry))) {
2152 ret = -EFAULT;
2153 break;
2154 }
2155 count++;
2156 }
2157 ip_vs_service_put(svc);
2158 } else
2159 ret = -ESRCH;
2160 return ret;
2161}
2162
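/*
 * Report the currently configured connection timeouts, in seconds, for
 * the protocols that are compiled in (TCP established, TCP FIN_WAIT,
 * UDP).
 */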
2163static inline void
2164__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2165{
2166#ifdef CONFIG_IP_VS_PROTO_TCP
2167 u->tcp_timeout =
2168 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2169 u->tcp_fin_timeout =
2170 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2171#endif
2172#ifdef CONFIG_IP_VS_PROTO_UDP
2173 u->udp_timeout =
2174 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2175#endif
2176}
2177
2178
2179#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2180#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2181#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2182#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2183#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2184#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2185#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2186
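/*
 * Minimum argument length, indexed by GET command, that userspace must
 * supply; do_ip_vs_get_ctl() rejects shorter buffers before copying the
 * argument in.
 */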
2187static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2188	[GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2189 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2190 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2191 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2192 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2193 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2194 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2195};
2196
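/*
 * Handle all IP_VS_SO_GET_* getsockopt() commands.  Requires
 * CAP_NET_ADMIN and serializes against the SET path through
 * __ip_vs_mutex.
 */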
2197static int
2198do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2199{
2200 unsigned char arg[128];
2201 int ret = 0;
2202
2203 if (!capable(CAP_NET_ADMIN))
2204 return -EPERM;
2205
2206 if (*len < get_arglen[GET_CMDID(cmd)]) {
2207 IP_VS_ERR("get_ctl: len %u < %u\n",
2208 *len, get_arglen[GET_CMDID(cmd)]);
2209 return -EINVAL;
2210 }
2211
2212 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2213 return -EFAULT;
2214
2215	if (mutex_lock_interruptible(&__ip_vs_mutex))
2216		return -ERESTARTSYS;
2217
2218 switch (cmd) {
2219 case IP_VS_SO_GET_VERSION:
2220 {
2221 char buf[64];
2222
2223 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2224 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2225 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2226 ret = -EFAULT;
2227 goto out;
2228 }
2229 *len = strlen(buf)+1;
2230 }
2231 break;
2232
2233 case IP_VS_SO_GET_INFO:
2234 {
2235 struct ip_vs_getinfo info;
2236 info.version = IP_VS_VERSION_CODE;
2237 info.size = IP_VS_CONN_TAB_SIZE;
2238 info.num_services = ip_vs_num_services;
2239 if (copy_to_user(user, &info, sizeof(info)) != 0)
2240 ret = -EFAULT;
2241 }
2242 break;
2243
2244 case IP_VS_SO_GET_SERVICES:
2245 {
2246 struct ip_vs_get_services *get;
2247 int size;
2248
2249 get = (struct ip_vs_get_services *)arg;
2250 size = sizeof(*get) +
2251 sizeof(struct ip_vs_service_entry) * get->num_services;
2252 if (*len != size) {
2253 IP_VS_ERR("length: %u != %u\n", *len, size);
2254 ret = -EINVAL;
2255 goto out;
2256 }
2257 ret = __ip_vs_get_service_entries(get, user);
2258 }
2259 break;
2260
2261 case IP_VS_SO_GET_SERVICE:
2262 {
2263 struct ip_vs_service_entry *entry;
2264 struct ip_vs_service *svc;
2265
2266 entry = (struct ip_vs_service_entry *)arg;
2267 if (entry->fwmark)
2268 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2269 else
2270 svc = __ip_vs_service_get(entry->protocol,
2271 entry->addr, entry->port);
2272 if (svc) {
2273 ip_vs_copy_service(entry, svc);
2274 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2275 ret = -EFAULT;
2276 ip_vs_service_put(svc);
2277 } else
2278 ret = -ESRCH;
2279 }
2280 break;
2281
2282 case IP_VS_SO_GET_DESTS:
2283 {
2284 struct ip_vs_get_dests *get;
2285 int size;
2286
2287 get = (struct ip_vs_get_dests *)arg;
2288 size = sizeof(*get) +
2289 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2290 if (*len != size) {
2291 IP_VS_ERR("length: %u != %u\n", *len, size);
2292 ret = -EINVAL;
2293 goto out;
2294 }
2295 ret = __ip_vs_get_dest_entries(get, user);
2296 }
2297 break;
2298
2299 case IP_VS_SO_GET_TIMEOUT:
2300 {
2301 struct ip_vs_timeout_user t;
2302
		/* Clear first: __ip_vs_get_timeouts() only fills the fields
		 * for protocols that are compiled in, and the whole struct
		 * is copied to userspace below. */
		memset(&t, 0, sizeof(t));
2303		__ip_vs_get_timeouts(&t);
2304 if (copy_to_user(user, &t, sizeof(t)) != 0)
2305 ret = -EFAULT;
2306 }
2307 break;
2308
2309 case IP_VS_SO_GET_DAEMON:
2310 {
2311 struct ip_vs_daemon_user d[2];
2312
2313 memset(&d, 0, sizeof(d));
2314 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2315 d[0].state = IP_VS_STATE_MASTER;
2316			strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2317			d[0].syncid = ip_vs_master_syncid;
2318 }
2319 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2320 d[1].state = IP_VS_STATE_BACKUP;
2321			strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2322			d[1].syncid = ip_vs_backup_syncid;
2323 }
2324 if (copy_to_user(user, &d, sizeof(d)) != 0)
2325 ret = -EFAULT;
2326 }
2327 break;
2328
2329 default:
2330 ret = -EINVAL;
2331 }
2332
2333 out:
2334	mutex_unlock(&__ip_vs_mutex);
2335	return ret;
2336}
2337
2338
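/*
 * Registration of the IPVS sockopt range with netfilter: SET commands
 * are dispatched to do_ip_vs_set_ctl(), GET commands to
 * do_ip_vs_get_ctl().
 */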
2339static struct nf_sockopt_ops ip_vs_sockopts = {
2340 .pf = PF_INET,
2341 .set_optmin = IP_VS_BASE_CTL,
2342 .set_optmax = IP_VS_SO_SET_MAX+1,
2343 .set = do_ip_vs_set_ctl,
2344 .get_optmin = IP_VS_BASE_CTL,
2345 .get_optmax = IP_VS_SO_GET_MAX+1,
2346 .get = do_ip_vs_get_ctl,
2347};
2348
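/*
 * Userspace (e.g. ipvsadm) reaches the handlers above via [gs]etsockopt()
 * on a PF_INET socket.  A minimal sketch of the GET side -- illustrative
 * only, assuming the IPVS sockopt definitions are available from the
 * ipvsadm headers or, on kernels that export them, <linux/ip_vs.h>:
 *
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *	char buf[64];
 *	socklen_t len = sizeof(buf);
 *
 *	if (fd >= 0 &&
 *	    getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_VERSION, buf, &len) == 0)
 *		printf("%s\n", buf);
 *
 * The caller needs CAP_NET_ADMIN, matching the check in
 * do_ip_vs_get_ctl().
 */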
2349
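/*
 * Register the sockopt interface, the /proc/net entries and the sysctl
 * table, initialise the service and real-server hash tables and the
 * global stats, and start the periodic defense work.
 */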
2350int ip_vs_control_init(void)
2351{
2352 int ret;
2353 int idx;
2354
2355 EnterFunction(2);
2356
2357 ret = nf_register_sockopt(&ip_vs_sockopts);
2358 if (ret) {
2359 IP_VS_ERR("cannot register sockopt.\n");
2360 return ret;
2361 }
2362
2363 proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2364	proc_net_fops_create("ip_vs_stats", 0, &ip_vs_stats_fops);
2365
2366 sysctl_header = register_sysctl_table(vs_root_table, 0);
2367
2368 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2369 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2370 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2371 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2372 }
2373 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2374 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2375 }
2376
2377 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2378 spin_lock_init(&ip_vs_stats.lock);
2379 ip_vs_new_estimator(&ip_vs_stats);
2380
2381	/* Schedule the periodic defense work */
2382 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2383
2384 LeaveFunction(2);
2385 return 0;
2386}
2387
2388
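/*
 * Undo ip_vs_control_init(): flush the destination trash, stop the
 * defense work, kill the stats estimator, and unregister the sysctl,
 * /proc and sockopt interfaces.
 */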
2389void ip_vs_control_cleanup(void)
2390{
2391 EnterFunction(2);
2392 ip_vs_trash_cleanup();
2393 cancel_rearming_delayed_work(&defense_work);
2394 ip_vs_kill_estimator(&ip_vs_stats);
2395 unregister_sysctl_table(sysctl_header);
2396 proc_net_remove("ip_vs_stats");
2397 proc_net_remove("ip_vs");
2398 nf_unregister_sockopt(&ip_vs_sockopts);
2399 LeaveFunction(2);
2400}