blob: 7f0288b25fa16e77e017665e8788a0d20d98c5d9 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * Changes:
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/init.h>
25#include <linux/types.h>
Randy Dunlap4fc268d2006-01-11 12:17:47 -080026#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070027#include <linux/fs.h>
28#include <linux/sysctl.h>
29#include <linux/proc_fs.h>
30#include <linux/workqueue.h>
31#include <linux/swap.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34
35#include <linux/netfilter.h>
36#include <linux/netfilter_ipv4.h>
37
38#include <net/ip.h>
Arnaldo Carvalho de Melo14c85022005-12-27 02:43:12 -020039#include <net/route.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070040#include <net/sock.h>
41
42#include <asm/uaccess.h>
43
44#include <net/ip_vs.h>
45
46/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
47static DECLARE_MUTEX(__ip_vs_mutex);
48
49/* lock for service table */
50static DEFINE_RWLOCK(__ip_vs_svc_lock);
51
52/* lock for table with the real services */
53static DEFINE_RWLOCK(__ip_vs_rs_lock);
54
55/* lock for state and timeout tables */
56static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
57
58/* lock for drop entry handling */
59static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
60
61/* lock for drop packet handling */
62static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
63
64/* 1/rate drop and drop-entry variables */
65int ip_vs_drop_rate = 0;
66int ip_vs_drop_counter = 0;
67static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
68
69/* number of virtual services */
70static int ip_vs_num_services = 0;
71
72/* sysctl variables */
73static int sysctl_ip_vs_drop_entry = 0;
74static int sysctl_ip_vs_drop_packet = 0;
75static int sysctl_ip_vs_secure_tcp = 0;
76static int sysctl_ip_vs_amemthresh = 1024;
77static int sysctl_ip_vs_am_droprate = 10;
78int sysctl_ip_vs_cache_bypass = 0;
79int sysctl_ip_vs_expire_nodest_conn = 0;
80int sysctl_ip_vs_expire_quiescent_template = 0;
81int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
82int sysctl_ip_vs_nat_icmp_send = 0;
83
84
#ifdef CONFIG_IP_VS_DEBUG
/* Debug verbosity; only compiled in when CONFIG_IP_VS_DEBUG is set. */
static int sysctl_ip_vs_debug_level;

/*
 *	Return the current value of the IPVS debug level variable.
 */
int ip_vs_get_debug_level(void)
{
	int level = sysctl_ip_vs_debug_level;

	return level;
}
#endif /* CONFIG_IP_VS_DEBUG */
93
94/*
Julian Anastasovaf9debd2005-07-11 20:59:57 -070095 * update_defense_level is called from keventd and from sysctl,
96 * so it needs to protect itself from softirqs
Linus Torvalds1da177e2005-04-16 15:20:36 -070097 */
98static void update_defense_level(void)
99{
100 struct sysinfo i;
101 static int old_secure_tcp = 0;
102 int availmem;
103 int nomem;
104 int to_change = -1;
105
106 /* we only count free and buffered memory (in pages) */
107 si_meminfo(&i);
108 availmem = i.freeram + i.bufferram;
109 /* however in linux 2.5 the i.bufferram is total page cache size,
110 we need adjust it */
111 /* si_swapinfo(&i); */
112 /* availmem = availmem - (i.totalswap - i.freeswap); */
113
114 nomem = (availmem < sysctl_ip_vs_amemthresh);
115
Julian Anastasovaf9debd2005-07-11 20:59:57 -0700116 local_bh_disable();
117
Linus Torvalds1da177e2005-04-16 15:20:36 -0700118 /* drop_entry */
119 spin_lock(&__ip_vs_dropentry_lock);
120 switch (sysctl_ip_vs_drop_entry) {
121 case 0:
122 atomic_set(&ip_vs_dropentry, 0);
123 break;
124 case 1:
125 if (nomem) {
126 atomic_set(&ip_vs_dropentry, 1);
127 sysctl_ip_vs_drop_entry = 2;
128 } else {
129 atomic_set(&ip_vs_dropentry, 0);
130 }
131 break;
132 case 2:
133 if (nomem) {
134 atomic_set(&ip_vs_dropentry, 1);
135 } else {
136 atomic_set(&ip_vs_dropentry, 0);
137 sysctl_ip_vs_drop_entry = 1;
138 };
139 break;
140 case 3:
141 atomic_set(&ip_vs_dropentry, 1);
142 break;
143 }
144 spin_unlock(&__ip_vs_dropentry_lock);
145
146 /* drop_packet */
147 spin_lock(&__ip_vs_droppacket_lock);
148 switch (sysctl_ip_vs_drop_packet) {
149 case 0:
150 ip_vs_drop_rate = 0;
151 break;
152 case 1:
153 if (nomem) {
154 ip_vs_drop_rate = ip_vs_drop_counter
155 = sysctl_ip_vs_amemthresh /
156 (sysctl_ip_vs_amemthresh-availmem);
157 sysctl_ip_vs_drop_packet = 2;
158 } else {
159 ip_vs_drop_rate = 0;
160 }
161 break;
162 case 2:
163 if (nomem) {
164 ip_vs_drop_rate = ip_vs_drop_counter
165 = sysctl_ip_vs_amemthresh /
166 (sysctl_ip_vs_amemthresh-availmem);
167 } else {
168 ip_vs_drop_rate = 0;
169 sysctl_ip_vs_drop_packet = 1;
170 }
171 break;
172 case 3:
173 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
174 break;
175 }
176 spin_unlock(&__ip_vs_droppacket_lock);
177
178 /* secure_tcp */
179 write_lock(&__ip_vs_securetcp_lock);
180 switch (sysctl_ip_vs_secure_tcp) {
181 case 0:
182 if (old_secure_tcp >= 2)
183 to_change = 0;
184 break;
185 case 1:
186 if (nomem) {
187 if (old_secure_tcp < 2)
188 to_change = 1;
189 sysctl_ip_vs_secure_tcp = 2;
190 } else {
191 if (old_secure_tcp >= 2)
192 to_change = 0;
193 }
194 break;
195 case 2:
196 if (nomem) {
197 if (old_secure_tcp < 2)
198 to_change = 1;
199 } else {
200 if (old_secure_tcp >= 2)
201 to_change = 0;
202 sysctl_ip_vs_secure_tcp = 1;
203 }
204 break;
205 case 3:
206 if (old_secure_tcp < 2)
207 to_change = 1;
208 break;
209 }
210 old_secure_tcp = sysctl_ip_vs_secure_tcp;
211 if (to_change >= 0)
212 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
213 write_unlock(&__ip_vs_securetcp_lock);
Julian Anastasovaf9debd2005-07-11 20:59:57 -0700214
215 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700216}
217
218
/*
 *	Timer for checking the defense
 *
 *	The period is parenthesised so that the macro expands to a single
 *	operand wherever it is used (e.g. "x / DEFENSE_TIMER_PERIOD" would
 *	otherwise parse as "(x / 1) * HZ").
 */
#define DEFENSE_TIMER_PERIOD	(1*HZ)
static void defense_work_handler(void *data);
static DECLARE_WORK(defense_work, defense_work_handler, NULL);
225
226static void defense_work_handler(void *data)
227{
228 update_defense_level();
229 if (atomic_read(&ip_vs_dropentry))
230 ip_vs_random_dropentry();
231
232 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
233}
234
235int
236ip_vs_use_count_inc(void)
237{
238 return try_module_get(THIS_MODULE);
239}
240
241void
242ip_vs_use_count_dec(void)
243{
244 module_put(THIS_MODULE);
245}
246
247
248/*
249 * Hash table: for virtual service lookups
250 */
251#define IP_VS_SVC_TAB_BITS 8
252#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
253#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
254
255/* the service table hashed by <protocol, addr, port> */
256static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
257/* the service table hashed by fwmark */
258static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
259
260/*
261 * Hash table: for real service lookups
262 */
263#define IP_VS_RTAB_BITS 4
264#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
265#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
266
267static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
268
269/*
270 * Trash for destinations
271 */
272static LIST_HEAD(ip_vs_dest_trash);
273
274/*
275 * FTP & NULL virtual service counters
276 */
277static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
278static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
279
280
281/*
282 * Returns hash value for virtual service
283 */
284static __inline__ unsigned
285ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
286{
287 register unsigned porth = ntohs(port);
288
289 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
290 & IP_VS_SVC_TAB_MASK;
291}
292
293/*
294 * Returns hash value of fwmark for virtual service lookup
295 */
296static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
297{
298 return fwmark & IP_VS_SVC_TAB_MASK;
299}
300
301/*
302 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
303 * or in the ip_vs_svc_fwm_table by fwmark.
304 * Should be called with locked tables.
305 */
306static int ip_vs_svc_hash(struct ip_vs_service *svc)
307{
308 unsigned hash;
309
310 if (svc->flags & IP_VS_SVC_F_HASHED) {
311 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
312 "called from %p\n", __builtin_return_address(0));
313 return 0;
314 }
315
316 if (svc->fwmark == 0) {
317 /*
318 * Hash it by <protocol,addr,port> in ip_vs_svc_table
319 */
320 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
321 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
322 } else {
323 /*
324 * Hash it by fwmark in ip_vs_svc_fwm_table
325 */
326 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
327 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
328 }
329
330 svc->flags |= IP_VS_SVC_F_HASHED;
331 /* increase its refcnt because it is referenced by the svc table */
332 atomic_inc(&svc->refcnt);
333 return 1;
334}
335
336
337/*
338 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
339 * Should be called with locked tables.
340 */
341static int ip_vs_svc_unhash(struct ip_vs_service *svc)
342{
343 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
344 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
345 "called from %p\n", __builtin_return_address(0));
346 return 0;
347 }
348
349 if (svc->fwmark == 0) {
350 /* Remove it from the ip_vs_svc_table table */
351 list_del(&svc->s_list);
352 } else {
353 /* Remove it from the ip_vs_svc_fwm_table table */
354 list_del(&svc->f_list);
355 }
356
357 svc->flags &= ~IP_VS_SVC_F_HASHED;
358 atomic_dec(&svc->refcnt);
359 return 1;
360}
361
362
363/*
364 * Get service by {proto,addr,port} in the service table.
365 */
366static __inline__ struct ip_vs_service *
367__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
368{
369 unsigned hash;
370 struct ip_vs_service *svc;
371
372 /* Check for "full" addressed entries */
373 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
374
375 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
376 if ((svc->addr == vaddr)
377 && (svc->port == vport)
378 && (svc->protocol == protocol)) {
379 /* HIT */
380 atomic_inc(&svc->usecnt);
381 return svc;
382 }
383 }
384
385 return NULL;
386}
387
388
389/*
390 * Get service by {fwmark} in the service table.
391 */
392static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
393{
394 unsigned hash;
395 struct ip_vs_service *svc;
396
397 /* Check for fwmark addressed entries */
398 hash = ip_vs_svc_fwm_hashkey(fwmark);
399
400 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
401 if (svc->fwmark == fwmark) {
402 /* HIT */
403 atomic_inc(&svc->usecnt);
404 return svc;
405 }
406 }
407
408 return NULL;
409}
410
411struct ip_vs_service *
412ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
413{
414 struct ip_vs_service *svc;
415
416 read_lock(&__ip_vs_svc_lock);
417
418 /*
419 * Check the table hashed by fwmark first
420 */
421 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
422 goto out;
423
424 /*
425 * Check the table hashed by <protocol,addr,port>
426 * for "full" addressed entries
427 */
428 svc = __ip_vs_service_get(protocol, vaddr, vport);
429
430 if (svc == NULL
431 && protocol == IPPROTO_TCP
432 && atomic_read(&ip_vs_ftpsvc_counter)
433 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
434 /*
435 * Check if ftp service entry exists, the packet
436 * might belong to FTP data connections.
437 */
438 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
439 }
440
441 if (svc == NULL
442 && atomic_read(&ip_vs_nullsvc_counter)) {
443 /*
444 * Check if the catch-all port (port zero) exists
445 */
446 svc = __ip_vs_service_get(protocol, vaddr, 0);
447 }
448
449 out:
450 read_unlock(&__ip_vs_svc_lock);
451
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800452 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700453 fwmark, ip_vs_proto_name(protocol),
454 NIPQUAD(vaddr), ntohs(vport),
455 svc?"hit":"not hit");
456
457 return svc;
458}
459
460
461static inline void
462__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
463{
464 atomic_inc(&svc->refcnt);
465 dest->svc = svc;
466}
467
468static inline void
469__ip_vs_unbind_svc(struct ip_vs_dest *dest)
470{
471 struct ip_vs_service *svc = dest->svc;
472
473 dest->svc = NULL;
474 if (atomic_dec_and_test(&svc->refcnt))
475 kfree(svc);
476}
477
478
479/*
480 * Returns hash value for real service
481 */
482static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
483{
484 register unsigned porth = ntohs(port);
485
486 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
487 & IP_VS_RTAB_MASK;
488}
489
490/*
491 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
492 * should be called with locked tables.
493 */
494static int ip_vs_rs_hash(struct ip_vs_dest *dest)
495{
496 unsigned hash;
497
498 if (!list_empty(&dest->d_list)) {
499 return 0;
500 }
501
502 /*
503 * Hash by proto,addr,port,
504 * which are the parameters of the real service.
505 */
506 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
507 list_add(&dest->d_list, &ip_vs_rtable[hash]);
508
509 return 1;
510}
511
512/*
513 * UNhashes ip_vs_dest from ip_vs_rtable.
514 * should be called with locked tables.
515 */
516static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
517{
518 /*
519 * Remove it from the ip_vs_rtable table.
520 */
521 if (!list_empty(&dest->d_list)) {
522 list_del(&dest->d_list);
523 INIT_LIST_HEAD(&dest->d_list);
524 }
525
526 return 1;
527}
528
529/*
530 * Lookup real service by <proto,addr,port> in the real service table.
531 */
532struct ip_vs_dest *
533ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
534{
535 unsigned hash;
536 struct ip_vs_dest *dest;
537
538 /*
539 * Check for "full" addressed entries
540 * Return the first found entry
541 */
542 hash = ip_vs_rs_hashkey(daddr, dport);
543
544 read_lock(&__ip_vs_rs_lock);
545 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
546 if ((dest->addr == daddr)
547 && (dest->port == dport)
548 && ((dest->protocol == protocol) ||
549 dest->vfwmark)) {
550 /* HIT */
551 read_unlock(&__ip_vs_rs_lock);
552 return dest;
553 }
554 }
555 read_unlock(&__ip_vs_rs_lock);
556
557 return NULL;
558}
559
560/*
561 * Lookup destination by {addr,port} in the given service
562 */
563static struct ip_vs_dest *
564ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
565{
566 struct ip_vs_dest *dest;
567
568 /*
569 * Find the destination for the given service
570 */
571 list_for_each_entry(dest, &svc->destinations, n_list) {
572 if ((dest->addr == daddr) && (dest->port == dport)) {
573 /* HIT */
574 return dest;
575 }
576 }
577
578 return NULL;
579}
580
581
582/*
583 * Lookup dest by {svc,addr,port} in the destination trash.
584 * The destination trash is used to hold the destinations that are removed
585 * from the service table but are still referenced by some conn entries.
586 * The reason to add the destination trash is when the dest is temporary
587 * down (either by administrator or by monitor program), the dest can be
588 * picked back from the trash, the remaining connections to the dest can
589 * continue, and the counting information of the dest is also useful for
590 * scheduling.
591 */
592static struct ip_vs_dest *
593ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
594{
595 struct ip_vs_dest *dest, *nxt;
596
597 /*
598 * Find the destination in trash
599 */
600 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
601 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800602 "dest->refcnt=%d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700603 dest->vfwmark,
604 NIPQUAD(dest->addr), ntohs(dest->port),
605 atomic_read(&dest->refcnt));
606 if (dest->addr == daddr &&
607 dest->port == dport &&
608 dest->vfwmark == svc->fwmark &&
609 dest->protocol == svc->protocol &&
610 (svc->fwmark ||
611 (dest->vaddr == svc->addr &&
612 dest->vport == svc->port))) {
613 /* HIT */
614 return dest;
615 }
616
617 /*
618 * Try to purge the destination from trash if not referenced
619 */
620 if (atomic_read(&dest->refcnt) == 1) {
621 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
622 "from trash\n",
623 dest->vfwmark,
624 NIPQUAD(dest->addr), ntohs(dest->port));
625 list_del(&dest->n_list);
626 ip_vs_dst_reset(dest);
627 __ip_vs_unbind_svc(dest);
628 kfree(dest);
629 }
630 }
631
632 return NULL;
633}
634
635
636/*
637 * Clean up all the destinations in the trash
638 * Called by the ip_vs_control_cleanup()
639 *
640 * When the ip_vs_control_clearup is activated by ipvs module exit,
641 * the service tables must have been flushed and all the connections
642 * are expired, and the refcnt of each destination in the trash must
643 * be 1, so we simply release them here.
644 */
645static void ip_vs_trash_cleanup(void)
646{
647 struct ip_vs_dest *dest, *nxt;
648
649 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
650 list_del(&dest->n_list);
651 ip_vs_dst_reset(dest);
652 __ip_vs_unbind_svc(dest);
653 kfree(dest);
654 }
655}
656
657
658static void
659ip_vs_zero_stats(struct ip_vs_stats *stats)
660{
661 spin_lock_bh(&stats->lock);
662 memset(stats, 0, (char *)&stats->lock - (char *)stats);
663 spin_unlock_bh(&stats->lock);
664 ip_vs_zero_estimator(stats);
665}
666
667/*
668 * Update a destination in the given service
669 */
670static void
671__ip_vs_update_dest(struct ip_vs_service *svc,
672 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
673{
674 int conn_flags;
675
676 /* set the weight and the flags */
677 atomic_set(&dest->weight, udest->weight);
678 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
679
680 /* check if local node and update the flags */
681 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
682 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
683 | IP_VS_CONN_F_LOCALNODE;
684 }
685
686 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
687 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
688 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
689 } else {
690 /*
691 * Put the real service in ip_vs_rtable if not present.
692 * For now only for NAT!
693 */
694 write_lock_bh(&__ip_vs_rs_lock);
695 ip_vs_rs_hash(dest);
696 write_unlock_bh(&__ip_vs_rs_lock);
697 }
698 atomic_set(&dest->conn_flags, conn_flags);
699
700 /* bind the service */
701 if (!dest->svc) {
702 __ip_vs_bind_svc(dest, svc);
703 } else {
704 if (dest->svc != svc) {
705 __ip_vs_unbind_svc(dest);
706 ip_vs_zero_stats(&dest->stats);
707 __ip_vs_bind_svc(dest, svc);
708 }
709 }
710
711 /* set the dest status flags */
712 dest->flags |= IP_VS_DEST_F_AVAILABLE;
713
714 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
715 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
716 dest->u_threshold = udest->u_threshold;
717 dest->l_threshold = udest->l_threshold;
718}
719
720
721/*
722 * Create a destination for the given service
723 */
724static int
725ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
726 struct ip_vs_dest **dest_p)
727{
728 struct ip_vs_dest *dest;
729 unsigned atype;
730
731 EnterFunction(2);
732
733 atype = inet_addr_type(udest->addr);
734 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
735 return -EINVAL;
736
737 dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
738 if (dest == NULL) {
739 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
740 return -ENOMEM;
741 }
742 memset(dest, 0, sizeof(struct ip_vs_dest));
743
744 dest->protocol = svc->protocol;
745 dest->vaddr = svc->addr;
746 dest->vport = svc->port;
747 dest->vfwmark = svc->fwmark;
748 dest->addr = udest->addr;
749 dest->port = udest->port;
750
751 atomic_set(&dest->activeconns, 0);
752 atomic_set(&dest->inactconns, 0);
753 atomic_set(&dest->persistconns, 0);
754 atomic_set(&dest->refcnt, 0);
755
756 INIT_LIST_HEAD(&dest->d_list);
757 spin_lock_init(&dest->dst_lock);
758 spin_lock_init(&dest->stats.lock);
759 __ip_vs_update_dest(svc, dest, udest);
760 ip_vs_new_estimator(&dest->stats);
761
762 *dest_p = dest;
763
764 LeaveFunction(2);
765 return 0;
766}
767
768
769/*
770 * Add a destination into an existing service
771 */
772static int
773ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
774{
775 struct ip_vs_dest *dest;
776 __u32 daddr = udest->addr;
777 __u16 dport = udest->port;
778 int ret;
779
780 EnterFunction(2);
781
782 if (udest->weight < 0) {
783 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
784 return -ERANGE;
785 }
786
787 if (udest->l_threshold > udest->u_threshold) {
788 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
789 "upper threshold\n");
790 return -ERANGE;
791 }
792
793 /*
794 * Check if the dest already exists in the list
795 */
796 dest = ip_vs_lookup_dest(svc, daddr, dport);
797 if (dest != NULL) {
798 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
799 return -EEXIST;
800 }
801
802 /*
803 * Check if the dest already exists in the trash and
804 * is from the same service
805 */
806 dest = ip_vs_trash_get_dest(svc, daddr, dport);
807 if (dest != NULL) {
808 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800809 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700810 NIPQUAD(daddr), ntohs(dport),
811 atomic_read(&dest->refcnt),
812 dest->vfwmark,
813 NIPQUAD(dest->vaddr),
814 ntohs(dest->vport));
815 __ip_vs_update_dest(svc, dest, udest);
816
817 /*
818 * Get the destination from the trash
819 */
820 list_del(&dest->n_list);
821
822 ip_vs_new_estimator(&dest->stats);
823
824 write_lock_bh(&__ip_vs_svc_lock);
825
826 /*
827 * Wait until all other svc users go away.
828 */
829 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
830
831 list_add(&dest->n_list, &svc->destinations);
832 svc->num_dests++;
833
834 /* call the update_service function of its scheduler */
835 svc->scheduler->update_service(svc);
836
837 write_unlock_bh(&__ip_vs_svc_lock);
838 return 0;
839 }
840
841 /*
842 * Allocate and initialize the dest structure
843 */
844 ret = ip_vs_new_dest(svc, udest, &dest);
845 if (ret) {
846 return ret;
847 }
848
849 /*
850 * Add the dest entry into the list
851 */
852 atomic_inc(&dest->refcnt);
853
854 write_lock_bh(&__ip_vs_svc_lock);
855
856 /*
857 * Wait until all other svc users go away.
858 */
859 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
860
861 list_add(&dest->n_list, &svc->destinations);
862 svc->num_dests++;
863
864 /* call the update_service function of its scheduler */
865 svc->scheduler->update_service(svc);
866
867 write_unlock_bh(&__ip_vs_svc_lock);
868
869 LeaveFunction(2);
870
871 return 0;
872}
873
874
875/*
876 * Edit a destination in the given service
877 */
878static int
879ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
880{
881 struct ip_vs_dest *dest;
882 __u32 daddr = udest->addr;
883 __u16 dport = udest->port;
884
885 EnterFunction(2);
886
887 if (udest->weight < 0) {
888 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
889 return -ERANGE;
890 }
891
892 if (udest->l_threshold > udest->u_threshold) {
893 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
894 "upper threshold\n");
895 return -ERANGE;
896 }
897
898 /*
899 * Lookup the destination list
900 */
901 dest = ip_vs_lookup_dest(svc, daddr, dport);
902 if (dest == NULL) {
903 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
904 return -ENOENT;
905 }
906
907 __ip_vs_update_dest(svc, dest, udest);
908
909 write_lock_bh(&__ip_vs_svc_lock);
910
911 /* Wait until all other svc users go away */
912 while (atomic_read(&svc->usecnt) > 1) {};
913
914 /* call the update_service, because server weight may be changed */
915 svc->scheduler->update_service(svc);
916
917 write_unlock_bh(&__ip_vs_svc_lock);
918
919 LeaveFunction(2);
920
921 return 0;
922}
923
924
925/*
926 * Delete a destination (must be already unlinked from the service)
927 */
928static void __ip_vs_del_dest(struct ip_vs_dest *dest)
929{
930 ip_vs_kill_estimator(&dest->stats);
931
932 /*
933 * Remove it from the d-linked list with the real services.
934 */
935 write_lock_bh(&__ip_vs_rs_lock);
936 ip_vs_rs_unhash(dest);
937 write_unlock_bh(&__ip_vs_rs_lock);
938
939 /*
940 * Decrease the refcnt of the dest, and free the dest
941 * if nobody refers to it (refcnt=0). Otherwise, throw
942 * the destination into the trash.
943 */
944 if (atomic_dec_and_test(&dest->refcnt)) {
945 ip_vs_dst_reset(dest);
946 /* simply decrease svc->refcnt here, let the caller check
947 and release the service if nobody refers to it.
948 Only user context can release destination and service,
949 and only one user context can update virtual service at a
950 time, so the operation here is OK */
951 atomic_dec(&dest->svc->refcnt);
952 kfree(dest);
953 } else {
Roberto Nibali4b5bdf52006-01-03 14:22:59 -0800954 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
955 "dest->refcnt=%d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700956 NIPQUAD(dest->addr), ntohs(dest->port),
957 atomic_read(&dest->refcnt));
958 list_add(&dest->n_list, &ip_vs_dest_trash);
959 atomic_inc(&dest->refcnt);
960 }
961}
962
963
964/*
965 * Unlink a destination from the given service
966 */
967static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
968 struct ip_vs_dest *dest,
969 int svcupd)
970{
971 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
972
973 /*
974 * Remove it from the d-linked destination list.
975 */
976 list_del(&dest->n_list);
977 svc->num_dests--;
978 if (svcupd) {
979 /*
980 * Call the update_service function of its scheduler
981 */
982 svc->scheduler->update_service(svc);
983 }
984}
985
986
987/*
988 * Delete a destination server in the given service
989 */
990static int
991ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
992{
993 struct ip_vs_dest *dest;
994 __u32 daddr = udest->addr;
995 __u16 dport = udest->port;
996
997 EnterFunction(2);
998
999 dest = ip_vs_lookup_dest(svc, daddr, dport);
1000 if (dest == NULL) {
1001 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1002 return -ENOENT;
1003 }
1004
1005 write_lock_bh(&__ip_vs_svc_lock);
1006
1007 /*
1008 * Wait until all other svc users go away.
1009 */
1010 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1011
1012 /*
1013 * Unlink dest from the service
1014 */
1015 __ip_vs_unlink_dest(svc, dest, 1);
1016
1017 write_unlock_bh(&__ip_vs_svc_lock);
1018
1019 /*
1020 * Delete the destination
1021 */
1022 __ip_vs_del_dest(dest);
1023
1024 LeaveFunction(2);
1025
1026 return 0;
1027}
1028
1029
1030/*
1031 * Add a service into the service hash table
1032 */
1033static int
1034ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1035{
1036 int ret = 0;
1037 struct ip_vs_scheduler *sched = NULL;
1038 struct ip_vs_service *svc = NULL;
1039
1040 /* increase the module use count */
1041 ip_vs_use_count_inc();
1042
1043 /* Lookup the scheduler by 'u->sched_name' */
1044 sched = ip_vs_scheduler_get(u->sched_name);
1045 if (sched == NULL) {
1046 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1047 u->sched_name);
1048 ret = -ENOENT;
1049 goto out_mod_dec;
1050 }
1051
1052 svc = (struct ip_vs_service *)
1053 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1054 if (svc == NULL) {
1055 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1056 ret = -ENOMEM;
1057 goto out_err;
1058 }
1059 memset(svc, 0, sizeof(struct ip_vs_service));
1060
1061 /* I'm the first user of the service */
1062 atomic_set(&svc->usecnt, 1);
1063 atomic_set(&svc->refcnt, 0);
1064
1065 svc->protocol = u->protocol;
1066 svc->addr = u->addr;
1067 svc->port = u->port;
1068 svc->fwmark = u->fwmark;
1069 svc->flags = u->flags;
1070 svc->timeout = u->timeout * HZ;
1071 svc->netmask = u->netmask;
1072
1073 INIT_LIST_HEAD(&svc->destinations);
1074 rwlock_init(&svc->sched_lock);
1075 spin_lock_init(&svc->stats.lock);
1076
1077 /* Bind the scheduler */
1078 ret = ip_vs_bind_scheduler(svc, sched);
1079 if (ret)
1080 goto out_err;
1081 sched = NULL;
1082
1083 /* Update the virtual service counters */
1084 if (svc->port == FTPPORT)
1085 atomic_inc(&ip_vs_ftpsvc_counter);
1086 else if (svc->port == 0)
1087 atomic_inc(&ip_vs_nullsvc_counter);
1088
1089 ip_vs_new_estimator(&svc->stats);
1090 ip_vs_num_services++;
1091
1092 /* Hash the service into the service table */
1093 write_lock_bh(&__ip_vs_svc_lock);
1094 ip_vs_svc_hash(svc);
1095 write_unlock_bh(&__ip_vs_svc_lock);
1096
1097 *svc_p = svc;
1098 return 0;
1099
1100 out_err:
1101 if (svc != NULL) {
1102 if (svc->scheduler)
1103 ip_vs_unbind_scheduler(svc);
1104 if (svc->inc) {
1105 local_bh_disable();
1106 ip_vs_app_inc_put(svc->inc);
1107 local_bh_enable();
1108 }
1109 kfree(svc);
1110 }
1111 ip_vs_scheduler_put(sched);
1112
1113 out_mod_dec:
1114 /* decrease the module use count */
1115 ip_vs_use_count_dec();
1116
1117 return ret;
1118}
1119
1120
1121/*
1122 * Edit a service and bind it with a new scheduler
1123 */
1124static int
1125ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1126{
1127 struct ip_vs_scheduler *sched, *old_sched;
1128 int ret = 0;
1129
1130 /*
1131 * Lookup the scheduler, by 'u->sched_name'
1132 */
1133 sched = ip_vs_scheduler_get(u->sched_name);
1134 if (sched == NULL) {
1135 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1136 u->sched_name);
1137 return -ENOENT;
1138 }
1139 old_sched = sched;
1140
1141 write_lock_bh(&__ip_vs_svc_lock);
1142
1143 /*
1144 * Wait until all other svc users go away.
1145 */
1146 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1147
1148 /*
1149 * Set the flags and timeout value
1150 */
1151 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1152 svc->timeout = u->timeout * HZ;
1153 svc->netmask = u->netmask;
1154
1155 old_sched = svc->scheduler;
1156 if (sched != old_sched) {
1157 /*
1158 * Unbind the old scheduler
1159 */
1160 if ((ret = ip_vs_unbind_scheduler(svc))) {
1161 old_sched = sched;
1162 goto out;
1163 }
1164
1165 /*
1166 * Bind the new scheduler
1167 */
1168 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1169 /*
1170 * If ip_vs_bind_scheduler fails, restore the old
1171 * scheduler.
1172 * The main reason of failure is out of memory.
1173 *
1174 * The question is if the old scheduler can be
1175 * restored all the time. TODO: if it cannot be
1176 * restored some time, we must delete the service,
1177 * otherwise the system may crash.
1178 */
1179 ip_vs_bind_scheduler(svc, old_sched);
1180 old_sched = sched;
1181 goto out;
1182 }
1183 }
1184
1185 out:
1186 write_unlock_bh(&__ip_vs_svc_lock);
1187
1188 if (old_sched)
1189 ip_vs_scheduler_put(old_sched);
1190
1191 return ret;
1192}
1193
1194
1195/*
1196 * Delete a service from the service list
1197 * - The service must be unlinked, unlocked and not referenced!
1198 * - We are called under _bh lock
1199 */
1200static void __ip_vs_del_service(struct ip_vs_service *svc)
1201{
1202 struct ip_vs_dest *dest, *nxt;
1203 struct ip_vs_scheduler *old_sched;
1204
1205 ip_vs_num_services--;
1206 ip_vs_kill_estimator(&svc->stats);
1207
1208 /* Unbind scheduler */
1209 old_sched = svc->scheduler;
1210 ip_vs_unbind_scheduler(svc);
1211 if (old_sched)
1212 ip_vs_scheduler_put(old_sched);
1213
1214 /* Unbind app inc */
1215 if (svc->inc) {
1216 ip_vs_app_inc_put(svc->inc);
1217 svc->inc = NULL;
1218 }
1219
1220 /*
1221 * Unlink the whole destination list
1222 */
1223 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1224 __ip_vs_unlink_dest(svc, dest, 0);
1225 __ip_vs_del_dest(dest);
1226 }
1227
1228 /*
1229 * Update the virtual service counters
1230 */
1231 if (svc->port == FTPPORT)
1232 atomic_dec(&ip_vs_ftpsvc_counter);
1233 else if (svc->port == 0)
1234 atomic_dec(&ip_vs_nullsvc_counter);
1235
1236 /*
1237 * Free the service if nobody refers to it
1238 */
1239 if (atomic_read(&svc->refcnt) == 0)
1240 kfree(svc);
1241
1242 /* decrease the module use count */
1243 ip_vs_use_count_dec();
1244}
1245
1246/*
1247 * Delete a service from the service list
1248 */
1249static int ip_vs_del_service(struct ip_vs_service *svc)
1250{
1251 if (svc == NULL)
1252 return -EEXIST;
1253
1254 /*
1255 * Unhash it from the service table
1256 */
1257 write_lock_bh(&__ip_vs_svc_lock);
1258
1259 ip_vs_svc_unhash(svc);
1260
1261 /*
1262 * Wait until all the svc users go away.
1263 */
1264 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1265
1266 __ip_vs_del_service(svc);
1267
1268 write_unlock_bh(&__ip_vs_svc_lock);
1269
1270 return 0;
1271}
1272
1273
1274/*
1275 * Flush all the virtual services
1276 */
1277static int ip_vs_flush(void)
1278{
1279 int idx;
1280 struct ip_vs_service *svc, *nxt;
1281
1282 /*
1283 * Flush the service table hashed by <protocol,addr,port>
1284 */
1285 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1286 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1287 write_lock_bh(&__ip_vs_svc_lock);
1288 ip_vs_svc_unhash(svc);
1289 /*
1290 * Wait until all the svc users go away.
1291 */
1292 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1293 __ip_vs_del_service(svc);
1294 write_unlock_bh(&__ip_vs_svc_lock);
1295 }
1296 }
1297
1298 /*
1299 * Flush the service table hashed by fwmark
1300 */
1301 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1302 list_for_each_entry_safe(svc, nxt,
1303 &ip_vs_svc_fwm_table[idx], f_list) {
1304 write_lock_bh(&__ip_vs_svc_lock);
1305 ip_vs_svc_unhash(svc);
1306 /*
1307 * Wait until all the svc users go away.
1308 */
1309 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1310 __ip_vs_del_service(svc);
1311 write_unlock_bh(&__ip_vs_svc_lock);
1312 }
1313 }
1314
1315 return 0;
1316}
1317
1318
1319/*
1320 * Zero counters in a service or all services
1321 */
1322static int ip_vs_zero_service(struct ip_vs_service *svc)
1323{
1324 struct ip_vs_dest *dest;
1325
1326 write_lock_bh(&__ip_vs_svc_lock);
1327 list_for_each_entry(dest, &svc->destinations, n_list) {
1328 ip_vs_zero_stats(&dest->stats);
1329 }
1330 ip_vs_zero_stats(&svc->stats);
1331 write_unlock_bh(&__ip_vs_svc_lock);
1332 return 0;
1333}
1334
1335static int ip_vs_zero_all(void)
1336{
1337 int idx;
1338 struct ip_vs_service *svc;
1339
1340 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1341 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1342 ip_vs_zero_service(svc);
1343 }
1344 }
1345
1346 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1347 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1348 ip_vs_zero_service(svc);
1349 }
1350 }
1351
1352 ip_vs_zero_stats(&ip_vs_stats);
1353 return 0;
1354}
1355
1356
1357static int
1358proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1359 void __user *buffer, size_t *lenp, loff_t *ppos)
1360{
1361 int *valp = table->data;
1362 int val = *valp;
1363 int rc;
1364
1365 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1366 if (write && (*valp != val)) {
1367 if ((*valp < 0) || (*valp > 3)) {
1368 /* Restore the correct value */
1369 *valp = val;
1370 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001371 update_defense_level();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001372 }
1373 }
1374 return rc;
1375}
1376
1377
1378static int
1379proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1380 void __user *buffer, size_t *lenp, loff_t *ppos)
1381{
1382 int *valp = table->data;
1383 int val[2];
1384 int rc;
1385
1386 /* backup the value first */
1387 memcpy(val, valp, sizeof(val));
1388
1389 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1390 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1391 /* Restore the correct value */
1392 memcpy(valp, val, sizeof(val));
1393 }
1394 return rc;
1395}
1396
1397
1398/*
1399 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1400 */
1401
1402static struct ctl_table vs_vars[] = {
1403 {
1404 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1405 .procname = "amemthresh",
1406 .data = &sysctl_ip_vs_amemthresh,
1407 .maxlen = sizeof(int),
1408 .mode = 0644,
1409 .proc_handler = &proc_dointvec,
1410 },
1411#ifdef CONFIG_IP_VS_DEBUG
1412 {
1413 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1414 .procname = "debug_level",
1415 .data = &sysctl_ip_vs_debug_level,
1416 .maxlen = sizeof(int),
1417 .mode = 0644,
1418 .proc_handler = &proc_dointvec,
1419 },
1420#endif
1421 {
1422 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1423 .procname = "am_droprate",
1424 .data = &sysctl_ip_vs_am_droprate,
1425 .maxlen = sizeof(int),
1426 .mode = 0644,
1427 .proc_handler = &proc_dointvec,
1428 },
1429 {
1430 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1431 .procname = "drop_entry",
1432 .data = &sysctl_ip_vs_drop_entry,
1433 .maxlen = sizeof(int),
1434 .mode = 0644,
1435 .proc_handler = &proc_do_defense_mode,
1436 },
1437 {
1438 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1439 .procname = "drop_packet",
1440 .data = &sysctl_ip_vs_drop_packet,
1441 .maxlen = sizeof(int),
1442 .mode = 0644,
1443 .proc_handler = &proc_do_defense_mode,
1444 },
1445 {
1446 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1447 .procname = "secure_tcp",
1448 .data = &sysctl_ip_vs_secure_tcp,
1449 .maxlen = sizeof(int),
1450 .mode = 0644,
1451 .proc_handler = &proc_do_defense_mode,
1452 },
1453#if 0
1454 {
1455 .ctl_name = NET_IPV4_VS_TO_ES,
1456 .procname = "timeout_established",
1457 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1458 .maxlen = sizeof(int),
1459 .mode = 0644,
1460 .proc_handler = &proc_dointvec_jiffies,
1461 },
1462 {
1463 .ctl_name = NET_IPV4_VS_TO_SS,
1464 .procname = "timeout_synsent",
1465 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1466 .maxlen = sizeof(int),
1467 .mode = 0644,
1468 .proc_handler = &proc_dointvec_jiffies,
1469 },
1470 {
1471 .ctl_name = NET_IPV4_VS_TO_SR,
1472 .procname = "timeout_synrecv",
1473 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1474 .maxlen = sizeof(int),
1475 .mode = 0644,
1476 .proc_handler = &proc_dointvec_jiffies,
1477 },
1478 {
1479 .ctl_name = NET_IPV4_VS_TO_FW,
1480 .procname = "timeout_finwait",
1481 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1482 .maxlen = sizeof(int),
1483 .mode = 0644,
1484 .proc_handler = &proc_dointvec_jiffies,
1485 },
1486 {
1487 .ctl_name = NET_IPV4_VS_TO_TW,
1488 .procname = "timeout_timewait",
1489 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1490 .maxlen = sizeof(int),
1491 .mode = 0644,
1492 .proc_handler = &proc_dointvec_jiffies,
1493 },
1494 {
1495 .ctl_name = NET_IPV4_VS_TO_CL,
1496 .procname = "timeout_close",
1497 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1498 .maxlen = sizeof(int),
1499 .mode = 0644,
1500 .proc_handler = &proc_dointvec_jiffies,
1501 },
1502 {
1503 .ctl_name = NET_IPV4_VS_TO_CW,
1504 .procname = "timeout_closewait",
1505 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1506 .maxlen = sizeof(int),
1507 .mode = 0644,
1508 .proc_handler = &proc_dointvec_jiffies,
1509 },
1510 {
1511 .ctl_name = NET_IPV4_VS_TO_LA,
1512 .procname = "timeout_lastack",
1513 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1514 .maxlen = sizeof(int),
1515 .mode = 0644,
1516 .proc_handler = &proc_dointvec_jiffies,
1517 },
1518 {
1519 .ctl_name = NET_IPV4_VS_TO_LI,
1520 .procname = "timeout_listen",
1521 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1522 .maxlen = sizeof(int),
1523 .mode = 0644,
1524 .proc_handler = &proc_dointvec_jiffies,
1525 },
1526 {
1527 .ctl_name = NET_IPV4_VS_TO_SA,
1528 .procname = "timeout_synack",
1529 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1530 .maxlen = sizeof(int),
1531 .mode = 0644,
1532 .proc_handler = &proc_dointvec_jiffies,
1533 },
1534 {
1535 .ctl_name = NET_IPV4_VS_TO_UDP,
1536 .procname = "timeout_udp",
1537 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1538 .maxlen = sizeof(int),
1539 .mode = 0644,
1540 .proc_handler = &proc_dointvec_jiffies,
1541 },
1542 {
1543 .ctl_name = NET_IPV4_VS_TO_ICMP,
1544 .procname = "timeout_icmp",
1545 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1546 .maxlen = sizeof(int),
1547 .mode = 0644,
1548 .proc_handler = &proc_dointvec_jiffies,
1549 },
1550#endif
1551 {
1552 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1553 .procname = "cache_bypass",
1554 .data = &sysctl_ip_vs_cache_bypass,
1555 .maxlen = sizeof(int),
1556 .mode = 0644,
1557 .proc_handler = &proc_dointvec,
1558 },
1559 {
1560 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1561 .procname = "expire_nodest_conn",
1562 .data = &sysctl_ip_vs_expire_nodest_conn,
1563 .maxlen = sizeof(int),
1564 .mode = 0644,
1565 .proc_handler = &proc_dointvec,
1566 },
1567 {
1568 .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1569 .procname = "expire_quiescent_template",
1570 .data = &sysctl_ip_vs_expire_quiescent_template,
1571 .maxlen = sizeof(int),
1572 .mode = 0644,
1573 .proc_handler = &proc_dointvec,
1574 },
1575 {
1576 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1577 .procname = "sync_threshold",
1578 .data = &sysctl_ip_vs_sync_threshold,
1579 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1580 .mode = 0644,
1581 .proc_handler = &proc_do_sync_threshold,
1582 },
1583 {
1584 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1585 .procname = "nat_icmp_send",
1586 .data = &sysctl_ip_vs_nat_icmp_send,
1587 .maxlen = sizeof(int),
1588 .mode = 0644,
1589 .proc_handler = &proc_dointvec,
1590 },
1591 { .ctl_name = 0 }
1592};
1593
1594static ctl_table vs_table[] = {
1595 {
1596 .ctl_name = NET_IPV4_VS,
1597 .procname = "vs",
1598 .mode = 0555,
1599 .child = vs_vars
1600 },
1601 { .ctl_name = 0 }
1602};
1603
David S. Millerbf0ff9e2005-08-19 16:37:30 -07001604static ctl_table ipvs_ipv4_table[] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001605 {
1606 .ctl_name = NET_IPV4,
1607 .procname = "ipv4",
1608 .mode = 0555,
1609 .child = vs_table,
1610 },
1611 { .ctl_name = 0 }
1612};
1613
1614static ctl_table vs_root_table[] = {
1615 {
1616 .ctl_name = CTL_NET,
1617 .procname = "net",
1618 .mode = 0555,
David S. Millerbf0ff9e2005-08-19 16:37:30 -07001619 .child = ipvs_ipv4_table,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001620 },
1621 { .ctl_name = 0 }
1622};
1623
/* Handle returned by register_sysctl_table(); needed to unregister again */
static struct ctl_table_header *sysctl_header;
1625
1626#ifdef CONFIG_PROC_FS
1627
/* Cursor kept in seq->private while the service tables are being listed */
struct ip_vs_iter {
	struct list_head	*table;		/* ip_vs_svc_table or ip_vs_svc_fwm_table */
	int			bucket;		/* index of the current hash bucket */
};
1632
1633/*
1634 * Write the contents of the VS rule table to a PROCfs file.
1635 * (It is kept just for backward compatibility)
1636 */
1637static inline const char *ip_vs_fwd_name(unsigned flags)
1638{
1639 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1640 case IP_VS_CONN_F_LOCALNODE:
1641 return "Local";
1642 case IP_VS_CONN_F_TUNNEL:
1643 return "Tunnel";
1644 case IP_VS_CONN_F_DROUTE:
1645 return "Route";
1646 default:
1647 return "Masq";
1648 }
1649}
1650
1651
1652/* Get the Nth entry in the two lists */
1653static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1654{
1655 struct ip_vs_iter *iter = seq->private;
1656 int idx;
1657 struct ip_vs_service *svc;
1658
1659 /* look in hash by protocol */
1660 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1661 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1662 if (pos-- == 0){
1663 iter->table = ip_vs_svc_table;
1664 iter->bucket = idx;
1665 return svc;
1666 }
1667 }
1668 }
1669
1670 /* keep looking in fwmark */
1671 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1672 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1673 if (pos-- == 0) {
1674 iter->table = ip_vs_svc_fwm_table;
1675 iter->bucket = idx;
1676 return svc;
1677 }
1678 }
1679 }
1680
1681 return NULL;
1682}
1683
1684static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1685{
1686
1687 read_lock_bh(&__ip_vs_svc_lock);
1688 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1689}
1690
1691
1692static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1693{
1694 struct list_head *e;
1695 struct ip_vs_iter *iter;
1696 struct ip_vs_service *svc;
1697
1698 ++*pos;
1699 if (v == SEQ_START_TOKEN)
1700 return ip_vs_info_array(seq,0);
1701
1702 svc = v;
1703 iter = seq->private;
1704
1705 if (iter->table == ip_vs_svc_table) {
1706 /* next service in table hashed by protocol */
1707 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1708 return list_entry(e, struct ip_vs_service, s_list);
1709
1710
1711 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1712 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1713 s_list) {
1714 return svc;
1715 }
1716 }
1717
1718 iter->table = ip_vs_svc_fwm_table;
1719 iter->bucket = -1;
1720 goto scan_fwmark;
1721 }
1722
1723 /* next service in hashed by fwmark */
1724 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1725 return list_entry(e, struct ip_vs_service, f_list);
1726
1727 scan_fwmark:
1728 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1729 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1730 f_list)
1731 return svc;
1732 }
1733
1734 return NULL;
1735}
1736
1737static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1738{
1739 read_unlock_bh(&__ip_vs_svc_lock);
1740}
1741
1742
1743static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1744{
1745 if (v == SEQ_START_TOKEN) {
1746 seq_printf(seq,
1747 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1748 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1749 seq_puts(seq,
1750 "Prot LocalAddress:Port Scheduler Flags\n");
1751 seq_puts(seq,
1752 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1753 } else {
1754 const struct ip_vs_service *svc = v;
1755 const struct ip_vs_iter *iter = seq->private;
1756 const struct ip_vs_dest *dest;
1757
1758 if (iter->table == ip_vs_svc_table)
1759 seq_printf(seq, "%s %08X:%04X %s ",
1760 ip_vs_proto_name(svc->protocol),
1761 ntohl(svc->addr),
1762 ntohs(svc->port),
1763 svc->scheduler->name);
1764 else
1765 seq_printf(seq, "FWM %08X %s ",
1766 svc->fwmark, svc->scheduler->name);
1767
1768 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1769 seq_printf(seq, "persistent %d %08X\n",
1770 svc->timeout,
1771 ntohl(svc->netmask));
1772 else
1773 seq_putc(seq, '\n');
1774
1775 list_for_each_entry(dest, &svc->destinations, n_list) {
1776 seq_printf(seq,
1777 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1778 ntohl(dest->addr), ntohs(dest->port),
1779 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1780 atomic_read(&dest->weight),
1781 atomic_read(&dest->activeconns),
1782 atomic_read(&dest->inactconns));
1783 }
1784 }
1785 return 0;
1786}
1787
1788static struct seq_operations ip_vs_info_seq_ops = {
1789 .start = ip_vs_info_seq_start,
1790 .next = ip_vs_info_seq_next,
1791 .stop = ip_vs_info_seq_stop,
1792 .show = ip_vs_info_seq_show,
1793};
1794
1795static int ip_vs_info_open(struct inode *inode, struct file *file)
1796{
1797 struct seq_file *seq;
1798 int rc = -ENOMEM;
1799 struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1800
1801 if (!s)
1802 goto out;
1803
1804 rc = seq_open(file, &ip_vs_info_seq_ops);
1805 if (rc)
1806 goto out_kfree;
1807
1808 seq = file->private_data;
1809 seq->private = s;
1810 memset(s, 0, sizeof(*s));
1811out:
1812 return rc;
1813out_kfree:
1814 kfree(s);
1815 goto out;
1816}
1817
1818static struct file_operations ip_vs_info_fops = {
1819 .owner = THIS_MODULE,
1820 .open = ip_vs_info_open,
1821 .read = seq_read,
1822 .llseek = seq_lseek,
1823 .release = seq_release_private,
1824};
1825
1826#endif
1827
1828struct ip_vs_stats ip_vs_stats;
1829
1830#ifdef CONFIG_PROC_FS
1831static int ip_vs_stats_show(struct seq_file *seq, void *v)
1832{
1833
1834/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1835 seq_puts(seq,
1836 " Total Incoming Outgoing Incoming Outgoing\n");
1837 seq_printf(seq,
1838 " Conns Packets Packets Bytes Bytes\n");
1839
1840 spin_lock_bh(&ip_vs_stats.lock);
1841 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1842 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1843 (unsigned long long) ip_vs_stats.inbytes,
1844 (unsigned long long) ip_vs_stats.outbytes);
1845
1846/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1847 seq_puts(seq,
1848 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1849 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1850 ip_vs_stats.cps,
1851 ip_vs_stats.inpps,
1852 ip_vs_stats.outpps,
1853 ip_vs_stats.inbps,
1854 ip_vs_stats.outbps);
1855 spin_unlock_bh(&ip_vs_stats.lock);
1856
1857 return 0;
1858}
1859
1860static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1861{
1862 return single_open(file, ip_vs_stats_show, NULL);
1863}
1864
1865static struct file_operations ip_vs_stats_fops = {
1866 .owner = THIS_MODULE,
1867 .open = ip_vs_stats_seq_open,
1868 .read = seq_read,
1869 .llseek = seq_lseek,
1870 .release = single_release,
1871};
1872
1873#endif
1874
1875/*
1876 * Set timeout values for tcp tcpfin udp in the timeout_table.
1877 */
1878static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1879{
1880 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1881 u->tcp_timeout,
1882 u->tcp_fin_timeout,
1883 u->udp_timeout);
1884
1885#ifdef CONFIG_IP_VS_PROTO_TCP
1886 if (u->tcp_timeout) {
1887 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1888 = u->tcp_timeout * HZ;
1889 }
1890
1891 if (u->tcp_fin_timeout) {
1892 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1893 = u->tcp_fin_timeout * HZ;
1894 }
1895#endif
1896
1897#ifdef CONFIG_IP_VS_PROTO_UDP
1898 if (u->udp_timeout) {
1899 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1900 = u->udp_timeout * HZ;
1901 }
1902#endif
1903 return 0;
1904}
1905
1906
/*
 *	Index of a set-sockopt command in set_arglen[], and the argument
 *	length each family of commands expects.  The macro argument is
 *	parenthesised so that a compound expression is converted as a whole.
 */
#define SET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
				 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
/* Size of the on-stack argument buffer: the longest of the lengths above */
#define MAX_ARG_LEN		SVCDEST_ARG_LEN
1914
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001915static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001916 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1917 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1918 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1919 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1920 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1921 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1922 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1923 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1924 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1925 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1926 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1927};
1928
1929static int
1930do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1931{
1932 int ret;
1933 unsigned char arg[MAX_ARG_LEN];
1934 struct ip_vs_service_user *usvc;
1935 struct ip_vs_service *svc;
1936 struct ip_vs_dest_user *udest;
1937
1938 if (!capable(CAP_NET_ADMIN))
1939 return -EPERM;
1940
1941 if (len != set_arglen[SET_CMDID(cmd)]) {
1942 IP_VS_ERR("set_ctl: len %u != %u\n",
1943 len, set_arglen[SET_CMDID(cmd)]);
1944 return -EINVAL;
1945 }
1946
1947 if (copy_from_user(arg, user, len) != 0)
1948 return -EFAULT;
1949
1950 /* increase the module use count */
1951 ip_vs_use_count_inc();
1952
1953 if (down_interruptible(&__ip_vs_mutex)) {
1954 ret = -ERESTARTSYS;
1955 goto out_dec;
1956 }
1957
1958 if (cmd == IP_VS_SO_SET_FLUSH) {
1959 /* Flush the virtual service */
1960 ret = ip_vs_flush();
1961 goto out_unlock;
1962 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1963 /* Set timeout values for (tcp tcpfin udp) */
1964 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1965 goto out_unlock;
1966 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1967 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1968 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1969 goto out_unlock;
1970 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1971 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1972 ret = stop_sync_thread(dm->state);
1973 goto out_unlock;
1974 }
1975
1976 usvc = (struct ip_vs_service_user *)arg;
1977 udest = (struct ip_vs_dest_user *)(usvc + 1);
1978
1979 if (cmd == IP_VS_SO_SET_ZERO) {
1980 /* if no service address is set, zero counters in all */
1981 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1982 ret = ip_vs_zero_all();
1983 goto out_unlock;
1984 }
1985 }
1986
1987 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1988 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1989 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1990 usvc->protocol, NIPQUAD(usvc->addr),
1991 ntohs(usvc->port), usvc->sched_name);
1992 ret = -EFAULT;
1993 goto out_unlock;
1994 }
1995
1996 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1997 if (usvc->fwmark == 0)
1998 svc = __ip_vs_service_get(usvc->protocol,
1999 usvc->addr, usvc->port);
2000 else
2001 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
2002
2003 if (cmd != IP_VS_SO_SET_ADD
2004 && (svc == NULL || svc->protocol != usvc->protocol)) {
2005 ret = -ESRCH;
2006 goto out_unlock;
2007 }
2008
2009 switch (cmd) {
2010 case IP_VS_SO_SET_ADD:
2011 if (svc != NULL)
2012 ret = -EEXIST;
2013 else
2014 ret = ip_vs_add_service(usvc, &svc);
2015 break;
2016 case IP_VS_SO_SET_EDIT:
2017 ret = ip_vs_edit_service(svc, usvc);
2018 break;
2019 case IP_VS_SO_SET_DEL:
2020 ret = ip_vs_del_service(svc);
2021 if (!ret)
2022 goto out_unlock;
2023 break;
2024 case IP_VS_SO_SET_ZERO:
2025 ret = ip_vs_zero_service(svc);
2026 break;
2027 case IP_VS_SO_SET_ADDDEST:
2028 ret = ip_vs_add_dest(svc, udest);
2029 break;
2030 case IP_VS_SO_SET_EDITDEST:
2031 ret = ip_vs_edit_dest(svc, udest);
2032 break;
2033 case IP_VS_SO_SET_DELDEST:
2034 ret = ip_vs_del_dest(svc, udest);
2035 break;
2036 default:
2037 ret = -EINVAL;
2038 }
2039
2040 if (svc)
2041 ip_vs_service_put(svc);
2042
2043 out_unlock:
2044 up(&__ip_vs_mutex);
2045 out_dec:
2046 /* decrease the module use count */
2047 ip_vs_use_count_dec();
2048
2049 return ret;
2050}
2051
2052
2053static void
2054ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2055{
2056 spin_lock_bh(&src->lock);
2057 memcpy(dst, src, (char*)&src->lock - (char*)src);
2058 spin_unlock_bh(&src->lock);
2059}
2060
2061static void
2062ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2063{
2064 dst->protocol = src->protocol;
2065 dst->addr = src->addr;
2066 dst->port = src->port;
2067 dst->fwmark = src->fwmark;
pageexec4da62fc72005-06-26 16:00:19 -07002068 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002069 dst->flags = src->flags;
2070 dst->timeout = src->timeout / HZ;
2071 dst->netmask = src->netmask;
2072 dst->num_dests = src->num_dests;
2073 ip_vs_copy_stats(&dst->stats, &src->stats);
2074}
2075
2076static inline int
2077__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2078 struct ip_vs_get_services __user *uptr)
2079{
2080 int idx, count=0;
2081 struct ip_vs_service *svc;
2082 struct ip_vs_service_entry entry;
2083 int ret = 0;
2084
2085 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2086 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2087 if (count >= get->num_services)
2088 goto out;
pageexec4da62fc72005-06-26 16:00:19 -07002089 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002090 ip_vs_copy_service(&entry, svc);
2091 if (copy_to_user(&uptr->entrytable[count],
2092 &entry, sizeof(entry))) {
2093 ret = -EFAULT;
2094 goto out;
2095 }
2096 count++;
2097 }
2098 }
2099
2100 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2101 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2102 if (count >= get->num_services)
2103 goto out;
pageexec4da62fc72005-06-26 16:00:19 -07002104 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002105 ip_vs_copy_service(&entry, svc);
2106 if (copy_to_user(&uptr->entrytable[count],
2107 &entry, sizeof(entry))) {
2108 ret = -EFAULT;
2109 goto out;
2110 }
2111 count++;
2112 }
2113 }
2114 out:
2115 return ret;
2116}
2117
2118static inline int
2119__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2120 struct ip_vs_get_dests __user *uptr)
2121{
2122 struct ip_vs_service *svc;
2123 int ret = 0;
2124
2125 if (get->fwmark)
2126 svc = __ip_vs_svc_fwm_get(get->fwmark);
2127 else
2128 svc = __ip_vs_service_get(get->protocol,
2129 get->addr, get->port);
2130 if (svc) {
2131 int count = 0;
2132 struct ip_vs_dest *dest;
2133 struct ip_vs_dest_entry entry;
2134
2135 list_for_each_entry(dest, &svc->destinations, n_list) {
2136 if (count >= get->num_dests)
2137 break;
2138
2139 entry.addr = dest->addr;
2140 entry.port = dest->port;
2141 entry.conn_flags = atomic_read(&dest->conn_flags);
2142 entry.weight = atomic_read(&dest->weight);
2143 entry.u_threshold = dest->u_threshold;
2144 entry.l_threshold = dest->l_threshold;
2145 entry.activeconns = atomic_read(&dest->activeconns);
2146 entry.inactconns = atomic_read(&dest->inactconns);
2147 entry.persistconns = atomic_read(&dest->persistconns);
2148 ip_vs_copy_stats(&entry.stats, &dest->stats);
2149 if (copy_to_user(&uptr->entrytable[count],
2150 &entry, sizeof(entry))) {
2151 ret = -EFAULT;
2152 break;
2153 }
2154 count++;
2155 }
2156 ip_vs_service_put(svc);
2157 } else
2158 ret = -ESRCH;
2159 return ret;
2160}
2161
2162static inline void
2163__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2164{
2165#ifdef CONFIG_IP_VS_PROTO_TCP
2166 u->tcp_timeout =
2167 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2168 u->tcp_fin_timeout =
2169 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2170#endif
2171#ifdef CONFIG_IP_VS_PROTO_UDP
2172 u->udp_timeout =
2173 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2174#endif
2175}
2176
2177
/*
 *	Index of a get-sockopt command in get_arglen[], and the minimum
 *	argument length each command expects.  The macro argument is
 *	parenthesised so that a compound expression is converted as a whole.
 */
#define GET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
/* Two records are returned: slot 0 for the master, slot 1 for the backup */
#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
2185
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08002186static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002187 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2188 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2189 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2190 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2191 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2192 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2193 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2194};
2195
2196static int
2197do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2198{
2199 unsigned char arg[128];
2200 int ret = 0;
2201
2202 if (!capable(CAP_NET_ADMIN))
2203 return -EPERM;
2204
2205 if (*len < get_arglen[GET_CMDID(cmd)]) {
2206 IP_VS_ERR("get_ctl: len %u < %u\n",
2207 *len, get_arglen[GET_CMDID(cmd)]);
2208 return -EINVAL;
2209 }
2210
2211 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2212 return -EFAULT;
2213
2214 if (down_interruptible(&__ip_vs_mutex))
2215 return -ERESTARTSYS;
2216
2217 switch (cmd) {
2218 case IP_VS_SO_GET_VERSION:
2219 {
2220 char buf[64];
2221
2222 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2223 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2224 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2225 ret = -EFAULT;
2226 goto out;
2227 }
2228 *len = strlen(buf)+1;
2229 }
2230 break;
2231
2232 case IP_VS_SO_GET_INFO:
2233 {
2234 struct ip_vs_getinfo info;
2235 info.version = IP_VS_VERSION_CODE;
2236 info.size = IP_VS_CONN_TAB_SIZE;
2237 info.num_services = ip_vs_num_services;
2238 if (copy_to_user(user, &info, sizeof(info)) != 0)
2239 ret = -EFAULT;
2240 }
2241 break;
2242
2243 case IP_VS_SO_GET_SERVICES:
2244 {
2245 struct ip_vs_get_services *get;
2246 int size;
2247
2248 get = (struct ip_vs_get_services *)arg;
2249 size = sizeof(*get) +
2250 sizeof(struct ip_vs_service_entry) * get->num_services;
2251 if (*len != size) {
2252 IP_VS_ERR("length: %u != %u\n", *len, size);
2253 ret = -EINVAL;
2254 goto out;
2255 }
2256 ret = __ip_vs_get_service_entries(get, user);
2257 }
2258 break;
2259
2260 case IP_VS_SO_GET_SERVICE:
2261 {
2262 struct ip_vs_service_entry *entry;
2263 struct ip_vs_service *svc;
2264
2265 entry = (struct ip_vs_service_entry *)arg;
2266 if (entry->fwmark)
2267 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2268 else
2269 svc = __ip_vs_service_get(entry->protocol,
2270 entry->addr, entry->port);
2271 if (svc) {
2272 ip_vs_copy_service(entry, svc);
2273 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2274 ret = -EFAULT;
2275 ip_vs_service_put(svc);
2276 } else
2277 ret = -ESRCH;
2278 }
2279 break;
2280
2281 case IP_VS_SO_GET_DESTS:
2282 {
2283 struct ip_vs_get_dests *get;
2284 int size;
2285
2286 get = (struct ip_vs_get_dests *)arg;
2287 size = sizeof(*get) +
2288 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2289 if (*len != size) {
2290 IP_VS_ERR("length: %u != %u\n", *len, size);
2291 ret = -EINVAL;
2292 goto out;
2293 }
2294 ret = __ip_vs_get_dest_entries(get, user);
2295 }
2296 break;
2297
2298 case IP_VS_SO_GET_TIMEOUT:
2299 {
2300 struct ip_vs_timeout_user t;
2301
2302 __ip_vs_get_timeouts(&t);
2303 if (copy_to_user(user, &t, sizeof(t)) != 0)
2304 ret = -EFAULT;
2305 }
2306 break;
2307
2308 case IP_VS_SO_GET_DAEMON:
2309 {
2310 struct ip_vs_daemon_user d[2];
2311
2312 memset(&d, 0, sizeof(d));
2313 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2314 d[0].state = IP_VS_STATE_MASTER;
pageexec4da62fc72005-06-26 16:00:19 -07002315 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002316 d[0].syncid = ip_vs_master_syncid;
2317 }
2318 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2319 d[1].state = IP_VS_STATE_BACKUP;
pageexec4da62fc72005-06-26 16:00:19 -07002320 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002321 d[1].syncid = ip_vs_backup_syncid;
2322 }
2323 if (copy_to_user(user, &d, sizeof(d)) != 0)
2324 ret = -EFAULT;
2325 }
2326 break;
2327
2328 default:
2329 ret = -EINVAL;
2330 }
2331
2332 out:
2333 up(&__ip_vs_mutex);
2334 return ret;
2335}
2336
2337
2338static struct nf_sockopt_ops ip_vs_sockopts = {
2339 .pf = PF_INET,
2340 .set_optmin = IP_VS_BASE_CTL,
2341 .set_optmax = IP_VS_SO_SET_MAX+1,
2342 .set = do_ip_vs_set_ctl,
2343 .get_optmin = IP_VS_BASE_CTL,
2344 .get_optmax = IP_VS_SO_GET_MAX+1,
2345 .get = do_ip_vs_get_ctl,
2346};
2347
2348
2349int ip_vs_control_init(void)
2350{
2351 int ret;
2352 int idx;
2353
2354 EnterFunction(2);
2355
2356 ret = nf_register_sockopt(&ip_vs_sockopts);
2357 if (ret) {
2358 IP_VS_ERR("cannot register sockopt.\n");
2359 return ret;
2360 }
2361
2362 proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2363 proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2364
2365 sysctl_header = register_sysctl_table(vs_root_table, 0);
2366
2367 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2368 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2369 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2370 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2371 }
2372 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2373 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2374 }
2375
2376 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2377 spin_lock_init(&ip_vs_stats.lock);
2378 ip_vs_new_estimator(&ip_vs_stats);
2379
2380 /* Hook the defense timer */
2381 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2382
2383 LeaveFunction(2);
2384 return 0;
2385}
2386
2387
2388void ip_vs_control_cleanup(void)
2389{
2390 EnterFunction(2);
2391 ip_vs_trash_cleanup();
2392 cancel_rearming_delayed_work(&defense_work);
2393 ip_vs_kill_estimator(&ip_vs_stats);
2394 unregister_sysctl_table(sysctl_header);
2395 proc_net_remove("ip_vs_stats");
2396 proc_net_remove("ip_vs");
2397 nf_unregister_sockopt(&ip_vs_sockopts);
2398 LeaveFunction(2);
2399}