blob: 9bdcf31b760eee5a35a0cba61c69f734b056f02c [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
9 *
10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
11 * Peter Kese <peter.kese@ijs.si>
12 * Julian Anastasov <ja@ssi.bg>
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 *
19 * Changes:
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/init.h>
25#include <linux/types.h>
26#include <linux/fs.h>
27#include <linux/sysctl.h>
28#include <linux/proc_fs.h>
29#include <linux/workqueue.h>
30#include <linux/swap.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33
34#include <linux/netfilter.h>
35#include <linux/netfilter_ipv4.h>
36
37#include <net/ip.h>
38#include <net/sock.h>
39
40#include <asm/uaccess.h>
41
42#include <net/ip_vs.h>
43
44/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
45static DECLARE_MUTEX(__ip_vs_mutex);
46
47/* lock for service table */
48static DEFINE_RWLOCK(__ip_vs_svc_lock);
49
50/* lock for table with the real services */
51static DEFINE_RWLOCK(__ip_vs_rs_lock);
52
53/* lock for state and timeout tables */
54static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
55
56/* lock for drop entry handling */
57static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
58
59/* lock for drop packet handling */
60static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
61
62/* 1/rate drop and drop-entry variables */
63int ip_vs_drop_rate = 0;
64int ip_vs_drop_counter = 0;
65static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
66
67/* number of virtual services */
68static int ip_vs_num_services = 0;
69
70/* sysctl variables */
71static int sysctl_ip_vs_drop_entry = 0;
72static int sysctl_ip_vs_drop_packet = 0;
73static int sysctl_ip_vs_secure_tcp = 0;
74static int sysctl_ip_vs_amemthresh = 1024;
75static int sysctl_ip_vs_am_droprate = 10;
76int sysctl_ip_vs_cache_bypass = 0;
77int sysctl_ip_vs_expire_nodest_conn = 0;
78int sysctl_ip_vs_expire_quiescent_template = 0;
79int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
80int sysctl_ip_vs_nat_icmp_send = 0;
81
82
#ifdef CONFIG_IP_VS_DEBUG
/* current debug level (one of the sysctl variables) */
static int sysctl_ip_vs_debug_level;

/*
 *	Non-static accessor returning the current debug level.
 *	Only built when CONFIG_IP_VS_DEBUG is set.
 */
int ip_vs_get_debug_level(void)
{
	int level = sysctl_ip_vs_debug_level;

	return level;
}
#endif
91
92/*
Julian Anastasovaf9debd2005-07-11 20:59:57 -070093 * update_defense_level is called from keventd and from sysctl,
94 * so it needs to protect itself from softirqs
Linus Torvalds1da177e2005-04-16 15:20:36 -070095 */
96static void update_defense_level(void)
97{
98 struct sysinfo i;
99 static int old_secure_tcp = 0;
100 int availmem;
101 int nomem;
102 int to_change = -1;
103
104 /* we only count free and buffered memory (in pages) */
105 si_meminfo(&i);
106 availmem = i.freeram + i.bufferram;
107 /* however in linux 2.5 the i.bufferram is total page cache size,
108 we need adjust it */
109 /* si_swapinfo(&i); */
110 /* availmem = availmem - (i.totalswap - i.freeswap); */
111
112 nomem = (availmem < sysctl_ip_vs_amemthresh);
113
Julian Anastasovaf9debd2005-07-11 20:59:57 -0700114 local_bh_disable();
115
Linus Torvalds1da177e2005-04-16 15:20:36 -0700116 /* drop_entry */
117 spin_lock(&__ip_vs_dropentry_lock);
118 switch (sysctl_ip_vs_drop_entry) {
119 case 0:
120 atomic_set(&ip_vs_dropentry, 0);
121 break;
122 case 1:
123 if (nomem) {
124 atomic_set(&ip_vs_dropentry, 1);
125 sysctl_ip_vs_drop_entry = 2;
126 } else {
127 atomic_set(&ip_vs_dropentry, 0);
128 }
129 break;
130 case 2:
131 if (nomem) {
132 atomic_set(&ip_vs_dropentry, 1);
133 } else {
134 atomic_set(&ip_vs_dropentry, 0);
135 sysctl_ip_vs_drop_entry = 1;
136 };
137 break;
138 case 3:
139 atomic_set(&ip_vs_dropentry, 1);
140 break;
141 }
142 spin_unlock(&__ip_vs_dropentry_lock);
143
144 /* drop_packet */
145 spin_lock(&__ip_vs_droppacket_lock);
146 switch (sysctl_ip_vs_drop_packet) {
147 case 0:
148 ip_vs_drop_rate = 0;
149 break;
150 case 1:
151 if (nomem) {
152 ip_vs_drop_rate = ip_vs_drop_counter
153 = sysctl_ip_vs_amemthresh /
154 (sysctl_ip_vs_amemthresh-availmem);
155 sysctl_ip_vs_drop_packet = 2;
156 } else {
157 ip_vs_drop_rate = 0;
158 }
159 break;
160 case 2:
161 if (nomem) {
162 ip_vs_drop_rate = ip_vs_drop_counter
163 = sysctl_ip_vs_amemthresh /
164 (sysctl_ip_vs_amemthresh-availmem);
165 } else {
166 ip_vs_drop_rate = 0;
167 sysctl_ip_vs_drop_packet = 1;
168 }
169 break;
170 case 3:
171 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
172 break;
173 }
174 spin_unlock(&__ip_vs_droppacket_lock);
175
176 /* secure_tcp */
177 write_lock(&__ip_vs_securetcp_lock);
178 switch (sysctl_ip_vs_secure_tcp) {
179 case 0:
180 if (old_secure_tcp >= 2)
181 to_change = 0;
182 break;
183 case 1:
184 if (nomem) {
185 if (old_secure_tcp < 2)
186 to_change = 1;
187 sysctl_ip_vs_secure_tcp = 2;
188 } else {
189 if (old_secure_tcp >= 2)
190 to_change = 0;
191 }
192 break;
193 case 2:
194 if (nomem) {
195 if (old_secure_tcp < 2)
196 to_change = 1;
197 } else {
198 if (old_secure_tcp >= 2)
199 to_change = 0;
200 sysctl_ip_vs_secure_tcp = 1;
201 }
202 break;
203 case 3:
204 if (old_secure_tcp < 2)
205 to_change = 1;
206 break;
207 }
208 old_secure_tcp = sysctl_ip_vs_secure_tcp;
209 if (to_change >= 0)
210 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
211 write_unlock(&__ip_vs_securetcp_lock);
Julian Anastasovaf9debd2005-07-11 20:59:57 -0700212
213 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700214}
215
216
217/*
218 * Timer for checking the defense
219 */
220#define DEFENSE_TIMER_PERIOD 1*HZ
221static void defense_work_handler(void *data);
222static DECLARE_WORK(defense_work, defense_work_handler, NULL);
223
224static void defense_work_handler(void *data)
225{
226 update_defense_level();
227 if (atomic_read(&ip_vs_dropentry))
228 ip_vs_random_dropentry();
229
230 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
231}
232
233int
234ip_vs_use_count_inc(void)
235{
236 return try_module_get(THIS_MODULE);
237}
238
239void
240ip_vs_use_count_dec(void)
241{
242 module_put(THIS_MODULE);
243}
244
245
246/*
247 * Hash table: for virtual service lookups
248 */
249#define IP_VS_SVC_TAB_BITS 8
250#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
251#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
252
253/* the service table hashed by <protocol, addr, port> */
254static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
255/* the service table hashed by fwmark */
256static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
257
258/*
259 * Hash table: for real service lookups
260 */
261#define IP_VS_RTAB_BITS 4
262#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
263#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
264
265static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
266
267/*
268 * Trash for destinations
269 */
270static LIST_HEAD(ip_vs_dest_trash);
271
272/*
273 * FTP & NULL virtual service counters
274 */
275static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
276static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
277
278
279/*
280 * Returns hash value for virtual service
281 */
282static __inline__ unsigned
283ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
284{
285 register unsigned porth = ntohs(port);
286
287 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
288 & IP_VS_SVC_TAB_MASK;
289}
290
291/*
292 * Returns hash value of fwmark for virtual service lookup
293 */
294static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
295{
296 return fwmark & IP_VS_SVC_TAB_MASK;
297}
298
299/*
300 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
301 * or in the ip_vs_svc_fwm_table by fwmark.
302 * Should be called with locked tables.
303 */
304static int ip_vs_svc_hash(struct ip_vs_service *svc)
305{
306 unsigned hash;
307
308 if (svc->flags & IP_VS_SVC_F_HASHED) {
309 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
310 "called from %p\n", __builtin_return_address(0));
311 return 0;
312 }
313
314 if (svc->fwmark == 0) {
315 /*
316 * Hash it by <protocol,addr,port> in ip_vs_svc_table
317 */
318 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
319 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
320 } else {
321 /*
322 * Hash it by fwmark in ip_vs_svc_fwm_table
323 */
324 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
325 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
326 }
327
328 svc->flags |= IP_VS_SVC_F_HASHED;
329 /* increase its refcnt because it is referenced by the svc table */
330 atomic_inc(&svc->refcnt);
331 return 1;
332}
333
334
335/*
336 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
337 * Should be called with locked tables.
338 */
339static int ip_vs_svc_unhash(struct ip_vs_service *svc)
340{
341 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
342 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
343 "called from %p\n", __builtin_return_address(0));
344 return 0;
345 }
346
347 if (svc->fwmark == 0) {
348 /* Remove it from the ip_vs_svc_table table */
349 list_del(&svc->s_list);
350 } else {
351 /* Remove it from the ip_vs_svc_fwm_table table */
352 list_del(&svc->f_list);
353 }
354
355 svc->flags &= ~IP_VS_SVC_F_HASHED;
356 atomic_dec(&svc->refcnt);
357 return 1;
358}
359
360
361/*
362 * Get service by {proto,addr,port} in the service table.
363 */
364static __inline__ struct ip_vs_service *
365__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
366{
367 unsigned hash;
368 struct ip_vs_service *svc;
369
370 /* Check for "full" addressed entries */
371 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
372
373 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
374 if ((svc->addr == vaddr)
375 && (svc->port == vport)
376 && (svc->protocol == protocol)) {
377 /* HIT */
378 atomic_inc(&svc->usecnt);
379 return svc;
380 }
381 }
382
383 return NULL;
384}
385
386
387/*
388 * Get service by {fwmark} in the service table.
389 */
390static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
391{
392 unsigned hash;
393 struct ip_vs_service *svc;
394
395 /* Check for fwmark addressed entries */
396 hash = ip_vs_svc_fwm_hashkey(fwmark);
397
398 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
399 if (svc->fwmark == fwmark) {
400 /* HIT */
401 atomic_inc(&svc->usecnt);
402 return svc;
403 }
404 }
405
406 return NULL;
407}
408
409struct ip_vs_service *
410ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
411{
412 struct ip_vs_service *svc;
413
414 read_lock(&__ip_vs_svc_lock);
415
416 /*
417 * Check the table hashed by fwmark first
418 */
419 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
420 goto out;
421
422 /*
423 * Check the table hashed by <protocol,addr,port>
424 * for "full" addressed entries
425 */
426 svc = __ip_vs_service_get(protocol, vaddr, vport);
427
428 if (svc == NULL
429 && protocol == IPPROTO_TCP
430 && atomic_read(&ip_vs_ftpsvc_counter)
431 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
432 /*
433 * Check if ftp service entry exists, the packet
434 * might belong to FTP data connections.
435 */
436 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
437 }
438
439 if (svc == NULL
440 && atomic_read(&ip_vs_nullsvc_counter)) {
441 /*
442 * Check if the catch-all port (port zero) exists
443 */
444 svc = __ip_vs_service_get(protocol, vaddr, 0);
445 }
446
447 out:
448 read_unlock(&__ip_vs_svc_lock);
449
450 IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
451 fwmark, ip_vs_proto_name(protocol),
452 NIPQUAD(vaddr), ntohs(vport),
453 svc?"hit":"not hit");
454
455 return svc;
456}
457
458
459static inline void
460__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
461{
462 atomic_inc(&svc->refcnt);
463 dest->svc = svc;
464}
465
466static inline void
467__ip_vs_unbind_svc(struct ip_vs_dest *dest)
468{
469 struct ip_vs_service *svc = dest->svc;
470
471 dest->svc = NULL;
472 if (atomic_dec_and_test(&svc->refcnt))
473 kfree(svc);
474}
475
476
477/*
478 * Returns hash value for real service
479 */
480static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
481{
482 register unsigned porth = ntohs(port);
483
484 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
485 & IP_VS_RTAB_MASK;
486}
487
488/*
489 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
490 * should be called with locked tables.
491 */
492static int ip_vs_rs_hash(struct ip_vs_dest *dest)
493{
494 unsigned hash;
495
496 if (!list_empty(&dest->d_list)) {
497 return 0;
498 }
499
500 /*
501 * Hash by proto,addr,port,
502 * which are the parameters of the real service.
503 */
504 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
505 list_add(&dest->d_list, &ip_vs_rtable[hash]);
506
507 return 1;
508}
509
510/*
511 * UNhashes ip_vs_dest from ip_vs_rtable.
512 * should be called with locked tables.
513 */
514static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
515{
516 /*
517 * Remove it from the ip_vs_rtable table.
518 */
519 if (!list_empty(&dest->d_list)) {
520 list_del(&dest->d_list);
521 INIT_LIST_HEAD(&dest->d_list);
522 }
523
524 return 1;
525}
526
527/*
528 * Lookup real service by <proto,addr,port> in the real service table.
529 */
530struct ip_vs_dest *
531ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
532{
533 unsigned hash;
534 struct ip_vs_dest *dest;
535
536 /*
537 * Check for "full" addressed entries
538 * Return the first found entry
539 */
540 hash = ip_vs_rs_hashkey(daddr, dport);
541
542 read_lock(&__ip_vs_rs_lock);
543 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
544 if ((dest->addr == daddr)
545 && (dest->port == dport)
546 && ((dest->protocol == protocol) ||
547 dest->vfwmark)) {
548 /* HIT */
549 read_unlock(&__ip_vs_rs_lock);
550 return dest;
551 }
552 }
553 read_unlock(&__ip_vs_rs_lock);
554
555 return NULL;
556}
557
558/*
559 * Lookup destination by {addr,port} in the given service
560 */
561static struct ip_vs_dest *
562ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
563{
564 struct ip_vs_dest *dest;
565
566 /*
567 * Find the destination for the given service
568 */
569 list_for_each_entry(dest, &svc->destinations, n_list) {
570 if ((dest->addr == daddr) && (dest->port == dport)) {
571 /* HIT */
572 return dest;
573 }
574 }
575
576 return NULL;
577}
578
579
580/*
581 * Lookup dest by {svc,addr,port} in the destination trash.
582 * The destination trash is used to hold the destinations that are removed
583 * from the service table but are still referenced by some conn entries.
584 * The reason to add the destination trash is when the dest is temporary
585 * down (either by administrator or by monitor program), the dest can be
586 * picked back from the trash, the remaining connections to the dest can
587 * continue, and the counting information of the dest is also useful for
588 * scheduling.
589 */
590static struct ip_vs_dest *
591ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
592{
593 struct ip_vs_dest *dest, *nxt;
594
595 /*
596 * Find the destination in trash
597 */
598 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
599 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
600 "refcnt=%d\n",
601 dest->vfwmark,
602 NIPQUAD(dest->addr), ntohs(dest->port),
603 atomic_read(&dest->refcnt));
604 if (dest->addr == daddr &&
605 dest->port == dport &&
606 dest->vfwmark == svc->fwmark &&
607 dest->protocol == svc->protocol &&
608 (svc->fwmark ||
609 (dest->vaddr == svc->addr &&
610 dest->vport == svc->port))) {
611 /* HIT */
612 return dest;
613 }
614
615 /*
616 * Try to purge the destination from trash if not referenced
617 */
618 if (atomic_read(&dest->refcnt) == 1) {
619 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
620 "from trash\n",
621 dest->vfwmark,
622 NIPQUAD(dest->addr), ntohs(dest->port));
623 list_del(&dest->n_list);
624 ip_vs_dst_reset(dest);
625 __ip_vs_unbind_svc(dest);
626 kfree(dest);
627 }
628 }
629
630 return NULL;
631}
632
633
634/*
635 * Clean up all the destinations in the trash
636 * Called by the ip_vs_control_cleanup()
637 *
638 * When the ip_vs_control_clearup is activated by ipvs module exit,
639 * the service tables must have been flushed and all the connections
640 * are expired, and the refcnt of each destination in the trash must
641 * be 1, so we simply release them here.
642 */
643static void ip_vs_trash_cleanup(void)
644{
645 struct ip_vs_dest *dest, *nxt;
646
647 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
648 list_del(&dest->n_list);
649 ip_vs_dst_reset(dest);
650 __ip_vs_unbind_svc(dest);
651 kfree(dest);
652 }
653}
654
655
656static void
657ip_vs_zero_stats(struct ip_vs_stats *stats)
658{
659 spin_lock_bh(&stats->lock);
660 memset(stats, 0, (char *)&stats->lock - (char *)stats);
661 spin_unlock_bh(&stats->lock);
662 ip_vs_zero_estimator(stats);
663}
664
665/*
666 * Update a destination in the given service
667 */
668static void
669__ip_vs_update_dest(struct ip_vs_service *svc,
670 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
671{
672 int conn_flags;
673
674 /* set the weight and the flags */
675 atomic_set(&dest->weight, udest->weight);
676 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
677
678 /* check if local node and update the flags */
679 if (inet_addr_type(udest->addr) == RTN_LOCAL) {
680 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
681 | IP_VS_CONN_F_LOCALNODE;
682 }
683
684 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
685 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
686 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
687 } else {
688 /*
689 * Put the real service in ip_vs_rtable if not present.
690 * For now only for NAT!
691 */
692 write_lock_bh(&__ip_vs_rs_lock);
693 ip_vs_rs_hash(dest);
694 write_unlock_bh(&__ip_vs_rs_lock);
695 }
696 atomic_set(&dest->conn_flags, conn_flags);
697
698 /* bind the service */
699 if (!dest->svc) {
700 __ip_vs_bind_svc(dest, svc);
701 } else {
702 if (dest->svc != svc) {
703 __ip_vs_unbind_svc(dest);
704 ip_vs_zero_stats(&dest->stats);
705 __ip_vs_bind_svc(dest, svc);
706 }
707 }
708
709 /* set the dest status flags */
710 dest->flags |= IP_VS_DEST_F_AVAILABLE;
711
712 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
713 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
714 dest->u_threshold = udest->u_threshold;
715 dest->l_threshold = udest->l_threshold;
716}
717
718
719/*
720 * Create a destination for the given service
721 */
722static int
723ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
724 struct ip_vs_dest **dest_p)
725{
726 struct ip_vs_dest *dest;
727 unsigned atype;
728
729 EnterFunction(2);
730
731 atype = inet_addr_type(udest->addr);
732 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
733 return -EINVAL;
734
735 dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
736 if (dest == NULL) {
737 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
738 return -ENOMEM;
739 }
740 memset(dest, 0, sizeof(struct ip_vs_dest));
741
742 dest->protocol = svc->protocol;
743 dest->vaddr = svc->addr;
744 dest->vport = svc->port;
745 dest->vfwmark = svc->fwmark;
746 dest->addr = udest->addr;
747 dest->port = udest->port;
748
749 atomic_set(&dest->activeconns, 0);
750 atomic_set(&dest->inactconns, 0);
751 atomic_set(&dest->persistconns, 0);
752 atomic_set(&dest->refcnt, 0);
753
754 INIT_LIST_HEAD(&dest->d_list);
755 spin_lock_init(&dest->dst_lock);
756 spin_lock_init(&dest->stats.lock);
757 __ip_vs_update_dest(svc, dest, udest);
758 ip_vs_new_estimator(&dest->stats);
759
760 *dest_p = dest;
761
762 LeaveFunction(2);
763 return 0;
764}
765
766
767/*
768 * Add a destination into an existing service
769 */
770static int
771ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
772{
773 struct ip_vs_dest *dest;
774 __u32 daddr = udest->addr;
775 __u16 dport = udest->port;
776 int ret;
777
778 EnterFunction(2);
779
780 if (udest->weight < 0) {
781 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
782 return -ERANGE;
783 }
784
785 if (udest->l_threshold > udest->u_threshold) {
786 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
787 "upper threshold\n");
788 return -ERANGE;
789 }
790
791 /*
792 * Check if the dest already exists in the list
793 */
794 dest = ip_vs_lookup_dest(svc, daddr, dport);
795 if (dest != NULL) {
796 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
797 return -EEXIST;
798 }
799
800 /*
801 * Check if the dest already exists in the trash and
802 * is from the same service
803 */
804 dest = ip_vs_trash_get_dest(svc, daddr, dport);
805 if (dest != NULL) {
806 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
807 "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
808 NIPQUAD(daddr), ntohs(dport),
809 atomic_read(&dest->refcnt),
810 dest->vfwmark,
811 NIPQUAD(dest->vaddr),
812 ntohs(dest->vport));
813 __ip_vs_update_dest(svc, dest, udest);
814
815 /*
816 * Get the destination from the trash
817 */
818 list_del(&dest->n_list);
819
820 ip_vs_new_estimator(&dest->stats);
821
822 write_lock_bh(&__ip_vs_svc_lock);
823
824 /*
825 * Wait until all other svc users go away.
826 */
827 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
828
829 list_add(&dest->n_list, &svc->destinations);
830 svc->num_dests++;
831
832 /* call the update_service function of its scheduler */
833 svc->scheduler->update_service(svc);
834
835 write_unlock_bh(&__ip_vs_svc_lock);
836 return 0;
837 }
838
839 /*
840 * Allocate and initialize the dest structure
841 */
842 ret = ip_vs_new_dest(svc, udest, &dest);
843 if (ret) {
844 return ret;
845 }
846
847 /*
848 * Add the dest entry into the list
849 */
850 atomic_inc(&dest->refcnt);
851
852 write_lock_bh(&__ip_vs_svc_lock);
853
854 /*
855 * Wait until all other svc users go away.
856 */
857 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
858
859 list_add(&dest->n_list, &svc->destinations);
860 svc->num_dests++;
861
862 /* call the update_service function of its scheduler */
863 svc->scheduler->update_service(svc);
864
865 write_unlock_bh(&__ip_vs_svc_lock);
866
867 LeaveFunction(2);
868
869 return 0;
870}
871
872
873/*
874 * Edit a destination in the given service
875 */
876static int
877ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
878{
879 struct ip_vs_dest *dest;
880 __u32 daddr = udest->addr;
881 __u16 dport = udest->port;
882
883 EnterFunction(2);
884
885 if (udest->weight < 0) {
886 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
887 return -ERANGE;
888 }
889
890 if (udest->l_threshold > udest->u_threshold) {
891 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
892 "upper threshold\n");
893 return -ERANGE;
894 }
895
896 /*
897 * Lookup the destination list
898 */
899 dest = ip_vs_lookup_dest(svc, daddr, dport);
900 if (dest == NULL) {
901 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
902 return -ENOENT;
903 }
904
905 __ip_vs_update_dest(svc, dest, udest);
906
907 write_lock_bh(&__ip_vs_svc_lock);
908
909 /* Wait until all other svc users go away */
910 while (atomic_read(&svc->usecnt) > 1) {};
911
912 /* call the update_service, because server weight may be changed */
913 svc->scheduler->update_service(svc);
914
915 write_unlock_bh(&__ip_vs_svc_lock);
916
917 LeaveFunction(2);
918
919 return 0;
920}
921
922
923/*
924 * Delete a destination (must be already unlinked from the service)
925 */
926static void __ip_vs_del_dest(struct ip_vs_dest *dest)
927{
928 ip_vs_kill_estimator(&dest->stats);
929
930 /*
931 * Remove it from the d-linked list with the real services.
932 */
933 write_lock_bh(&__ip_vs_rs_lock);
934 ip_vs_rs_unhash(dest);
935 write_unlock_bh(&__ip_vs_rs_lock);
936
937 /*
938 * Decrease the refcnt of the dest, and free the dest
939 * if nobody refers to it (refcnt=0). Otherwise, throw
940 * the destination into the trash.
941 */
942 if (atomic_dec_and_test(&dest->refcnt)) {
943 ip_vs_dst_reset(dest);
944 /* simply decrease svc->refcnt here, let the caller check
945 and release the service if nobody refers to it.
946 Only user context can release destination and service,
947 and only one user context can update virtual service at a
948 time, so the operation here is OK */
949 atomic_dec(&dest->svc->refcnt);
950 kfree(dest);
951 } else {
952 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
953 NIPQUAD(dest->addr), ntohs(dest->port),
954 atomic_read(&dest->refcnt));
955 list_add(&dest->n_list, &ip_vs_dest_trash);
956 atomic_inc(&dest->refcnt);
957 }
958}
959
960
961/*
962 * Unlink a destination from the given service
963 */
964static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
965 struct ip_vs_dest *dest,
966 int svcupd)
967{
968 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
969
970 /*
971 * Remove it from the d-linked destination list.
972 */
973 list_del(&dest->n_list);
974 svc->num_dests--;
975 if (svcupd) {
976 /*
977 * Call the update_service function of its scheduler
978 */
979 svc->scheduler->update_service(svc);
980 }
981}
982
983
984/*
985 * Delete a destination server in the given service
986 */
987static int
988ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
989{
990 struct ip_vs_dest *dest;
991 __u32 daddr = udest->addr;
992 __u16 dport = udest->port;
993
994 EnterFunction(2);
995
996 dest = ip_vs_lookup_dest(svc, daddr, dport);
997 if (dest == NULL) {
998 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
999 return -ENOENT;
1000 }
1001
1002 write_lock_bh(&__ip_vs_svc_lock);
1003
1004 /*
1005 * Wait until all other svc users go away.
1006 */
1007 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1008
1009 /*
1010 * Unlink dest from the service
1011 */
1012 __ip_vs_unlink_dest(svc, dest, 1);
1013
1014 write_unlock_bh(&__ip_vs_svc_lock);
1015
1016 /*
1017 * Delete the destination
1018 */
1019 __ip_vs_del_dest(dest);
1020
1021 LeaveFunction(2);
1022
1023 return 0;
1024}
1025
1026
1027/*
1028 * Add a service into the service hash table
1029 */
1030static int
1031ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1032{
1033 int ret = 0;
1034 struct ip_vs_scheduler *sched = NULL;
1035 struct ip_vs_service *svc = NULL;
1036
1037 /* increase the module use count */
1038 ip_vs_use_count_inc();
1039
1040 /* Lookup the scheduler by 'u->sched_name' */
1041 sched = ip_vs_scheduler_get(u->sched_name);
1042 if (sched == NULL) {
1043 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1044 u->sched_name);
1045 ret = -ENOENT;
1046 goto out_mod_dec;
1047 }
1048
1049 svc = (struct ip_vs_service *)
1050 kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1051 if (svc == NULL) {
1052 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1053 ret = -ENOMEM;
1054 goto out_err;
1055 }
1056 memset(svc, 0, sizeof(struct ip_vs_service));
1057
1058 /* I'm the first user of the service */
1059 atomic_set(&svc->usecnt, 1);
1060 atomic_set(&svc->refcnt, 0);
1061
1062 svc->protocol = u->protocol;
1063 svc->addr = u->addr;
1064 svc->port = u->port;
1065 svc->fwmark = u->fwmark;
1066 svc->flags = u->flags;
1067 svc->timeout = u->timeout * HZ;
1068 svc->netmask = u->netmask;
1069
1070 INIT_LIST_HEAD(&svc->destinations);
1071 rwlock_init(&svc->sched_lock);
1072 spin_lock_init(&svc->stats.lock);
1073
1074 /* Bind the scheduler */
1075 ret = ip_vs_bind_scheduler(svc, sched);
1076 if (ret)
1077 goto out_err;
1078 sched = NULL;
1079
1080 /* Update the virtual service counters */
1081 if (svc->port == FTPPORT)
1082 atomic_inc(&ip_vs_ftpsvc_counter);
1083 else if (svc->port == 0)
1084 atomic_inc(&ip_vs_nullsvc_counter);
1085
1086 ip_vs_new_estimator(&svc->stats);
1087 ip_vs_num_services++;
1088
1089 /* Hash the service into the service table */
1090 write_lock_bh(&__ip_vs_svc_lock);
1091 ip_vs_svc_hash(svc);
1092 write_unlock_bh(&__ip_vs_svc_lock);
1093
1094 *svc_p = svc;
1095 return 0;
1096
1097 out_err:
1098 if (svc != NULL) {
1099 if (svc->scheduler)
1100 ip_vs_unbind_scheduler(svc);
1101 if (svc->inc) {
1102 local_bh_disable();
1103 ip_vs_app_inc_put(svc->inc);
1104 local_bh_enable();
1105 }
1106 kfree(svc);
1107 }
1108 ip_vs_scheduler_put(sched);
1109
1110 out_mod_dec:
1111 /* decrease the module use count */
1112 ip_vs_use_count_dec();
1113
1114 return ret;
1115}
1116
1117
1118/*
1119 * Edit a service and bind it with a new scheduler
1120 */
1121static int
1122ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1123{
1124 struct ip_vs_scheduler *sched, *old_sched;
1125 int ret = 0;
1126
1127 /*
1128 * Lookup the scheduler, by 'u->sched_name'
1129 */
1130 sched = ip_vs_scheduler_get(u->sched_name);
1131 if (sched == NULL) {
1132 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1133 u->sched_name);
1134 return -ENOENT;
1135 }
1136 old_sched = sched;
1137
1138 write_lock_bh(&__ip_vs_svc_lock);
1139
1140 /*
1141 * Wait until all other svc users go away.
1142 */
1143 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1144
1145 /*
1146 * Set the flags and timeout value
1147 */
1148 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1149 svc->timeout = u->timeout * HZ;
1150 svc->netmask = u->netmask;
1151
1152 old_sched = svc->scheduler;
1153 if (sched != old_sched) {
1154 /*
1155 * Unbind the old scheduler
1156 */
1157 if ((ret = ip_vs_unbind_scheduler(svc))) {
1158 old_sched = sched;
1159 goto out;
1160 }
1161
1162 /*
1163 * Bind the new scheduler
1164 */
1165 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1166 /*
1167 * If ip_vs_bind_scheduler fails, restore the old
1168 * scheduler.
1169 * The main reason of failure is out of memory.
1170 *
1171 * The question is if the old scheduler can be
1172 * restored all the time. TODO: if it cannot be
1173 * restored some time, we must delete the service,
1174 * otherwise the system may crash.
1175 */
1176 ip_vs_bind_scheduler(svc, old_sched);
1177 old_sched = sched;
1178 goto out;
1179 }
1180 }
1181
1182 out:
1183 write_unlock_bh(&__ip_vs_svc_lock);
1184
1185 if (old_sched)
1186 ip_vs_scheduler_put(old_sched);
1187
1188 return ret;
1189}
1190
1191
1192/*
1193 * Delete a service from the service list
1194 * - The service must be unlinked, unlocked and not referenced!
1195 * - We are called under _bh lock
1196 */
1197static void __ip_vs_del_service(struct ip_vs_service *svc)
1198{
1199 struct ip_vs_dest *dest, *nxt;
1200 struct ip_vs_scheduler *old_sched;
1201
1202 ip_vs_num_services--;
1203 ip_vs_kill_estimator(&svc->stats);
1204
1205 /* Unbind scheduler */
1206 old_sched = svc->scheduler;
1207 ip_vs_unbind_scheduler(svc);
1208 if (old_sched)
1209 ip_vs_scheduler_put(old_sched);
1210
1211 /* Unbind app inc */
1212 if (svc->inc) {
1213 ip_vs_app_inc_put(svc->inc);
1214 svc->inc = NULL;
1215 }
1216
1217 /*
1218 * Unlink the whole destination list
1219 */
1220 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1221 __ip_vs_unlink_dest(svc, dest, 0);
1222 __ip_vs_del_dest(dest);
1223 }
1224
1225 /*
1226 * Update the virtual service counters
1227 */
1228 if (svc->port == FTPPORT)
1229 atomic_dec(&ip_vs_ftpsvc_counter);
1230 else if (svc->port == 0)
1231 atomic_dec(&ip_vs_nullsvc_counter);
1232
1233 /*
1234 * Free the service if nobody refers to it
1235 */
1236 if (atomic_read(&svc->refcnt) == 0)
1237 kfree(svc);
1238
1239 /* decrease the module use count */
1240 ip_vs_use_count_dec();
1241}
1242
1243/*
1244 * Delete a service from the service list
1245 */
1246static int ip_vs_del_service(struct ip_vs_service *svc)
1247{
1248 if (svc == NULL)
1249 return -EEXIST;
1250
1251 /*
1252 * Unhash it from the service table
1253 */
1254 write_lock_bh(&__ip_vs_svc_lock);
1255
1256 ip_vs_svc_unhash(svc);
1257
1258 /*
1259 * Wait until all the svc users go away.
1260 */
1261 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1262
1263 __ip_vs_del_service(svc);
1264
1265 write_unlock_bh(&__ip_vs_svc_lock);
1266
1267 return 0;
1268}
1269
1270
1271/*
1272 * Flush all the virtual services
1273 */
1274static int ip_vs_flush(void)
1275{
1276 int idx;
1277 struct ip_vs_service *svc, *nxt;
1278
1279 /*
1280 * Flush the service table hashed by <protocol,addr,port>
1281 */
1282 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1283 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1284 write_lock_bh(&__ip_vs_svc_lock);
1285 ip_vs_svc_unhash(svc);
1286 /*
1287 * Wait until all the svc users go away.
1288 */
1289 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1290 __ip_vs_del_service(svc);
1291 write_unlock_bh(&__ip_vs_svc_lock);
1292 }
1293 }
1294
1295 /*
1296 * Flush the service table hashed by fwmark
1297 */
1298 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1299 list_for_each_entry_safe(svc, nxt,
1300 &ip_vs_svc_fwm_table[idx], f_list) {
1301 write_lock_bh(&__ip_vs_svc_lock);
1302 ip_vs_svc_unhash(svc);
1303 /*
1304 * Wait until all the svc users go away.
1305 */
1306 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1307 __ip_vs_del_service(svc);
1308 write_unlock_bh(&__ip_vs_svc_lock);
1309 }
1310 }
1311
1312 return 0;
1313}
1314
1315
1316/*
1317 * Zero counters in a service or all services
1318 */
1319static int ip_vs_zero_service(struct ip_vs_service *svc)
1320{
1321 struct ip_vs_dest *dest;
1322
1323 write_lock_bh(&__ip_vs_svc_lock);
1324 list_for_each_entry(dest, &svc->destinations, n_list) {
1325 ip_vs_zero_stats(&dest->stats);
1326 }
1327 ip_vs_zero_stats(&svc->stats);
1328 write_unlock_bh(&__ip_vs_svc_lock);
1329 return 0;
1330}
1331
1332static int ip_vs_zero_all(void)
1333{
1334 int idx;
1335 struct ip_vs_service *svc;
1336
1337 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1338 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1339 ip_vs_zero_service(svc);
1340 }
1341 }
1342
1343 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1344 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1345 ip_vs_zero_service(svc);
1346 }
1347 }
1348
1349 ip_vs_zero_stats(&ip_vs_stats);
1350 return 0;
1351}
1352
1353
1354static int
1355proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1356 void __user *buffer, size_t *lenp, loff_t *ppos)
1357{
1358 int *valp = table->data;
1359 int val = *valp;
1360 int rc;
1361
1362 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1363 if (write && (*valp != val)) {
1364 if ((*valp < 0) || (*valp > 3)) {
1365 /* Restore the correct value */
1366 *valp = val;
1367 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001368 update_defense_level();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001369 }
1370 }
1371 return rc;
1372}
1373
1374
1375static int
1376proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1377 void __user *buffer, size_t *lenp, loff_t *ppos)
1378{
1379 int *valp = table->data;
1380 int val[2];
1381 int rc;
1382
1383 /* backup the value first */
1384 memcpy(val, valp, sizeof(val));
1385
1386 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1387 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1388 /* Restore the correct value */
1389 memcpy(valp, val, sizeof(val));
1390 }
1391 return rc;
1392}
1393
1394
1395/*
1396 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1397 */
1398
1399static struct ctl_table vs_vars[] = {
1400 {
1401 .ctl_name = NET_IPV4_VS_AMEMTHRESH,
1402 .procname = "amemthresh",
1403 .data = &sysctl_ip_vs_amemthresh,
1404 .maxlen = sizeof(int),
1405 .mode = 0644,
1406 .proc_handler = &proc_dointvec,
1407 },
1408#ifdef CONFIG_IP_VS_DEBUG
1409 {
1410 .ctl_name = NET_IPV4_VS_DEBUG_LEVEL,
1411 .procname = "debug_level",
1412 .data = &sysctl_ip_vs_debug_level,
1413 .maxlen = sizeof(int),
1414 .mode = 0644,
1415 .proc_handler = &proc_dointvec,
1416 },
1417#endif
1418 {
1419 .ctl_name = NET_IPV4_VS_AMDROPRATE,
1420 .procname = "am_droprate",
1421 .data = &sysctl_ip_vs_am_droprate,
1422 .maxlen = sizeof(int),
1423 .mode = 0644,
1424 .proc_handler = &proc_dointvec,
1425 },
1426 {
1427 .ctl_name = NET_IPV4_VS_DROP_ENTRY,
1428 .procname = "drop_entry",
1429 .data = &sysctl_ip_vs_drop_entry,
1430 .maxlen = sizeof(int),
1431 .mode = 0644,
1432 .proc_handler = &proc_do_defense_mode,
1433 },
1434 {
1435 .ctl_name = NET_IPV4_VS_DROP_PACKET,
1436 .procname = "drop_packet",
1437 .data = &sysctl_ip_vs_drop_packet,
1438 .maxlen = sizeof(int),
1439 .mode = 0644,
1440 .proc_handler = &proc_do_defense_mode,
1441 },
1442 {
1443 .ctl_name = NET_IPV4_VS_SECURE_TCP,
1444 .procname = "secure_tcp",
1445 .data = &sysctl_ip_vs_secure_tcp,
1446 .maxlen = sizeof(int),
1447 .mode = 0644,
1448 .proc_handler = &proc_do_defense_mode,
1449 },
1450#if 0
1451 {
1452 .ctl_name = NET_IPV4_VS_TO_ES,
1453 .procname = "timeout_established",
1454 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1455 .maxlen = sizeof(int),
1456 .mode = 0644,
1457 .proc_handler = &proc_dointvec_jiffies,
1458 },
1459 {
1460 .ctl_name = NET_IPV4_VS_TO_SS,
1461 .procname = "timeout_synsent",
1462 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1463 .maxlen = sizeof(int),
1464 .mode = 0644,
1465 .proc_handler = &proc_dointvec_jiffies,
1466 },
1467 {
1468 .ctl_name = NET_IPV4_VS_TO_SR,
1469 .procname = "timeout_synrecv",
1470 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1471 .maxlen = sizeof(int),
1472 .mode = 0644,
1473 .proc_handler = &proc_dointvec_jiffies,
1474 },
1475 {
1476 .ctl_name = NET_IPV4_VS_TO_FW,
1477 .procname = "timeout_finwait",
1478 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1479 .maxlen = sizeof(int),
1480 .mode = 0644,
1481 .proc_handler = &proc_dointvec_jiffies,
1482 },
1483 {
1484 .ctl_name = NET_IPV4_VS_TO_TW,
1485 .procname = "timeout_timewait",
1486 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1487 .maxlen = sizeof(int),
1488 .mode = 0644,
1489 .proc_handler = &proc_dointvec_jiffies,
1490 },
1491 {
1492 .ctl_name = NET_IPV4_VS_TO_CL,
1493 .procname = "timeout_close",
1494 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1495 .maxlen = sizeof(int),
1496 .mode = 0644,
1497 .proc_handler = &proc_dointvec_jiffies,
1498 },
1499 {
1500 .ctl_name = NET_IPV4_VS_TO_CW,
1501 .procname = "timeout_closewait",
1502 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1503 .maxlen = sizeof(int),
1504 .mode = 0644,
1505 .proc_handler = &proc_dointvec_jiffies,
1506 },
1507 {
1508 .ctl_name = NET_IPV4_VS_TO_LA,
1509 .procname = "timeout_lastack",
1510 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1511 .maxlen = sizeof(int),
1512 .mode = 0644,
1513 .proc_handler = &proc_dointvec_jiffies,
1514 },
1515 {
1516 .ctl_name = NET_IPV4_VS_TO_LI,
1517 .procname = "timeout_listen",
1518 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1519 .maxlen = sizeof(int),
1520 .mode = 0644,
1521 .proc_handler = &proc_dointvec_jiffies,
1522 },
1523 {
1524 .ctl_name = NET_IPV4_VS_TO_SA,
1525 .procname = "timeout_synack",
1526 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1527 .maxlen = sizeof(int),
1528 .mode = 0644,
1529 .proc_handler = &proc_dointvec_jiffies,
1530 },
1531 {
1532 .ctl_name = NET_IPV4_VS_TO_UDP,
1533 .procname = "timeout_udp",
1534 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1535 .maxlen = sizeof(int),
1536 .mode = 0644,
1537 .proc_handler = &proc_dointvec_jiffies,
1538 },
1539 {
1540 .ctl_name = NET_IPV4_VS_TO_ICMP,
1541 .procname = "timeout_icmp",
1542 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1543 .maxlen = sizeof(int),
1544 .mode = 0644,
1545 .proc_handler = &proc_dointvec_jiffies,
1546 },
1547#endif
1548 {
1549 .ctl_name = NET_IPV4_VS_CACHE_BYPASS,
1550 .procname = "cache_bypass",
1551 .data = &sysctl_ip_vs_cache_bypass,
1552 .maxlen = sizeof(int),
1553 .mode = 0644,
1554 .proc_handler = &proc_dointvec,
1555 },
1556 {
1557 .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN,
1558 .procname = "expire_nodest_conn",
1559 .data = &sysctl_ip_vs_expire_nodest_conn,
1560 .maxlen = sizeof(int),
1561 .mode = 0644,
1562 .proc_handler = &proc_dointvec,
1563 },
1564 {
1565 .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1566 .procname = "expire_quiescent_template",
1567 .data = &sysctl_ip_vs_expire_quiescent_template,
1568 .maxlen = sizeof(int),
1569 .mode = 0644,
1570 .proc_handler = &proc_dointvec,
1571 },
1572 {
1573 .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD,
1574 .procname = "sync_threshold",
1575 .data = &sysctl_ip_vs_sync_threshold,
1576 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1577 .mode = 0644,
1578 .proc_handler = &proc_do_sync_threshold,
1579 },
1580 {
1581 .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND,
1582 .procname = "nat_icmp_send",
1583 .data = &sysctl_ip_vs_nat_icmp_send,
1584 .maxlen = sizeof(int),
1585 .mode = 0644,
1586 .proc_handler = &proc_dointvec,
1587 },
1588 { .ctl_name = 0 }
1589};
1590
1591static ctl_table vs_table[] = {
1592 {
1593 .ctl_name = NET_IPV4_VS,
1594 .procname = "vs",
1595 .mode = 0555,
1596 .child = vs_vars
1597 },
1598 { .ctl_name = 0 }
1599};
1600
David S. Millerbf0ff9e2005-08-19 16:37:30 -07001601static ctl_table ipvs_ipv4_table[] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001602 {
1603 .ctl_name = NET_IPV4,
1604 .procname = "ipv4",
1605 .mode = 0555,
1606 .child = vs_table,
1607 },
1608 { .ctl_name = 0 }
1609};
1610
1611static ctl_table vs_root_table[] = {
1612 {
1613 .ctl_name = CTL_NET,
1614 .procname = "net",
1615 .mode = 0555,
David S. Millerbf0ff9e2005-08-19 16:37:30 -07001616 .child = ipvs_ipv4_table,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001617 },
1618 { .ctl_name = 0 }
1619};
1620
/* Handle from register_sysctl_table(); passed to unregister_sysctl_table() */
static struct ctl_table_header *sysctl_header;
1622
1623#ifdef CONFIG_PROC_FS
1624
/* Cursor kept in seq->private while /proc "ip_vs" is being read */
struct ip_vs_iter {
	struct list_head *table;	/* ip_vs_svc_table or ip_vs_svc_fwm_table */
	int bucket;			/* index of the bucket being walked */
};
1629
1630/*
1631 * Write the contents of the VS rule table to a PROCfs file.
1632 * (It is kept just for backward compatibility)
1633 */
1634static inline const char *ip_vs_fwd_name(unsigned flags)
1635{
1636 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1637 case IP_VS_CONN_F_LOCALNODE:
1638 return "Local";
1639 case IP_VS_CONN_F_TUNNEL:
1640 return "Tunnel";
1641 case IP_VS_CONN_F_DROUTE:
1642 return "Route";
1643 default:
1644 return "Masq";
1645 }
1646}
1647
1648
1649/* Get the Nth entry in the two lists */
1650static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1651{
1652 struct ip_vs_iter *iter = seq->private;
1653 int idx;
1654 struct ip_vs_service *svc;
1655
1656 /* look in hash by protocol */
1657 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1658 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1659 if (pos-- == 0){
1660 iter->table = ip_vs_svc_table;
1661 iter->bucket = idx;
1662 return svc;
1663 }
1664 }
1665 }
1666
1667 /* keep looking in fwmark */
1668 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1669 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1670 if (pos-- == 0) {
1671 iter->table = ip_vs_svc_fwm_table;
1672 iter->bucket = idx;
1673 return svc;
1674 }
1675 }
1676 }
1677
1678 return NULL;
1679}
1680
1681static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1682{
1683
1684 read_lock_bh(&__ip_vs_svc_lock);
1685 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1686}
1687
1688
1689static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1690{
1691 struct list_head *e;
1692 struct ip_vs_iter *iter;
1693 struct ip_vs_service *svc;
1694
1695 ++*pos;
1696 if (v == SEQ_START_TOKEN)
1697 return ip_vs_info_array(seq,0);
1698
1699 svc = v;
1700 iter = seq->private;
1701
1702 if (iter->table == ip_vs_svc_table) {
1703 /* next service in table hashed by protocol */
1704 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1705 return list_entry(e, struct ip_vs_service, s_list);
1706
1707
1708 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1709 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1710 s_list) {
1711 return svc;
1712 }
1713 }
1714
1715 iter->table = ip_vs_svc_fwm_table;
1716 iter->bucket = -1;
1717 goto scan_fwmark;
1718 }
1719
1720 /* next service in hashed by fwmark */
1721 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1722 return list_entry(e, struct ip_vs_service, f_list);
1723
1724 scan_fwmark:
1725 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1726 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1727 f_list)
1728 return svc;
1729 }
1730
1731 return NULL;
1732}
1733
1734static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1735{
1736 read_unlock_bh(&__ip_vs_svc_lock);
1737}
1738
1739
1740static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1741{
1742 if (v == SEQ_START_TOKEN) {
1743 seq_printf(seq,
1744 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1745 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1746 seq_puts(seq,
1747 "Prot LocalAddress:Port Scheduler Flags\n");
1748 seq_puts(seq,
1749 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1750 } else {
1751 const struct ip_vs_service *svc = v;
1752 const struct ip_vs_iter *iter = seq->private;
1753 const struct ip_vs_dest *dest;
1754
1755 if (iter->table == ip_vs_svc_table)
1756 seq_printf(seq, "%s %08X:%04X %s ",
1757 ip_vs_proto_name(svc->protocol),
1758 ntohl(svc->addr),
1759 ntohs(svc->port),
1760 svc->scheduler->name);
1761 else
1762 seq_printf(seq, "FWM %08X %s ",
1763 svc->fwmark, svc->scheduler->name);
1764
1765 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1766 seq_printf(seq, "persistent %d %08X\n",
1767 svc->timeout,
1768 ntohl(svc->netmask));
1769 else
1770 seq_putc(seq, '\n');
1771
1772 list_for_each_entry(dest, &svc->destinations, n_list) {
1773 seq_printf(seq,
1774 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1775 ntohl(dest->addr), ntohs(dest->port),
1776 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1777 atomic_read(&dest->weight),
1778 atomic_read(&dest->activeconns),
1779 atomic_read(&dest->inactconns));
1780 }
1781 }
1782 return 0;
1783}
1784
1785static struct seq_operations ip_vs_info_seq_ops = {
1786 .start = ip_vs_info_seq_start,
1787 .next = ip_vs_info_seq_next,
1788 .stop = ip_vs_info_seq_stop,
1789 .show = ip_vs_info_seq_show,
1790};
1791
1792static int ip_vs_info_open(struct inode *inode, struct file *file)
1793{
1794 struct seq_file *seq;
1795 int rc = -ENOMEM;
1796 struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1797
1798 if (!s)
1799 goto out;
1800
1801 rc = seq_open(file, &ip_vs_info_seq_ops);
1802 if (rc)
1803 goto out_kfree;
1804
1805 seq = file->private_data;
1806 seq->private = s;
1807 memset(s, 0, sizeof(*s));
1808out:
1809 return rc;
1810out_kfree:
1811 kfree(s);
1812 goto out;
1813}
1814
1815static struct file_operations ip_vs_info_fops = {
1816 .owner = THIS_MODULE,
1817 .open = ip_vs_info_open,
1818 .read = seq_read,
1819 .llseek = seq_lseek,
1820 .release = seq_release_private,
1821};
1822
1823#endif
1824
1825struct ip_vs_stats ip_vs_stats;
1826
1827#ifdef CONFIG_PROC_FS
1828static int ip_vs_stats_show(struct seq_file *seq, void *v)
1829{
1830
1831/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1832 seq_puts(seq,
1833 " Total Incoming Outgoing Incoming Outgoing\n");
1834 seq_printf(seq,
1835 " Conns Packets Packets Bytes Bytes\n");
1836
1837 spin_lock_bh(&ip_vs_stats.lock);
1838 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1839 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1840 (unsigned long long) ip_vs_stats.inbytes,
1841 (unsigned long long) ip_vs_stats.outbytes);
1842
1843/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1844 seq_puts(seq,
1845 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1846 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1847 ip_vs_stats.cps,
1848 ip_vs_stats.inpps,
1849 ip_vs_stats.outpps,
1850 ip_vs_stats.inbps,
1851 ip_vs_stats.outbps);
1852 spin_unlock_bh(&ip_vs_stats.lock);
1853
1854 return 0;
1855}
1856
1857static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1858{
1859 return single_open(file, ip_vs_stats_show, NULL);
1860}
1861
1862static struct file_operations ip_vs_stats_fops = {
1863 .owner = THIS_MODULE,
1864 .open = ip_vs_stats_seq_open,
1865 .read = seq_read,
1866 .llseek = seq_lseek,
1867 .release = single_release,
1868};
1869
1870#endif
1871
1872/*
1873 * Set timeout values for tcp tcpfin udp in the timeout_table.
1874 */
1875static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1876{
1877 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1878 u->tcp_timeout,
1879 u->tcp_fin_timeout,
1880 u->udp_timeout);
1881
1882#ifdef CONFIG_IP_VS_PROTO_TCP
1883 if (u->tcp_timeout) {
1884 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1885 = u->tcp_timeout * HZ;
1886 }
1887
1888 if (u->tcp_fin_timeout) {
1889 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1890 = u->tcp_fin_timeout * HZ;
1891 }
1892#endif
1893
1894#ifdef CONFIG_IP_VS_PROTO_UDP
1895 if (u->udp_timeout) {
1896 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1897 = u->udp_timeout * HZ;
1898 }
1899#endif
1900 return 0;
1901}
1902
1903
1904#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
1905#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1906#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1907 sizeof(struct ip_vs_dest_user))
1908#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1909#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
1910#define MAX_ARG_LEN SVCDEST_ARG_LEN
1911
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001912static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001913 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1914 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1915 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1916 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1917 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1918 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1919 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1920 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1921 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1922 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1923 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1924};
1925
1926static int
1927do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1928{
1929 int ret;
1930 unsigned char arg[MAX_ARG_LEN];
1931 struct ip_vs_service_user *usvc;
1932 struct ip_vs_service *svc;
1933 struct ip_vs_dest_user *udest;
1934
1935 if (!capable(CAP_NET_ADMIN))
1936 return -EPERM;
1937
1938 if (len != set_arglen[SET_CMDID(cmd)]) {
1939 IP_VS_ERR("set_ctl: len %u != %u\n",
1940 len, set_arglen[SET_CMDID(cmd)]);
1941 return -EINVAL;
1942 }
1943
1944 if (copy_from_user(arg, user, len) != 0)
1945 return -EFAULT;
1946
1947 /* increase the module use count */
1948 ip_vs_use_count_inc();
1949
1950 if (down_interruptible(&__ip_vs_mutex)) {
1951 ret = -ERESTARTSYS;
1952 goto out_dec;
1953 }
1954
1955 if (cmd == IP_VS_SO_SET_FLUSH) {
1956 /* Flush the virtual service */
1957 ret = ip_vs_flush();
1958 goto out_unlock;
1959 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1960 /* Set timeout values for (tcp tcpfin udp) */
1961 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1962 goto out_unlock;
1963 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1964 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1965 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1966 goto out_unlock;
1967 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1968 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1969 ret = stop_sync_thread(dm->state);
1970 goto out_unlock;
1971 }
1972
1973 usvc = (struct ip_vs_service_user *)arg;
1974 udest = (struct ip_vs_dest_user *)(usvc + 1);
1975
1976 if (cmd == IP_VS_SO_SET_ZERO) {
1977 /* if no service address is set, zero counters in all */
1978 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1979 ret = ip_vs_zero_all();
1980 goto out_unlock;
1981 }
1982 }
1983
1984 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1985 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1986 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1987 usvc->protocol, NIPQUAD(usvc->addr),
1988 ntohs(usvc->port), usvc->sched_name);
1989 ret = -EFAULT;
1990 goto out_unlock;
1991 }
1992
1993 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1994 if (usvc->fwmark == 0)
1995 svc = __ip_vs_service_get(usvc->protocol,
1996 usvc->addr, usvc->port);
1997 else
1998 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1999
2000 if (cmd != IP_VS_SO_SET_ADD
2001 && (svc == NULL || svc->protocol != usvc->protocol)) {
2002 ret = -ESRCH;
2003 goto out_unlock;
2004 }
2005
2006 switch (cmd) {
2007 case IP_VS_SO_SET_ADD:
2008 if (svc != NULL)
2009 ret = -EEXIST;
2010 else
2011 ret = ip_vs_add_service(usvc, &svc);
2012 break;
2013 case IP_VS_SO_SET_EDIT:
2014 ret = ip_vs_edit_service(svc, usvc);
2015 break;
2016 case IP_VS_SO_SET_DEL:
2017 ret = ip_vs_del_service(svc);
2018 if (!ret)
2019 goto out_unlock;
2020 break;
2021 case IP_VS_SO_SET_ZERO:
2022 ret = ip_vs_zero_service(svc);
2023 break;
2024 case IP_VS_SO_SET_ADDDEST:
2025 ret = ip_vs_add_dest(svc, udest);
2026 break;
2027 case IP_VS_SO_SET_EDITDEST:
2028 ret = ip_vs_edit_dest(svc, udest);
2029 break;
2030 case IP_VS_SO_SET_DELDEST:
2031 ret = ip_vs_del_dest(svc, udest);
2032 break;
2033 default:
2034 ret = -EINVAL;
2035 }
2036
2037 if (svc)
2038 ip_vs_service_put(svc);
2039
2040 out_unlock:
2041 up(&__ip_vs_mutex);
2042 out_dec:
2043 /* decrease the module use count */
2044 ip_vs_use_count_dec();
2045
2046 return ret;
2047}
2048
2049
2050static void
2051ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2052{
2053 spin_lock_bh(&src->lock);
2054 memcpy(dst, src, (char*)&src->lock - (char*)src);
2055 spin_unlock_bh(&src->lock);
2056}
2057
2058static void
2059ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2060{
2061 dst->protocol = src->protocol;
2062 dst->addr = src->addr;
2063 dst->port = src->port;
2064 dst->fwmark = src->fwmark;
pageexec4da62fc2005-06-26 16:00:19 -07002065 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002066 dst->flags = src->flags;
2067 dst->timeout = src->timeout / HZ;
2068 dst->netmask = src->netmask;
2069 dst->num_dests = src->num_dests;
2070 ip_vs_copy_stats(&dst->stats, &src->stats);
2071}
2072
2073static inline int
2074__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2075 struct ip_vs_get_services __user *uptr)
2076{
2077 int idx, count=0;
2078 struct ip_vs_service *svc;
2079 struct ip_vs_service_entry entry;
2080 int ret = 0;
2081
2082 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2083 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2084 if (count >= get->num_services)
2085 goto out;
pageexec4da62fc2005-06-26 16:00:19 -07002086 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002087 ip_vs_copy_service(&entry, svc);
2088 if (copy_to_user(&uptr->entrytable[count],
2089 &entry, sizeof(entry))) {
2090 ret = -EFAULT;
2091 goto out;
2092 }
2093 count++;
2094 }
2095 }
2096
2097 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2098 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2099 if (count >= get->num_services)
2100 goto out;
pageexec4da62fc2005-06-26 16:00:19 -07002101 memset(&entry, 0, sizeof(entry));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002102 ip_vs_copy_service(&entry, svc);
2103 if (copy_to_user(&uptr->entrytable[count],
2104 &entry, sizeof(entry))) {
2105 ret = -EFAULT;
2106 goto out;
2107 }
2108 count++;
2109 }
2110 }
2111 out:
2112 return ret;
2113}
2114
2115static inline int
2116__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2117 struct ip_vs_get_dests __user *uptr)
2118{
2119 struct ip_vs_service *svc;
2120 int ret = 0;
2121
2122 if (get->fwmark)
2123 svc = __ip_vs_svc_fwm_get(get->fwmark);
2124 else
2125 svc = __ip_vs_service_get(get->protocol,
2126 get->addr, get->port);
2127 if (svc) {
2128 int count = 0;
2129 struct ip_vs_dest *dest;
2130 struct ip_vs_dest_entry entry;
2131
2132 list_for_each_entry(dest, &svc->destinations, n_list) {
2133 if (count >= get->num_dests)
2134 break;
2135
2136 entry.addr = dest->addr;
2137 entry.port = dest->port;
2138 entry.conn_flags = atomic_read(&dest->conn_flags);
2139 entry.weight = atomic_read(&dest->weight);
2140 entry.u_threshold = dest->u_threshold;
2141 entry.l_threshold = dest->l_threshold;
2142 entry.activeconns = atomic_read(&dest->activeconns);
2143 entry.inactconns = atomic_read(&dest->inactconns);
2144 entry.persistconns = atomic_read(&dest->persistconns);
2145 ip_vs_copy_stats(&entry.stats, &dest->stats);
2146 if (copy_to_user(&uptr->entrytable[count],
2147 &entry, sizeof(entry))) {
2148 ret = -EFAULT;
2149 break;
2150 }
2151 count++;
2152 }
2153 ip_vs_service_put(svc);
2154 } else
2155 ret = -ESRCH;
2156 return ret;
2157}
2158
2159static inline void
2160__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2161{
2162#ifdef CONFIG_IP_VS_PROTO_TCP
2163 u->tcp_timeout =
2164 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2165 u->tcp_fin_timeout =
2166 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2167#endif
2168#ifdef CONFIG_IP_VS_PROTO_UDP
2169 u->udp_timeout =
2170 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2171#endif
2172}
2173
2174
2175#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2176#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2177#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2178#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2179#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2180#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2181#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2182
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08002183static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002184 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2185 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2186 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2187 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2188 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2189 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2190 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2191};
2192
2193static int
2194do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2195{
2196 unsigned char arg[128];
2197 int ret = 0;
2198
2199 if (!capable(CAP_NET_ADMIN))
2200 return -EPERM;
2201
2202 if (*len < get_arglen[GET_CMDID(cmd)]) {
2203 IP_VS_ERR("get_ctl: len %u < %u\n",
2204 *len, get_arglen[GET_CMDID(cmd)]);
2205 return -EINVAL;
2206 }
2207
2208 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2209 return -EFAULT;
2210
2211 if (down_interruptible(&__ip_vs_mutex))
2212 return -ERESTARTSYS;
2213
2214 switch (cmd) {
2215 case IP_VS_SO_GET_VERSION:
2216 {
2217 char buf[64];
2218
2219 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2220 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2221 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2222 ret = -EFAULT;
2223 goto out;
2224 }
2225 *len = strlen(buf)+1;
2226 }
2227 break;
2228
2229 case IP_VS_SO_GET_INFO:
2230 {
2231 struct ip_vs_getinfo info;
2232 info.version = IP_VS_VERSION_CODE;
2233 info.size = IP_VS_CONN_TAB_SIZE;
2234 info.num_services = ip_vs_num_services;
2235 if (copy_to_user(user, &info, sizeof(info)) != 0)
2236 ret = -EFAULT;
2237 }
2238 break;
2239
2240 case IP_VS_SO_GET_SERVICES:
2241 {
2242 struct ip_vs_get_services *get;
2243 int size;
2244
2245 get = (struct ip_vs_get_services *)arg;
2246 size = sizeof(*get) +
2247 sizeof(struct ip_vs_service_entry) * get->num_services;
2248 if (*len != size) {
2249 IP_VS_ERR("length: %u != %u\n", *len, size);
2250 ret = -EINVAL;
2251 goto out;
2252 }
2253 ret = __ip_vs_get_service_entries(get, user);
2254 }
2255 break;
2256
2257 case IP_VS_SO_GET_SERVICE:
2258 {
2259 struct ip_vs_service_entry *entry;
2260 struct ip_vs_service *svc;
2261
2262 entry = (struct ip_vs_service_entry *)arg;
2263 if (entry->fwmark)
2264 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2265 else
2266 svc = __ip_vs_service_get(entry->protocol,
2267 entry->addr, entry->port);
2268 if (svc) {
2269 ip_vs_copy_service(entry, svc);
2270 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2271 ret = -EFAULT;
2272 ip_vs_service_put(svc);
2273 } else
2274 ret = -ESRCH;
2275 }
2276 break;
2277
2278 case IP_VS_SO_GET_DESTS:
2279 {
2280 struct ip_vs_get_dests *get;
2281 int size;
2282
2283 get = (struct ip_vs_get_dests *)arg;
2284 size = sizeof(*get) +
2285 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2286 if (*len != size) {
2287 IP_VS_ERR("length: %u != %u\n", *len, size);
2288 ret = -EINVAL;
2289 goto out;
2290 }
2291 ret = __ip_vs_get_dest_entries(get, user);
2292 }
2293 break;
2294
2295 case IP_VS_SO_GET_TIMEOUT:
2296 {
2297 struct ip_vs_timeout_user t;
2298
2299 __ip_vs_get_timeouts(&t);
2300 if (copy_to_user(user, &t, sizeof(t)) != 0)
2301 ret = -EFAULT;
2302 }
2303 break;
2304
2305 case IP_VS_SO_GET_DAEMON:
2306 {
2307 struct ip_vs_daemon_user d[2];
2308
2309 memset(&d, 0, sizeof(d));
2310 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2311 d[0].state = IP_VS_STATE_MASTER;
pageexec4da62fc2005-06-26 16:00:19 -07002312 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002313 d[0].syncid = ip_vs_master_syncid;
2314 }
2315 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2316 d[1].state = IP_VS_STATE_BACKUP;
pageexec4da62fc2005-06-26 16:00:19 -07002317 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002318 d[1].syncid = ip_vs_backup_syncid;
2319 }
2320 if (copy_to_user(user, &d, sizeof(d)) != 0)
2321 ret = -EFAULT;
2322 }
2323 break;
2324
2325 default:
2326 ret = -EINVAL;
2327 }
2328
2329 out:
2330 up(&__ip_vs_mutex);
2331 return ret;
2332}
2333
2334
2335static struct nf_sockopt_ops ip_vs_sockopts = {
2336 .pf = PF_INET,
2337 .set_optmin = IP_VS_BASE_CTL,
2338 .set_optmax = IP_VS_SO_SET_MAX+1,
2339 .set = do_ip_vs_set_ctl,
2340 .get_optmin = IP_VS_BASE_CTL,
2341 .get_optmax = IP_VS_SO_GET_MAX+1,
2342 .get = do_ip_vs_get_ctl,
2343};
2344
2345
2346int ip_vs_control_init(void)
2347{
2348 int ret;
2349 int idx;
2350
2351 EnterFunction(2);
2352
2353 ret = nf_register_sockopt(&ip_vs_sockopts);
2354 if (ret) {
2355 IP_VS_ERR("cannot register sockopt.\n");
2356 return ret;
2357 }
2358
2359 proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2360 proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2361
2362 sysctl_header = register_sysctl_table(vs_root_table, 0);
2363
2364 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2365 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2366 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2367 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2368 }
2369 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2370 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2371 }
2372
2373 memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2374 spin_lock_init(&ip_vs_stats.lock);
2375 ip_vs_new_estimator(&ip_vs_stats);
2376
2377 /* Hook the defense timer */
2378 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2379
2380 LeaveFunction(2);
2381 return 0;
2382}
2383
2384
2385void ip_vs_control_cleanup(void)
2386{
2387 EnterFunction(2);
2388 ip_vs_trash_cleanup();
2389 cancel_rearming_delayed_work(&defense_work);
2390 ip_vs_kill_estimator(&ip_vs_stats);
2391 unregister_sysctl_table(sysctl_header);
2392 proc_net_remove("ip_vs_stats");
2393 proc_net_remove("ip_vs");
2394 nf_unregister_sockopt(&ip_vs_sockopts);
2395 LeaveFunction(2);
2396}