blob: d7024b2ed769a274ec8320ef9650c777a1d7dcd4 [file] [log] [blame]
/*
 * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *
 *              Network name space (netns) aware.
 *              Global data moved to netns, i.e. struct netns_ipvs.
 *              The tcp_timeouts table has a copy per netns in a hash table
 *              per protocol (ip_vs_proto_data) and is handled by netns.
 */
19
Hannes Eder9aada7a2009-07-30 14:29:44 -070020#define KMSG_COMPONENT "IPVS"
21#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
22
Linus Torvalds1da177e2005-04-16 15:20:36 -070023#include <linux/kernel.h>
24#include <linux/ip.h>
25#include <linux/tcp.h> /* for tcphdr */
26#include <net/ip.h>
27#include <net/tcp.h> /* for csum_tcpudp_magic */
Stephen Rothwell63f2c042008-09-12 23:23:50 -070028#include <net/ip6_checksum.h>
Herbert Xuaf1e1cf2007-10-14 00:39:33 -070029#include <linux/netfilter.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070030#include <linux/netfilter_ipv4.h>
31
32#include <net/ip_vs.h>
33
Linus Torvalds1da177e2005-04-16 15:20:36 -070034static int
Eric W. Biedermand8f44c32015-09-21 13:02:43 -050035tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
36 struct ip_vs_proto_data *pd,
Jesper Dangaard Brouerd4383f02012-09-26 14:07:17 +020037 int *verdict, struct ip_vs_conn **cpp,
38 struct ip_vs_iphdr *iph)
Linus Torvalds1da177e2005-04-16 15:20:36 -070039{
40 struct ip_vs_service *svc;
41 struct tcphdr _tcph, *th;
Alex Gartrell8f88ea62015-08-26 09:40:39 -070042 __be16 _ports[2], *ports = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -070043
Alex Gartrell8f88ea62015-08-26 09:40:39 -070044 /* In the event of icmp, we're only guaranteed to have the first 8
45 * bytes of the transport header, so we only check the rest of the
46 * TCP packet for non-ICMP packets
47 */
48 if (likely(!ip_vs_iph_icmp(iph))) {
49 th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
50 if (th) {
51 if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
52 return 1;
53 ports = &th->source;
54 }
55 } else {
56 ports = skb_header_pointer(
57 skb, iph->len, sizeof(_ports), &_ports);
58 }
59
60 if (!ports) {
61 *verdict = NF_DROP;
62 return 0;
63 }
64
Julian Anastasov190ecd22010-10-17 16:24:37 +030065 /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
Julian Anastasovceec4c32013-03-22 11:46:53 +020066 rcu_read_lock();
Alex Gartrell8f88ea62015-08-26 09:40:39 -070067
68 if (likely(!ip_vs_iph_inverse(iph)))
Eric W. Biederman0a4fd6c2015-09-21 13:01:49 -050069 svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
Alex Gartrell8f88ea62015-08-26 09:40:39 -070070 &iph->daddr, ports[1]);
71 else
Eric W. Biederman0a4fd6c2015-09-21 13:01:49 -050072 svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
Alex Gartrell8f88ea62015-08-26 09:40:39 -070073 &iph->saddr, ports[0]);
74
75 if (svc) {
Julian Anastasov190ecd22010-10-17 16:24:37 +030076 int ignored;
77
Alexander Frolkinc6c96c12013-06-13 08:56:15 +010078 if (ip_vs_todrop(ipvs)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -070079 /*
80 * It seems that we are very loaded.
81 * We have to drop this packet :(
82 */
Julian Anastasovceec4c32013-03-22 11:46:53 +020083 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -070084 *verdict = NF_DROP;
85 return 0;
86 }
87
88 /*
89 * Let the virtual server select a real server for the
90 * incoming connection, and create a connection entry.
91 */
Jesper Dangaard Brouerd4383f02012-09-26 14:07:17 +020092 *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
Hans Schillstroma5959d52010-11-19 14:25:10 +010093 if (!*cpp && ignored <= 0) {
94 if (!ignored)
Jesper Dangaard Brouerd4383f02012-09-26 14:07:17 +020095 *verdict = ip_vs_leave(svc, skb, pd, iph);
Julian Anastasovceec4c32013-03-22 11:46:53 +020096 else
Hans Schillstroma5959d52010-11-19 14:25:10 +010097 *verdict = NF_DROP;
Julian Anastasovceec4c32013-03-22 11:46:53 +020098 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -070099 return 0;
100 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700101 }
Julian Anastasovceec4c32013-03-22 11:46:53 +0200102 rcu_read_unlock();
Hans Schillstroma5959d52010-11-19 14:25:10 +0100103 /* NF_ACCEPT */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700104 return 1;
105}
106
107
108static inline void
Julius Volz0bbdd422008-09-02 15:55:42 +0200109tcp_fast_csum_update(int af, struct tcphdr *tcph,
110 const union nf_inet_addr *oldip,
111 const union nf_inet_addr *newip,
Al Viro014d7302006-09-28 14:29:52 -0700112 __be16 oldport, __be16 newport)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700113{
Julius Volz0bbdd422008-09-02 15:55:42 +0200114#ifdef CONFIG_IP_VS_IPV6
115 if (af == AF_INET6)
116 tcph->check =
117 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
118 ip_vs_check_diff2(oldport, newport,
119 ~csum_unfold(tcph->check))));
120 else
121#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700122 tcph->check =
Julius Volz0bbdd422008-09-02 15:55:42 +0200123 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
Al Virof9214b22006-11-16 02:41:18 -0800124 ip_vs_check_diff2(oldport, newport,
125 ~csum_unfold(tcph->check))));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700126}
127
128
Simon Horman503e81f2008-09-08 12:04:21 +1000129static inline void
130tcp_partial_csum_update(int af, struct tcphdr *tcph,
131 const union nf_inet_addr *oldip,
132 const union nf_inet_addr *newip,
133 __be16 oldlen, __be16 newlen)
134{
135#ifdef CONFIG_IP_VS_IPV6
136 if (af == AF_INET6)
137 tcph->check =
Julian Anastasov5bc90682010-10-17 16:14:31 +0300138 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
Simon Horman503e81f2008-09-08 12:04:21 +1000139 ip_vs_check_diff2(oldlen, newlen,
Julian Anastasov5bc90682010-10-17 16:14:31 +0300140 csum_unfold(tcph->check))));
Simon Horman503e81f2008-09-08 12:04:21 +1000141 else
142#endif
143 tcph->check =
Julian Anastasov5bc90682010-10-17 16:14:31 +0300144 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
Simon Horman503e81f2008-09-08 12:04:21 +1000145 ip_vs_check_diff2(oldlen, newlen,
Julian Anastasov5bc90682010-10-17 16:14:31 +0300146 csum_unfold(tcph->check))));
Simon Horman503e81f2008-09-08 12:04:21 +1000147}
148
149
Linus Torvalds1da177e2005-04-16 15:20:36 -0700150static int
Jesper Dangaard Brouerd4383f02012-09-26 14:07:17 +0200151tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
152 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153{
154 struct tcphdr *tcph;
Jesper Dangaard Brouerd4383f02012-09-26 14:07:17 +0200155 unsigned int tcphoff = iph->len;
Simon Horman503e81f2008-09-08 12:04:21 +1000156 int oldlen;
Julian Anastasov8b27b102010-10-17 16:17:20 +0300157 int payload_csum = 0;
Julius Volz0bbdd422008-09-02 15:55:42 +0200158
159#ifdef CONFIG_IP_VS_IPV6
Jesper Dangaard Brouerd4383f02012-09-26 14:07:17 +0200160 if (cp->af == AF_INET6 && iph->fragoffs)
Jesper Dangaard Brouer63dca2c2012-09-26 14:06:41 +0200161 return 1;
Julius Volz0bbdd422008-09-02 15:55:42 +0200162#endif
Simon Horman503e81f2008-09-08 12:04:21 +1000163 oldlen = skb->len - tcphoff;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700164
165 /* csum_check requires unshared skb */
Herbert Xu3db05fe2007-10-15 00:53:15 -0700166 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700167 return 0;
168
169 if (unlikely(cp->app != NULL)) {
Julian Anastasov8b27b102010-10-17 16:17:20 +0300170 int ret;
171
Linus Torvalds1da177e2005-04-16 15:20:36 -0700172 /* Some checks before mangling */
Julius Volz0bbdd422008-09-02 15:55:42 +0200173 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700174 return 0;
175
176 /* Call application helper if needed */
Julian Anastasov8b27b102010-10-17 16:17:20 +0300177 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700178 return 0;
Julian Anastasov8b27b102010-10-17 16:17:20 +0300179 /* ret=2: csum update is needed after payload mangling */
180 if (ret == 1)
181 oldlen = skb->len - tcphoff;
182 else
183 payload_csum = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700184 }
185
Julius Volz0bbdd422008-09-02 15:55:42 +0200186 tcph = (void *)skb_network_header(skb) + tcphoff;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700187 tcph->source = cp->vport;
188
189 /* Adjust TCP checksums */
Simon Horman503e81f2008-09-08 12:04:21 +1000190 if (skb->ip_summed == CHECKSUM_PARTIAL) {
191 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
Harvey Harrisonca620592008-11-06 23:09:56 -0800192 htons(oldlen),
193 htons(skb->len - tcphoff));
Julian Anastasov8b27b102010-10-17 16:17:20 +0300194 } else if (!payload_csum) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700195 /* Only port and addr are changed, do fast csum update */
Julius Volz0bbdd422008-09-02 15:55:42 +0200196 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700197 cp->dport, cp->vport);
Herbert Xu3db05fe2007-10-15 00:53:15 -0700198 if (skb->ip_summed == CHECKSUM_COMPLETE)
Julian Anastasov8b27b102010-10-17 16:17:20 +0300199 skb->ip_summed = (cp->app && pp->csum_check) ?
200 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700201 } else {
202 /* full checksum calculation */
203 tcph->check = 0;
Herbert Xu3db05fe2007-10-15 00:53:15 -0700204 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
Julius Volz0bbdd422008-09-02 15:55:42 +0200205#ifdef CONFIG_IP_VS_IPV6
206 if (cp->af == AF_INET6)
207 tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
208 &cp->caddr.in6,
209 skb->len - tcphoff,
210 cp->protocol, skb->csum);
211 else
212#endif
213 tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
214 cp->caddr.ip,
215 skb->len - tcphoff,
216 cp->protocol,
217 skb->csum);
Julian Anastasov8b27b102010-10-17 16:17:20 +0300218 skb->ip_summed = CHECKSUM_UNNECESSARY;
Julius Volz0bbdd422008-09-02 15:55:42 +0200219
Linus Torvalds1da177e2005-04-16 15:20:36 -0700220 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
221 pp->name, tcph->check,
222 (char*)&(tcph->check) - (char*)tcph);
223 }
224 return 1;
225}
226
227
228static int
Jesper Dangaard Brouerd4383f02012-09-26 14:07:17 +0200229tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
230 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700231{
232 struct tcphdr *tcph;
Jesper Dangaard Brouerd4383f02012-09-26 14:07:17 +0200233 unsigned int tcphoff = iph->len;
Simon Horman503e81f2008-09-08 12:04:21 +1000234 int oldlen;
Julian Anastasov8b27b102010-10-17 16:17:20 +0300235 int payload_csum = 0;
Julius Volz0bbdd422008-09-02 15:55:42 +0200236
237#ifdef CONFIG_IP_VS_IPV6
Jesper Dangaard Brouerd4383f02012-09-26 14:07:17 +0200238 if (cp->af == AF_INET6 && iph->fragoffs)
Jesper Dangaard Brouer63dca2c2012-09-26 14:06:41 +0200239 return 1;
Julius Volz0bbdd422008-09-02 15:55:42 +0200240#endif
Simon Horman503e81f2008-09-08 12:04:21 +1000241 oldlen = skb->len - tcphoff;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700242
243 /* csum_check requires unshared skb */
Herbert Xu3db05fe2007-10-15 00:53:15 -0700244 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700245 return 0;
246
247 if (unlikely(cp->app != NULL)) {
Julian Anastasov8b27b102010-10-17 16:17:20 +0300248 int ret;
249
Linus Torvalds1da177e2005-04-16 15:20:36 -0700250 /* Some checks before mangling */
Julius Volz0bbdd422008-09-02 15:55:42 +0200251 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700252 return 0;
253
254 /*
255 * Attempt ip_vs_app call.
256 * It will fix ip_vs_conn and iph ack_seq stuff
257 */
Julian Anastasov8b27b102010-10-17 16:17:20 +0300258 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700259 return 0;
Julian Anastasov8b27b102010-10-17 16:17:20 +0300260 /* ret=2: csum update is needed after payload mangling */
261 if (ret == 1)
262 oldlen = skb->len - tcphoff;
263 else
264 payload_csum = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700265 }
266
Julius Volz0bbdd422008-09-02 15:55:42 +0200267 tcph = (void *)skb_network_header(skb) + tcphoff;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268 tcph->dest = cp->dport;
269
270 /*
271 * Adjust TCP checksums
272 */
Simon Horman503e81f2008-09-08 12:04:21 +1000273 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Julian Anastasov5bc90682010-10-17 16:14:31 +0300274 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
Harvey Harrisonca620592008-11-06 23:09:56 -0800275 htons(oldlen),
276 htons(skb->len - tcphoff));
Julian Anastasov8b27b102010-10-17 16:17:20 +0300277 } else if (!payload_csum) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278 /* Only port and addr are changed, do fast csum update */
Julius Volz0bbdd422008-09-02 15:55:42 +0200279 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700280 cp->vport, cp->dport);
Herbert Xu3db05fe2007-10-15 00:53:15 -0700281 if (skb->ip_summed == CHECKSUM_COMPLETE)
Julian Anastasov8b27b102010-10-17 16:17:20 +0300282 skb->ip_summed = (cp->app && pp->csum_check) ?
283 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700284 } else {
285 /* full checksum calculation */
286 tcph->check = 0;
Herbert Xu3db05fe2007-10-15 00:53:15 -0700287 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
Julius Volz0bbdd422008-09-02 15:55:42 +0200288#ifdef CONFIG_IP_VS_IPV6
289 if (cp->af == AF_INET6)
290 tcph->check = csum_ipv6_magic(&cp->caddr.in6,
291 &cp->daddr.in6,
292 skb->len - tcphoff,
293 cp->protocol, skb->csum);
294 else
295#endif
296 tcph->check = csum_tcpudp_magic(cp->caddr.ip,
297 cp->daddr.ip,
298 skb->len - tcphoff,
299 cp->protocol,
300 skb->csum);
Herbert Xu3db05fe2007-10-15 00:53:15 -0700301 skb->ip_summed = CHECKSUM_UNNECESSARY;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700302 }
303 return 1;
304}
305
306
307static int
Julius Volz51ef3482008-09-02 15:55:40 +0200308tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700309{
Julius Volz51ef3482008-09-02 15:55:40 +0200310 unsigned int tcphoff;
311
312#ifdef CONFIG_IP_VS_IPV6
313 if (af == AF_INET6)
314 tcphoff = sizeof(struct ipv6hdr);
315 else
316#endif
317 tcphoff = ip_hdrlen(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700318
319 switch (skb->ip_summed) {
320 case CHECKSUM_NONE:
321 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
Patrick McHardy84fa7932006-08-29 16:44:56 -0700322 case CHECKSUM_COMPLETE:
Julius Volz51ef3482008-09-02 15:55:40 +0200323#ifdef CONFIG_IP_VS_IPV6
324 if (af == AF_INET6) {
325 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
326 &ipv6_hdr(skb)->daddr,
327 skb->len - tcphoff,
328 ipv6_hdr(skb)->nexthdr,
329 skb->csum)) {
Julian Anastasov0d796412010-10-17 16:46:17 +0300330 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
Julius Volz51ef3482008-09-02 15:55:40 +0200331 "Failed checksum for");
332 return 0;
333 }
334 } else
335#endif
336 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
337 ip_hdr(skb)->daddr,
338 skb->len - tcphoff,
339 ip_hdr(skb)->protocol,
340 skb->csum)) {
Julian Anastasov0d796412010-10-17 16:46:17 +0300341 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
Julius Volz51ef3482008-09-02 15:55:40 +0200342 "Failed checksum for");
343 return 0;
344 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700345 break;
346 default:
Patrick McHardy84fa7932006-08-29 16:44:56 -0700347 /* No need to checksum. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348 break;
349 }
350
351 return 1;
352}
353
354
355#define TCP_DIR_INPUT 0
356#define TCP_DIR_OUTPUT 4
357#define TCP_DIR_INPUT_ONLY 8
358
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -0800359static const int tcp_state_off[IP_VS_DIR_LAST] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700360 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
361 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
362 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
363};
364
365/*
366 * Timeout table[state]
367 */
Hans Schillstrom4a85b962011-01-03 14:44:47 +0100368static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700369 [IP_VS_TCP_S_NONE] = 2*HZ,
370 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
371 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
372 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
373 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
374 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
375 [IP_VS_TCP_S_CLOSE] = 10*HZ,
376 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
377 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
378 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
379 [IP_VS_TCP_S_SYNACK] = 120*HZ,
380 [IP_VS_TCP_S_LAST] = 2*HZ,
381};
382
Jan Engelhardt36cbd3d2009-08-05 10:42:58 -0700383static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700384 [IP_VS_TCP_S_NONE] = "NONE",
385 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
386 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
387 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
388 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
389 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
390 [IP_VS_TCP_S_CLOSE] = "CLOSE",
391 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
392 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
393 [IP_VS_TCP_S_LISTEN] = "LISTEN",
394 [IP_VS_TCP_S_SYNACK] = "SYNACK",
395 [IP_VS_TCP_S_LAST] = "BUG!",
396};
397
/* Two-letter state abbreviations that keep the state tables readable */
#define sNO	IP_VS_TCP_S_NONE
#define sES	IP_VS_TCP_S_ESTABLISHED
#define sSS	IP_VS_TCP_S_SYN_SENT
#define sSR	IP_VS_TCP_S_SYN_RECV
#define sFW	IP_VS_TCP_S_FIN_WAIT
#define sTW	IP_VS_TCP_S_TIME_WAIT
#define sCL	IP_VS_TCP_S_CLOSE
#define sCW	IP_VS_TCP_S_CLOSE_WAIT
#define sLA	IP_VS_TCP_S_LAST_ACK
#define sLI	IP_VS_TCP_S_LISTEN
#define sSA	IP_VS_TCP_S_SYNACK
409
410struct tcp_states_t {
411 int next_state[IP_VS_TCP_S_LAST];
412};
413
414static const char * tcp_state_name(int state)
415{
416 if (state >= IP_VS_TCP_S_LAST)
417 return "ERR!";
418 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
419}
420
421static struct tcp_states_t tcp_states [] = {
422/* INPUT */
423/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
424/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
425/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
Alexander Frolkinc6c96c12013-06-13 08:56:15 +0100426/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
Linus Torvalds1da177e2005-04-16 15:20:36 -0700427/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
428
429/* OUTPUT */
430/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
431/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
432/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
433/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
434/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
435
436/* INPUT-ONLY */
437/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
438/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
439/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
Alexander Frolkinc6c96c12013-06-13 08:56:15 +0100440/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
Linus Torvalds1da177e2005-04-16 15:20:36 -0700441/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
442};
443
444static struct tcp_states_t tcp_states_dos [] = {
445/* INPUT */
446/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
447/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
448/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
Alexander Frolkinc6c96c12013-06-13 08:56:15 +0100449/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
451
452/* OUTPUT */
453/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
454/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
455/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
456/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
457/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
458
459/* INPUT-ONLY */
460/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
461/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
462/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
Alexander Frolkinc6c96c12013-06-13 08:56:15 +0100463/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
Linus Torvalds1da177e2005-04-16 15:20:36 -0700464/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
465};
466
Hans Schillstrom93304192011-01-03 14:44:51 +0100467static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700468{
469 int on = (flags & 1); /* secure_tcp */
470
471 /*
472 ** FIXME: change secure_tcp to independent sysctl var
473 ** or make it per-service or per-app because it is valid
474 ** for most if not for all of the applications. Something
475 ** like "capabilities" (flags) for each object.
476 */
Hans Schillstrom93304192011-01-03 14:44:51 +0100477 pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700478}
479
Linus Torvalds1da177e2005-04-16 15:20:36 -0700480static inline int tcp_state_idx(struct tcphdr *th)
481{
482 if (th->rst)
483 return 3;
484 if (th->syn)
485 return 0;
486 if (th->fin)
487 return 1;
488 if (th->ack)
489 return 2;
490 return -1;
491}
492
493static inline void
Hans Schillstrom93304192011-01-03 14:44:51 +0100494set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700495 int direction, struct tcphdr *th)
496{
497 int state_idx;
498 int new_state = IP_VS_TCP_S_CLOSE;
499 int state_off = tcp_state_off[direction];
500
501 /*
502 * Update state offset to INPUT_ONLY if necessary
503 * or delete NO_OUTPUT flag if output packet detected
504 */
505 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
506 if (state_off == TCP_DIR_OUTPUT)
507 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
508 else
509 state_off = TCP_DIR_INPUT_ONLY;
510 }
511
512 if ((state_idx = tcp_state_idx(th)) < 0) {
513 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
514 goto tcp_state_out;
515 }
516
Hans Schillstrom93304192011-01-03 14:44:51 +0100517 new_state =
518 pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700519
520 tcp_state_out:
521 if (new_state != cp->state) {
522 struct ip_vs_dest *dest = cp->dest;
523
Julius Volzcfc78c52008-09-02 15:55:53 +0200524 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
525 "%s:%d state: %s->%s conn->refcnt:%d\n",
Hans Schillstrom93304192011-01-03 14:44:51 +0100526 pd->pp->name,
Julius Volzcfc78c52008-09-02 15:55:53 +0200527 ((state_off == TCP_DIR_OUTPUT) ?
528 "output " : "input "),
529 th->syn ? 'S' : '.',
530 th->fin ? 'F' : '.',
531 th->ack ? 'A' : '.',
532 th->rst ? 'R' : '.',
Julian Anastasovf18ae722014-09-09 16:40:38 -0700533 IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
Julius Volzcfc78c52008-09-02 15:55:53 +0200534 ntohs(cp->dport),
535 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
536 ntohs(cp->cport),
537 tcp_state_name(cp->state),
538 tcp_state_name(new_state),
539 atomic_read(&cp->refcnt));
540
Linus Torvalds1da177e2005-04-16 15:20:36 -0700541 if (dest) {
542 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
543 (new_state != IP_VS_TCP_S_ESTABLISHED)) {
544 atomic_dec(&dest->activeconns);
545 atomic_inc(&dest->inactconns);
546 cp->flags |= IP_VS_CONN_F_INACTIVE;
547 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
548 (new_state == IP_VS_TCP_S_ESTABLISHED)) {
549 atomic_inc(&dest->activeconns);
550 atomic_dec(&dest->inactconns);
551 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
552 }
553 }
554 }
555
Hans Schillstrom4a85b962011-01-03 14:44:47 +0100556 if (likely(pd))
557 cp->timeout = pd->timeout_table[cp->state = new_state];
558 else /* What to do ? */
559 cp->timeout = tcp_timeouts[cp->state = new_state];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700560}
561
Linus Torvalds1da177e2005-04-16 15:20:36 -0700562/*
563 * Handle state transitions
564 */
Simon Horman4a516f12011-09-16 14:11:49 +0900565static void
Linus Torvalds1da177e2005-04-16 15:20:36 -0700566tcp_state_transition(struct ip_vs_conn *cp, int direction,
567 const struct sk_buff *skb,
Hans Schillstrom93304192011-01-03 14:44:51 +0100568 struct ip_vs_proto_data *pd)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700569{
570 struct tcphdr _tcph, *th;
571
Julius Volz0bbdd422008-09-02 15:55:42 +0200572#ifdef CONFIG_IP_VS_IPV6
573 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
574#else
575 int ihl = ip_hdrlen(skb);
576#endif
577
578 th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700579 if (th == NULL)
Simon Horman4a516f12011-09-16 14:11:49 +0900580 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700581
Julian Anastasovac692692013-03-22 11:46:54 +0200582 spin_lock_bh(&cp->lock);
Hans Schillstrom93304192011-01-03 14:44:51 +0100583 set_tcp_state(pd, cp, direction, th);
Julian Anastasovac692692013-03-22 11:46:54 +0200584 spin_unlock_bh(&cp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700585}
586
Al Viro75e7ce62006-11-14 21:13:28 -0800587static inline __u16 tcp_app_hashkey(__be16 port)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700588{
Al Viro75e7ce62006-11-14 21:13:28 -0800589 return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
590 & TCP_APP_TAB_MASK;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700591}
592
593
Eric W. Biederman19648912015-09-21 13:02:29 -0500594static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700595{
596 struct ip_vs_app *i;
Al Viro75e7ce62006-11-14 21:13:28 -0800597 __u16 hash;
598 __be16 port = inc->port;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700599 int ret = 0;
Eric W. Biederman18d6ade2015-09-21 13:02:01 -0500600 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700601
602 hash = tcp_app_hashkey(port);
603
Hans Schillstrom4a85b962011-01-03 14:44:47 +0100604 list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700605 if (i->port == port) {
606 ret = -EEXIST;
607 goto out;
608 }
609 }
Julian Anastasov363c97d2013-03-21 11:58:07 +0200610 list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
Hans Schillstrom9bbac6a2011-01-03 14:44:52 +0100611 atomic_inc(&pd->appcnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700612
613 out:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700614 return ret;
615}
616
617
618static void
Eric W. Biederman19648912015-09-21 13:02:29 -0500619tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700620{
Eric W. Biederman19648912015-09-21 13:02:29 -0500621 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
Hans Schillstrom4a85b962011-01-03 14:44:47 +0100622
Hans Schillstrom9bbac6a2011-01-03 14:44:52 +0100623 atomic_dec(&pd->appcnt);
Julian Anastasov363c97d2013-03-21 11:58:07 +0200624 list_del_rcu(&inc->p_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700625}
626
627
628static int
629tcp_app_conn_bind(struct ip_vs_conn *cp)
630{
Eric W. Biederman58dbc6f2015-09-21 13:01:41 -0500631 struct netns_ipvs *ipvs = cp->ipvs;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700632 int hash;
633 struct ip_vs_app *inc;
634 int result = 0;
635
636 /* Default binding: bind app only for NAT */
637 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
638 return 0;
639
640 /* Lookup application incarnations and bind the right one */
641 hash = tcp_app_hashkey(cp->vport);
642
Julian Anastasov363c97d2013-03-21 11:58:07 +0200643 rcu_read_lock();
644 list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700645 if (inc->port == cp->vport) {
646 if (unlikely(!ip_vs_app_inc_get(inc)))
647 break;
Julian Anastasov363c97d2013-03-21 11:58:07 +0200648 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700649
Hannes Eder1e3e2382009-08-02 11:05:41 +0000650 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
Julius Volzcfc78c52008-09-02 15:55:53 +0200651 "%s:%u to app %s on port %u\n",
652 __func__,
653 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
654 ntohs(cp->cport),
655 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
656 ntohs(cp->vport),
657 inc->name, ntohs(inc->port));
658
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659 cp->app = inc;
660 if (inc->init_conn)
661 result = inc->init_conn(inc, cp);
662 goto out;
663 }
664 }
Julian Anastasov363c97d2013-03-21 11:58:07 +0200665 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700666
667 out:
668 return result;
669}
670
671
672/*
673 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
674 */
Eric W. Biederman69f39092015-09-21 13:02:57 -0500675void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700676{
Eric W. Biederman69f39092015-09-21 13:02:57 -0500677 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP);
Hans Schillstrom4a85b962011-01-03 14:44:47 +0100678
Julian Anastasovac692692013-03-22 11:46:54 +0200679 spin_lock_bh(&cp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700680 cp->state = IP_VS_TCP_S_LISTEN;
Hans Schillstrom4a85b962011-01-03 14:44:47 +0100681 cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
682 : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
Julian Anastasovac692692013-03-22 11:46:54 +0200683 spin_unlock_bh(&cp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700684}
685
Hans Schillstrom4a85b962011-01-03 14:44:47 +0100686/* ---------------------------------------------
687 * timeouts is netns related now.
688 * ---------------------------------------------
689 */
Eric W. Biederman1281a9c22015-09-21 13:02:36 -0500690static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700691{
Hans Schillstrom4a85b962011-01-03 14:44:47 +0100692 ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
Hans Schillstrom4a85b962011-01-03 14:44:47 +0100693 pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
694 sizeof(tcp_timeouts));
Hans Schillstrom582b8e32012-04-26 09:45:35 +0200695 if (!pd->timeout_table)
696 return -ENOMEM;
Hans Schillstrom93304192011-01-03 14:44:51 +0100697 pd->tcp_state_table = tcp_states;
Hans Schillstrom582b8e32012-04-26 09:45:35 +0200698 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700699}
700
Eric W. Biederman1281a9c22015-09-21 13:02:36 -0500701static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700702{
Hans Schillstrom4a85b962011-01-03 14:44:47 +0100703 kfree(pd->timeout_table);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700704}
705
706
707struct ip_vs_protocol ip_vs_protocol_tcp = {
708 .name = "TCP",
709 .protocol = IPPROTO_TCP,
Julian Anastasov2ad17de2008-04-29 03:21:23 -0700710 .num_states = IP_VS_TCP_S_LAST,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700711 .dont_defrag = 0,
Hans Schillstrom4a85b962011-01-03 14:44:47 +0100712 .init = NULL,
713 .exit = NULL,
714 .init_netns = __ip_vs_tcp_init,
715 .exit_netns = __ip_vs_tcp_exit,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700716 .register_app = tcp_register_app,
717 .unregister_app = tcp_unregister_app,
718 .conn_schedule = tcp_conn_schedule,
Simon Horman5c0d2372010-08-02 17:12:44 +0200719 .conn_in_get = ip_vs_conn_in_get_proto,
720 .conn_out_get = ip_vs_conn_out_get_proto,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700721 .snat_handler = tcp_snat_handler,
722 .dnat_handler = tcp_dnat_handler,
723 .csum_check = tcp_csum_check,
724 .state_name = tcp_state_name,
725 .state_transition = tcp_state_transition,
726 .app_conn_bind = tcp_app_conn_bind,
727 .debug_packet = ip_vs_tcpudp_debug_packet,
728 .timeout_change = tcp_timeout_change,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700729};