/*
 * ip_vs_xmit.c: various packet transmitters for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Changes:
 *
 */
15
16#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070017#include <linux/tcp.h> /* for tcphdr */
Herbert Xuc439cb22008-01-11 19:14:00 -080018#include <net/ip.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070019#include <net/tcp.h> /* for csum_tcpudp_magic */
20#include <net/udp.h>
21#include <net/icmp.h> /* for icmp_send */
22#include <net/route.h> /* for ip_route_output */
23#include <linux/netfilter.h>
24#include <linux/netfilter_ipv4.h>
25
26#include <net/ip_vs.h>
27
28
29/*
30 * Destination cache to speed up outgoing route lookup
31 */
32static inline void
33__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
34{
35 struct dst_entry *old_dst;
36
37 old_dst = dest->dst_cache;
38 dest->dst_cache = dst;
39 dest->dst_rtos = rtos;
40 dst_release(old_dst);
41}
42
43static inline struct dst_entry *
44__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
45{
46 struct dst_entry *dst = dest->dst_cache;
47
48 if (!dst)
49 return NULL;
50 if ((dst->obsolete || rtos != dest->dst_rtos) &&
51 dst->ops->check(dst, cookie) == NULL) {
52 dest->dst_cache = NULL;
53 dst_release(dst);
54 return NULL;
55 }
56 dst_hold(dst);
57 return dst;
58}
59
Ilpo Järvinenad1b30b2008-01-05 23:12:40 -080060static struct rtable *
Linus Torvalds1da177e2005-04-16 15:20:36 -070061__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
62{
63 struct rtable *rt; /* Route to the other host */
64 struct ip_vs_dest *dest = cp->dest;
65
66 if (dest) {
67 spin_lock(&dest->dst_lock);
68 if (!(rt = (struct rtable *)
69 __ip_vs_dst_check(dest, rtos, 0))) {
70 struct flowi fl = {
71 .oif = 0,
72 .nl_u = {
73 .ip4_u = {
74 .daddr = dest->addr,
75 .saddr = 0,
76 .tos = rtos, } },
77 };
78
Denis V. Lunevf2063512008-01-22 22:07:34 -080079 if (ip_route_output_key(&init_net, &rt, &fl)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -070080 spin_unlock(&dest->dst_lock);
81 IP_VS_DBG_RL("ip_route_output error, "
82 "dest: %u.%u.%u.%u\n",
83 NIPQUAD(dest->addr));
84 return NULL;
85 }
86 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
87 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
88 NIPQUAD(dest->addr),
89 atomic_read(&rt->u.dst.__refcnt), rtos);
90 }
91 spin_unlock(&dest->dst_lock);
92 } else {
93 struct flowi fl = {
94 .oif = 0,
95 .nl_u = {
96 .ip4_u = {
97 .daddr = cp->daddr,
98 .saddr = 0,
99 .tos = rtos, } },
100 };
101
Denis V. Lunevf2063512008-01-22 22:07:34 -0800102 if (ip_route_output_key(&init_net, &rt, &fl)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700103 IP_VS_DBG_RL("ip_route_output error, dest: "
104 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
105 return NULL;
106 }
107 }
108
109 return rt;
110}
111
112
113/*
114 * Release dest->dst_cache before a dest is removed
115 */
116void
117ip_vs_dst_reset(struct ip_vs_dest *dest)
118{
119 struct dst_entry *old_dst;
120
121 old_dst = dest->dst_cache;
122 dest->dst_cache = NULL;
123 dst_release(old_dst);
124}
125
/*
 * Hand @skb to the NF_INET_LOCAL_OUT hook with dst_output() as the
 * continuation, using the device of route @rt as output device.
 * The skb is first flagged as an IPVS packet (ipvs_property) and passed
 * through skb_forward_csum().
 */
#define IP_VS_XMIT(skb, rt)				\
do {							\
	(skb)->ipvs_property = 1;			\
	skb_forward_csum((skb));			\
	NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL,	\
		(rt)->u.dst.dev, dst_output);		\
} while (0)
133
134
135/*
136 * NULL transmitter (do nothing except return NF_ACCEPT)
137 */
138int
139ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
140 struct ip_vs_protocol *pp)
141{
142 /* we do not touch skb and do not need pskb ptr */
143 return NF_ACCEPT;
144}
145
146
147/*
148 * Bypass transmitter
149 * Let packets bypass the destination when the destination is not
150 * available, it may be only used in transparent cache cluster.
151 */
152int
153ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
154 struct ip_vs_protocol *pp)
155{
156 struct rtable *rt; /* Route to the other host */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700157 struct iphdr *iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700158 u8 tos = iph->tos;
159 int mtu;
160 struct flowi fl = {
161 .oif = 0,
162 .nl_u = {
163 .ip4_u = {
164 .daddr = iph->daddr,
165 .saddr = 0,
166 .tos = RT_TOS(tos), } },
167 };
168
169 EnterFunction(10);
170
Denis V. Lunevf2063512008-01-22 22:07:34 -0800171 if (ip_route_output_key(&init_net, &rt, &fl)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700172 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
173 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
174 goto tx_error_icmp;
175 }
176
177 /* MTU checking */
178 mtu = dst_mtu(&rt->u.dst);
YOSHIFUJI Hideaki4412ec42007-03-07 14:19:10 +0900179 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700180 ip_rt_put(rt);
181 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
182 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
183 goto tx_error;
184 }
185
186 /*
187 * Call ip_send_check because we are not sure it is called
188 * after ip_defrag. Is copy-on-write needed?
189 */
190 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
191 ip_rt_put(rt);
192 return NF_STOLEN;
193 }
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700194 ip_send_check(ip_hdr(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700195
196 /* drop old route */
197 dst_release(skb->dst);
198 skb->dst = &rt->u.dst;
199
200 /* Another hack: avoid icmp_send in ip_fragment */
201 skb->local_df = 1;
202
203 IP_VS_XMIT(skb, rt);
204
205 LeaveFunction(10);
206 return NF_STOLEN;
207
208 tx_error_icmp:
209 dst_link_failure(skb);
210 tx_error:
211 kfree_skb(skb);
212 LeaveFunction(10);
213 return NF_STOLEN;
214}
215
216
217/*
218 * NAT transmitter (only for outside-to-inside nat forwarding)
219 * Not used for related ICMP
220 */
221int
222ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
223 struct ip_vs_protocol *pp)
224{
225 struct rtable *rt; /* Route to the other host */
226 int mtu;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700227 struct iphdr *iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700228
229 EnterFunction(10);
230
231 /* check if it is a connection of no-client-port */
232 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
Al Viro014d7302006-09-28 14:29:52 -0700233 __be16 _pt, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700234 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
235 if (p == NULL)
236 goto tx_error;
237 ip_vs_conn_fill_cport(cp, *p);
238 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
239 }
240
241 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
242 goto tx_error_icmp;
243
244 /* MTU checking */
245 mtu = dst_mtu(&rt->u.dst);
YOSHIFUJI Hideaki4412ec42007-03-07 14:19:10 +0900246 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700247 ip_rt_put(rt);
248 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
249 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
250 goto tx_error;
251 }
252
253 /* copy-on-write the packet before mangling it */
Herbert Xuaf1e1cf2007-10-14 00:39:33 -0700254 if (!skb_make_writable(skb, sizeof(struct iphdr)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700255 goto tx_error_put;
256
257 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
258 goto tx_error_put;
259
260 /* drop old route */
261 dst_release(skb->dst);
262 skb->dst = &rt->u.dst;
263
264 /* mangle the packet */
Herbert Xu3db05fe2007-10-15 00:53:15 -0700265 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700266 goto tx_error;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700267 ip_hdr(skb)->daddr = cp->daddr;
268 ip_send_check(ip_hdr(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700269
270 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
271
272 /* FIXME: when application helper enlarges the packet and the length
273 is larger than the MTU of outgoing device, there will be still
274 MTU problem. */
275
276 /* Another hack: avoid icmp_send in ip_fragment */
277 skb->local_df = 1;
278
279 IP_VS_XMIT(skb, rt);
280
281 LeaveFunction(10);
282 return NF_STOLEN;
283
284 tx_error_icmp:
285 dst_link_failure(skb);
286 tx_error:
287 LeaveFunction(10);
288 kfree_skb(skb);
289 return NF_STOLEN;
290 tx_error_put:
291 ip_rt_put(rt);
292 goto tx_error;
293}
294
295
296/*
297 * IP Tunneling transmitter
298 *
299 * This function encapsulates the packet in a new IP packet, its
300 * destination will be set to cp->daddr. Most code of this function
301 * is taken from ipip.c.
302 *
303 * It is used in VS/TUN cluster. The load balancer selects a real
304 * server from a cluster based on a scheduling algorithm,
305 * encapsulates the request packet and forwards it to the selected
306 * server. For example, all real servers are configured with
307 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
308 * the encapsulated packet, it will decapsulate the packet, processe
309 * the request and return the response packets directly to the client
310 * without passing the load balancer. This can greatly increase the
311 * scalability of virtual server.
312 *
313 * Used for ANY protocol
314 */
315int
316ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
317 struct ip_vs_protocol *pp)
318{
319 struct rtable *rt; /* Route to the other host */
320 struct net_device *tdev; /* Device to other host */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700321 struct iphdr *old_iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700322 u8 tos = old_iph->tos;
Alexey Dobriyan76ab6082006-01-06 13:24:29 -0800323 __be16 df = old_iph->frag_off;
Arnaldo Carvalho de Melo2e07fa92007-04-10 21:22:35 -0700324 sk_buff_data_t old_transport_header = skb->transport_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700325 struct iphdr *iph; /* Our new IP header */
Chuck Leverc2636b42007-10-23 21:07:32 -0700326 unsigned int max_headroom; /* The extra header space needed */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700327 int mtu;
328
329 EnterFunction(10);
330
YOSHIFUJI Hideaki4412ec42007-03-07 14:19:10 +0900331 if (skb->protocol != htons(ETH_P_IP)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700332 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
333 "ETH_P_IP: %d, skb protocol: %d\n",
YOSHIFUJI Hideaki4412ec42007-03-07 14:19:10 +0900334 htons(ETH_P_IP), skb->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335 goto tx_error;
336 }
337
338 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
339 goto tx_error_icmp;
340
341 tdev = rt->u.dst.dev;
342
343 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
344 if (mtu < 68) {
345 ip_rt_put(rt);
346 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
347 goto tx_error;
348 }
349 if (skb->dst)
350 skb->dst->ops->update_pmtu(skb->dst, mtu);
351
YOSHIFUJI Hideaki4412ec42007-03-07 14:19:10 +0900352 df |= (old_iph->frag_off & htons(IP_DF));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700353
YOSHIFUJI Hideaki4412ec42007-03-07 14:19:10 +0900354 if ((old_iph->frag_off & htons(IP_DF))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700355 && mtu < ntohs(old_iph->tot_len)) {
356 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
357 ip_rt_put(rt);
358 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
359 goto tx_error;
360 }
361
362 /*
363 * Okay, now see if we can stuff it in the buffer as-is.
364 */
365 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
366
367 if (skb_headroom(skb) < max_headroom
368 || skb_cloned(skb) || skb_shared(skb)) {
369 struct sk_buff *new_skb =
370 skb_realloc_headroom(skb, max_headroom);
371 if (!new_skb) {
372 ip_rt_put(rt);
373 kfree_skb(skb);
374 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
375 return NF_STOLEN;
376 }
377 kfree_skb(skb);
378 skb = new_skb;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700379 old_iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700380 }
381
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -0700382 skb->transport_header = old_transport_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700383
384 /* fix old IP header checksum */
385 ip_send_check(old_iph);
386
Arnaldo Carvalho de Meloe2d1bca2007-04-10 20:46:21 -0700387 skb_push(skb, sizeof(struct iphdr));
388 skb_reset_network_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700389 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
390
391 /* drop old route */
392 dst_release(skb->dst);
393 skb->dst = &rt->u.dst;
394
395 /*
396 * Push down and install the IPIP header.
397 */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700398 iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700399 iph->version = 4;
400 iph->ihl = sizeof(struct iphdr)>>2;
401 iph->frag_off = df;
402 iph->protocol = IPPROTO_IPIP;
403 iph->tos = tos;
404 iph->daddr = rt->rt_dst;
405 iph->saddr = rt->rt_src;
406 iph->ttl = old_iph->ttl;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700407 ip_select_ident(iph, &rt->u.dst, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700408
409 /* Another hack: avoid icmp_send in ip_fragment */
410 skb->local_df = 1;
411
Herbert Xuc439cb22008-01-11 19:14:00 -0800412 ip_local_out(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700413
414 LeaveFunction(10);
415
416 return NF_STOLEN;
417
418 tx_error_icmp:
419 dst_link_failure(skb);
420 tx_error:
421 kfree_skb(skb);
422 LeaveFunction(10);
423 return NF_STOLEN;
424}
425
426
427/*
428 * Direct Routing transmitter
429 * Used for ANY protocol
430 */
431int
432ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
433 struct ip_vs_protocol *pp)
434{
435 struct rtable *rt; /* Route to the other host */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700436 struct iphdr *iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700437 int mtu;
438
439 EnterFunction(10);
440
441 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
442 goto tx_error_icmp;
443
444 /* MTU checking */
445 mtu = dst_mtu(&rt->u.dst);
YOSHIFUJI Hideaki4412ec42007-03-07 14:19:10 +0900446 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700447 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
448 ip_rt_put(rt);
449 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
450 goto tx_error;
451 }
452
453 /*
454 * Call ip_send_check because we are not sure it is called
455 * after ip_defrag. Is copy-on-write needed?
456 */
457 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
458 ip_rt_put(rt);
459 return NF_STOLEN;
460 }
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700461 ip_send_check(ip_hdr(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700462
463 /* drop old route */
464 dst_release(skb->dst);
465 skb->dst = &rt->u.dst;
466
467 /* Another hack: avoid icmp_send in ip_fragment */
468 skb->local_df = 1;
469
470 IP_VS_XMIT(skb, rt);
471
472 LeaveFunction(10);
473 return NF_STOLEN;
474
475 tx_error_icmp:
476 dst_link_failure(skb);
477 tx_error:
478 kfree_skb(skb);
479 LeaveFunction(10);
480 return NF_STOLEN;
481}
482
483
484/*
485 * ICMP packet transmitter
486 * called by the ip_vs_in_icmp
487 */
488int
489ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
490 struct ip_vs_protocol *pp, int offset)
491{
492 struct rtable *rt; /* Route to the other host */
493 int mtu;
494 int rc;
495
496 EnterFunction(10);
497
498 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
499 forwarded directly here, because there is no need to
500 translate address/port back */
501 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
502 if (cp->packet_xmit)
503 rc = cp->packet_xmit(skb, cp, pp);
504 else
505 rc = NF_ACCEPT;
506 /* do not touch skb anymore */
507 atomic_inc(&cp->in_pkts);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700508 goto out;
509 }
510
511 /*
512 * mangle and send the packet here (only for VS/NAT)
513 */
514
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700515 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700516 goto tx_error_icmp;
517
518 /* MTU checking */
519 mtu = dst_mtu(&rt->u.dst);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700520 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700521 ip_rt_put(rt);
522 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
523 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
524 goto tx_error;
525 }
526
527 /* copy-on-write the packet before mangling it */
Herbert Xuaf1e1cf2007-10-14 00:39:33 -0700528 if (!skb_make_writable(skb, offset))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700529 goto tx_error_put;
530
531 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
532 goto tx_error_put;
533
534 /* drop the old route when skb is not shared */
535 dst_release(skb->dst);
536 skb->dst = &rt->u.dst;
537
538 ip_vs_nat_icmp(skb, pp, cp, 0);
539
540 /* Another hack: avoid icmp_send in ip_fragment */
541 skb->local_df = 1;
542
543 IP_VS_XMIT(skb, rt);
544
545 rc = NF_STOLEN;
546 goto out;
547
548 tx_error_icmp:
549 dst_link_failure(skb);
550 tx_error:
551 dev_kfree_skb(skb);
552 rc = NF_STOLEN;
553 out:
554 LeaveFunction(10);
555 return rc;
556 tx_error_put:
557 ip_rt_put(rt);
558 goto tx_error;
559}