Julian Anastasov | f4bc17c | 2010-09-21 17:35:41 +0200 | [diff] [blame] | 1 | /* |
| 2 | * ip_vs_nfct.c: Netfilter connection tracking support for IPVS |
| 3 | * |
| 4 | * Portions Copyright (C) 2001-2002 |
| 5 | * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. |
| 6 | * |
| 7 | * Portions Copyright (C) 2003-2010 |
| 8 | * Julian Anastasov |
| 9 | * |
| 10 | * |
| 11 | * This code is free software; you can redistribute it and/or modify |
| 12 | * it under the terms of the GNU General Public License as published by |
| 13 | * the Free Software Foundation; either version 2 of the License, or |
| 14 | * (at your option) any later version. |
| 15 | * |
| 16 | * This program is distributed in the hope that it will be useful, |
| 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 19 | * GNU General Public License for more details. |
| 20 | * |
| 21 | * You should have received a copy of the GNU General Public License |
| 22 | * along with this program; if not, write to the Free Software |
| 23 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 24 | * |
| 25 | * |
| 26 | * Authors: |
| 27 | * Ben North <ben@redfrontdoor.org> |
| 28 | * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels |
| 29 | * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match |
| 30 | * |
| 31 | * |
| 32 | * Current status: |
| 33 | * |
| 34 | * - provide conntrack confirmation for new and related connections, by |
| 35 | * this way we can see their proper conntrack state in all hooks |
| 36 | * - support for all forwarding methods, not only NAT |
| 37 | * - FTP support (NAT), ability to support other NAT apps with expectations |
| 38 | * - to correctly create expectations for related NAT connections the proper |
| 39 | * NF conntrack support must be already installed, eg. ip_vs_ftp requires |
| 40 | * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables |
| 41 | * NAT rules are needed) |
| 42 | * - alter reply for NAT when forwarding packet in original direction: |
| 43 | * conntrack from client in NEW or RELATED (Passive FTP DATA) state or |
| 44 | * when RELATED conntrack is created from real server (Active FTP DATA) |
| 45 | * - if iptables_nat is not loaded the Passive FTP will not work (the |
| 46 | * PASV response can not be NAT-ed) but Active FTP should work |
| 47 | * |
| 48 | */ |
| 49 | |
| 50 | #define KMSG_COMPONENT "IPVS" |
| 51 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt |
| 52 | |
| 53 | #include <linux/module.h> |
| 54 | #include <linux/types.h> |
| 55 | #include <linux/kernel.h> |
| 56 | #include <linux/errno.h> |
| 57 | #include <linux/compiler.h> |
| 58 | #include <linux/vmalloc.h> |
| 59 | #include <linux/skbuff.h> |
| 60 | #include <net/ip.h> |
| 61 | #include <linux/netfilter.h> |
| 62 | #include <linux/netfilter_ipv4.h> |
| 63 | #include <net/ip_vs.h> |
| 64 | #include <net/netfilter/nf_conntrack_core.h> |
| 65 | #include <net/netfilter/nf_conntrack_expect.h> |
| 66 | #include <net/netfilter/nf_conntrack_helper.h> |
| 67 | #include <net/netfilter/nf_conntrack_zones.h> |
| 68 | |
| 69 | |
/* Debug format for an IPv4 tuple: src:port->dst:port/protonum */
#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"

/*
 * Argument list matching FMT_TUPLE.  T is a pointer to a
 * struct nf_conntrack_tuple; ports are converted to host byte order.
 */
#define ARG_TUPLE(T)						\
	&(T)->src.u3.ip, ntohs((T)->src.u.all),			\
	&(T)->dst.u3.ip, ntohs((T)->dst.u.all),			\
	(T)->dst.protonum

/*
 * Debug format for an IPVS connection:
 * caddr:cport->vaddr:vport->daddr:dport/protocol:state
 */
#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"

/*
 * Argument list matching FMT_CONN.  C is a pointer to a
 * struct ip_vs_conn; ports are converted to host byte order.
 */
#define ARG_CONN(C)						\
	&((C)->caddr.ip), ntohs((C)->cport),			\
	&((C)->vaddr.ip), ntohs((C)->vport),			\
	&((C)->daddr.ip), ntohs((C)->dport),			\
	(C)->protocol, (C)->state
| 80 | |
| 81 | void |
| 82 | ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) |
| 83 | { |
| 84 | enum ip_conntrack_info ctinfo; |
| 85 | struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); |
| 86 | struct nf_conntrack_tuple new_tuple; |
| 87 | |
| 88 | if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) || |
| 89 | nf_ct_is_dying(ct)) |
| 90 | return; |
| 91 | |
| 92 | /* Never alter conntrack for non-NAT conns */ |
| 93 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) |
| 94 | return; |
| 95 | |
| 96 | /* Alter reply only in original direction */ |
| 97 | if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) |
| 98 | return; |
| 99 | |
| 100 | /* |
| 101 | * The connection is not yet in the hashtable, so we update it. |
| 102 | * CIP->VIP will remain the same, so leave the tuple in |
| 103 | * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the |
| 104 | * real-server we will see RIP->DIP. |
| 105 | */ |
| 106 | new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; |
| 107 | /* |
| 108 | * This will also take care of UDP and other protocols. |
| 109 | */ |
| 110 | if (outin) { |
| 111 | new_tuple.src.u3 = cp->daddr; |
| 112 | if (new_tuple.dst.protonum != IPPROTO_ICMP && |
| 113 | new_tuple.dst.protonum != IPPROTO_ICMPV6) |
| 114 | new_tuple.src.u.tcp.port = cp->dport; |
| 115 | } else { |
| 116 | new_tuple.dst.u3 = cp->vaddr; |
| 117 | if (new_tuple.dst.protonum != IPPROTO_ICMP && |
| 118 | new_tuple.dst.protonum != IPPROTO_ICMPV6) |
| 119 | new_tuple.dst.u.tcp.port = cp->vport; |
| 120 | } |
| 121 | IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " |
| 122 | "ctinfo=%d, old reply=" FMT_TUPLE |
| 123 | ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", |
| 124 | __func__, ct, ct->status, ctinfo, |
| 125 | ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), |
| 126 | ARG_TUPLE(&new_tuple), ARG_CONN(cp)); |
| 127 | nf_conntrack_alter_reply(ct, &new_tuple); |
| 128 | } |
| 129 | |
/*
 * Confirm the conntrack entry attached to @skb.
 * Returns whatever nf_conntrack_confirm() returns for the packet.
 */
int ip_vs_confirm_conntrack(struct sk_buff *skb)
{
	int ret = nf_conntrack_confirm(skb);

	return ret;
}
| 134 | |
| 135 | /* |
| 136 | * Called from init_conntrack() as expectfn handler. |
| 137 | */ |
| 138 | static void ip_vs_nfct_expect_callback(struct nf_conn *ct, |
| 139 | struct nf_conntrack_expect *exp) |
| 140 | { |
| 141 | struct nf_conntrack_tuple *orig, new_reply; |
| 142 | struct ip_vs_conn *cp; |
Simon Horman | f11017e | 2010-08-22 21:37:52 +0900 | [diff] [blame] | 143 | struct ip_vs_conn_param p; |
Hans Schillstrom | 6e67e58 | 2011-01-03 14:44:57 +0100 | [diff] [blame] | 144 | struct net *net = nf_ct_net(ct); |
Julian Anastasov | f4bc17c | 2010-09-21 17:35:41 +0200 | [diff] [blame] | 145 | |
| 146 | if (exp->tuple.src.l3num != PF_INET) |
| 147 | return; |
| 148 | |
| 149 | /* |
| 150 | * We assume that no NF locks are held before this callback. |
| 151 | * ip_vs_conn_out_get and ip_vs_conn_in_get should match their |
| 152 | * expectations even if they use wildcard values, now we provide the |
| 153 | * actual values from the newly created original conntrack direction. |
| 154 | * The conntrack is confirmed when packet reaches IPVS hooks. |
| 155 | */ |
| 156 | |
| 157 | /* RS->CLIENT */ |
| 158 | orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; |
Hans Schillstrom | 6e67e58 | 2011-01-03 14:44:57 +0100 | [diff] [blame] | 159 | ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum, |
Simon Horman | f11017e | 2010-08-22 21:37:52 +0900 | [diff] [blame] | 160 | &orig->src.u3, orig->src.u.tcp.port, |
| 161 | &orig->dst.u3, orig->dst.u.tcp.port, &p); |
| 162 | cp = ip_vs_conn_out_get(&p); |
Julian Anastasov | f4bc17c | 2010-09-21 17:35:41 +0200 | [diff] [blame] | 163 | if (cp) { |
| 164 | /* Change reply CLIENT->RS to CLIENT->VS */ |
| 165 | new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; |
| 166 | IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " |
| 167 | FMT_TUPLE ", found inout cp=" FMT_CONN "\n", |
| 168 | __func__, ct, ct->status, |
| 169 | ARG_TUPLE(orig), ARG_TUPLE(&new_reply), |
| 170 | ARG_CONN(cp)); |
| 171 | new_reply.dst.u3 = cp->vaddr; |
| 172 | new_reply.dst.u.tcp.port = cp->vport; |
| 173 | IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE |
| 174 | ", inout cp=" FMT_CONN "\n", |
| 175 | __func__, ct, |
| 176 | ARG_TUPLE(orig), ARG_TUPLE(&new_reply), |
| 177 | ARG_CONN(cp)); |
| 178 | goto alter; |
| 179 | } |
| 180 | |
| 181 | /* CLIENT->VS */ |
Simon Horman | f11017e | 2010-08-22 21:37:52 +0900 | [diff] [blame] | 182 | cp = ip_vs_conn_in_get(&p); |
Julian Anastasov | f4bc17c | 2010-09-21 17:35:41 +0200 | [diff] [blame] | 183 | if (cp) { |
| 184 | /* Change reply VS->CLIENT to RS->CLIENT */ |
| 185 | new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; |
| 186 | IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " |
| 187 | FMT_TUPLE ", found outin cp=" FMT_CONN "\n", |
| 188 | __func__, ct, ct->status, |
| 189 | ARG_TUPLE(orig), ARG_TUPLE(&new_reply), |
| 190 | ARG_CONN(cp)); |
| 191 | new_reply.src.u3 = cp->daddr; |
| 192 | new_reply.src.u.tcp.port = cp->dport; |
| 193 | IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " |
| 194 | FMT_TUPLE ", outin cp=" FMT_CONN "\n", |
| 195 | __func__, ct, |
| 196 | ARG_TUPLE(orig), ARG_TUPLE(&new_reply), |
| 197 | ARG_CONN(cp)); |
| 198 | goto alter; |
| 199 | } |
| 200 | |
| 201 | IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE |
| 202 | " - unknown expect\n", |
| 203 | __func__, ct, ct->status, ARG_TUPLE(orig)); |
| 204 | return; |
| 205 | |
| 206 | alter: |
| 207 | /* Never alter conntrack for non-NAT conns */ |
| 208 | if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) |
| 209 | nf_conntrack_alter_reply(ct, &new_reply); |
| 210 | ip_vs_conn_put(cp); |
| 211 | return; |
| 212 | } |
| 213 | |
| 214 | /* |
| 215 | * Create NF conntrack expectation with wildcard (optional) source port. |
| 216 | * Then the default callback function will alter the reply and will confirm |
| 217 | * the conntrack entry when the first packet comes. |
| 218 | * Use port 0 to expect connection from any port. |
| 219 | */ |
| 220 | void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, |
| 221 | struct ip_vs_conn *cp, u_int8_t proto, |
| 222 | const __be16 port, int from_rs) |
| 223 | { |
| 224 | struct nf_conntrack_expect *exp; |
| 225 | |
| 226 | if (ct == NULL || nf_ct_is_untracked(ct)) |
| 227 | return; |
| 228 | |
| 229 | exp = nf_ct_expect_alloc(ct); |
| 230 | if (!exp) |
| 231 | return; |
| 232 | |
| 233 | nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), |
| 234 | from_rs ? &cp->daddr : &cp->caddr, |
| 235 | from_rs ? &cp->caddr : &cp->vaddr, |
| 236 | proto, port ? &port : NULL, |
| 237 | from_rs ? &cp->cport : &cp->vport); |
| 238 | |
| 239 | exp->expectfn = ip_vs_nfct_expect_callback; |
| 240 | |
| 241 | IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", |
| 242 | __func__, ct, ARG_TUPLE(&exp->tuple)); |
| 243 | nf_ct_expect_related(exp); |
| 244 | nf_ct_expect_put(exp); |
| 245 | } |
| 246 | EXPORT_SYMBOL(ip_vs_nfct_expect_related); |
| 247 | |
| 248 | /* |
| 249 | * Our connection was terminated, try to drop the conntrack immediately |
| 250 | */ |
| 251 | void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) |
| 252 | { |
| 253 | struct nf_conntrack_tuple_hash *h; |
| 254 | struct nf_conn *ct; |
| 255 | struct nf_conntrack_tuple tuple; |
| 256 | |
| 257 | if (!cp->cport) |
| 258 | return; |
| 259 | |
| 260 | tuple = (struct nf_conntrack_tuple) { |
| 261 | .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; |
| 262 | tuple.src.u3 = cp->caddr; |
| 263 | tuple.src.u.all = cp->cport; |
| 264 | tuple.src.l3num = cp->af; |
| 265 | tuple.dst.u3 = cp->vaddr; |
| 266 | tuple.dst.u.all = cp->vport; |
| 267 | |
| 268 | IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE |
| 269 | " for conn " FMT_CONN "\n", |
| 270 | __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); |
| 271 | |
Hans Schillstrom | 6e67e58 | 2011-01-03 14:44:57 +0100 | [diff] [blame] | 272 | h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE, |
| 273 | &tuple); |
Julian Anastasov | f4bc17c | 2010-09-21 17:35:41 +0200 | [diff] [blame] | 274 | if (h) { |
| 275 | ct = nf_ct_tuplehash_to_ctrack(h); |
| 276 | /* Show what happens instead of calling nf_ct_kill() */ |
| 277 | if (del_timer(&ct->timeout)) { |
| 278 | IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" |
| 279 | FMT_TUPLE "\n", |
| 280 | __func__, ct, ARG_TUPLE(&tuple)); |
| 281 | if (ct->timeout.function) |
| 282 | ct->timeout.function(ct->timeout.data); |
| 283 | } else { |
| 284 | IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" |
| 285 | FMT_TUPLE "\n", |
| 286 | __func__, ct, ARG_TUPLE(&tuple)); |
| 287 | } |
| 288 | nf_ct_put(ct); |
| 289 | } else { |
| 290 | IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", |
| 291 | __func__, ARG_TUPLE(&tuple)); |
| 292 | } |
| 293 | } |
| 294 | |