Merge branch 'nf-ingress'
Pablo Neira Ayuso says:
====================
Netfilter ingress support (v4)
This is the v4 round of patches to add the Netfilter ingress hook, it basically
comes in two steps:
1) Add the CONFIG_NET_INGRESS switch to wrap the ingress static key around it.
The idea is to use the same global static key to avoid adding more code to
the hot path.
2) Add the Netfilter ingress hook after the tc ingress hook, under the global
ingress_needed static key. As I said, the netfilter ingress hook also has
its own static key, that is nested under the global static key. Please, see
patch 5/5 for performance numbers and more information.
I originally started this next round, as it was suggested, exploring the
independent static key for netfilter ingress just after tc ingress, but the
results that I gathered from that patch are not good for non-users:
Result: OK: 6425927(c6425843+d83) usec, 100000000 (60byte,0frags)
15561955pps 7469Mb/sec (7469738400bps) errors: 100000000
this roughly means 500Kpps less performance wrt. the base numbers, so that's
the reason why I discarded that approach and I focused on this.
The idea of this patchset is to open the window to nf_tables, which comes with
features that will work out-of-the-box (once the boiler plate code to support
the 'netdev' table family is in place), to avoid repeating myself [1], the most
relevant features are:
1) Multi-dimensional key dictionary lookups.
2) Arbitrary stateful flow tables.
3) Transactions and good support for dynamic updates.
But there are also interest aspects to consider from userspace, such as the
ability to support new layer 2 protocols without kernel updates, a well-defined
netlink interface, userspace libraries and utilities for third party
applications, among others.
I hope we can be happy with this approach.
Please, apply. Thanks.
[1] http://marc.info/?l=netfilter-devel&m=143033337020328&w=2
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d3ed01c..51f8d2f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1656,6 +1656,9 @@
struct tcf_proto __rcu *ingress_cl_list;
#endif
struct netdev_queue __rcu *ingress_queue;
+#ifdef CONFIG_NETFILTER_INGRESS
+ struct list_head nf_hooks_ingress;
+#endif
unsigned char broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 63560d0..f5ff5d1 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -54,10 +54,12 @@
struct net_device *in;
struct net_device *out;
struct sock *sk;
+ struct list_head *hook_list;
int (*okfn)(struct sock *, struct sk_buff *);
};
static inline void nf_hook_state_init(struct nf_hook_state *p,
+ struct list_head *hook_list,
unsigned int hook,
int thresh, u_int8_t pf,
struct net_device *indev,
@@ -71,6 +73,7 @@
p->in = indev;
p->out = outdev;
p->sk = sk;
+ p->hook_list = hook_list;
p->okfn = okfn;
}
@@ -79,16 +82,17 @@
const struct nf_hook_state *state);
struct nf_hook_ops {
- struct list_head list;
+ struct list_head list;
/* User fills in from here down. */
- nf_hookfn *hook;
- struct module *owner;
- void *priv;
- u_int8_t pf;
- unsigned int hooknum;
+ nf_hookfn *hook;
+ struct net_device *dev;
+ struct module *owner;
+ void *priv;
+ u_int8_t pf;
+ unsigned int hooknum;
/* Hooks are ordered in ascending priority. */
- int priority;
+ int priority;
};
struct nf_sockopt_ops {
@@ -131,26 +135,33 @@
#ifdef HAVE_JUMP_LABEL
extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
-static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
+static inline bool nf_hook_list_active(struct list_head *nf_hook_list,
+ u_int8_t pf, unsigned int hook)
{
if (__builtin_constant_p(pf) &&
__builtin_constant_p(hook))
return static_key_false(&nf_hooks_needed[pf][hook]);
- return !list_empty(&nf_hooks[pf][hook]);
+ return !list_empty(nf_hook_list);
}
#else
-static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
+static inline bool nf_hook_list_active(struct list_head *nf_hook_list,
+ u_int8_t pf, unsigned int hook)
{
- return !list_empty(&nf_hooks[pf][hook]);
+ return !list_empty(nf_hook_list);
}
#endif
+static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
+{
+ return nf_hook_list_active(&nf_hooks[pf][hook], pf, hook);
+}
+
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state);
/**
* nf_hook_thresh - call a netfilter hook
- *
+ *
* Returns 1 if the hook has allowed the packet to pass. The function
* okfn must be invoked by the caller in this case. Any other return
* value indicates the packet has been consumed by the hook.
@@ -166,8 +177,8 @@
if (nf_hooks_active(pf, hook)) {
struct nf_hook_state state;
- nf_hook_state_init(&state, hook, thresh, pf,
- indev, outdev, sk, okfn);
+ nf_hook_state_init(&state, &nf_hooks[pf][hook], hook, thresh,
+ pf, indev, outdev, sk, okfn);
return nf_hook_slow(skb, &state);
}
return 1;
diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
new file mode 100644
index 0000000..cb0727f
--- /dev/null
+++ b/include/linux/netfilter_ingress.h
@@ -0,0 +1,41 @@
+#ifndef _NETFILTER_INGRESS_H_
+#define _NETFILTER_INGRESS_H_
+
+#include <linux/netfilter.h>
+#include <linux/netdevice.h>
+
+#ifdef CONFIG_NETFILTER_INGRESS
+static inline int nf_hook_ingress_active(struct sk_buff *skb)
+{
+ return nf_hook_list_active(&skb->dev->nf_hooks_ingress,
+ NFPROTO_NETDEV, NF_NETDEV_INGRESS);
+}
+
+static inline int nf_hook_ingress(struct sk_buff *skb)
+{
+ struct nf_hook_state state;
+
+ nf_hook_state_init(&state, &skb->dev->nf_hooks_ingress,
+ NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV, NULL,
+ skb->dev, NULL, NULL);
+ return nf_hook_slow(skb, &state);
+}
+
+static inline void nf_hook_ingress_init(struct net_device *dev)
+{
+ INIT_LIST_HEAD(&dev->nf_hooks_ingress);
+}
+#else /* CONFIG_NETFILTER_INGRESS */
+static inline int nf_hook_ingress_active(struct sk_buff *skb)
+{
+ return 0;
+}
+
+static inline int nf_hook_ingress(struct sk_buff *skb)
+{
+ return 0;
+}
+
+static inline void nf_hook_ingress_init(struct net_device *dev) {}
+#endif /* CONFIG_NETFILTER_INGRESS */
+#endif /* _NETFILTER_INGRESS_H_ */
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index bd29ab4..a2324fb 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -79,7 +79,7 @@
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
void net_inc_ingress_queue(void);
void net_dec_ingress_queue(void);
#endif
diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h
index ef1b1f8..177027c 100644
--- a/include/uapi/linux/netfilter.h
+++ b/include/uapi/linux/netfilter.h
@@ -51,11 +51,17 @@
NF_INET_NUMHOOKS
};
+enum nf_dev_hooks {
+ NF_NETDEV_INGRESS,
+ NF_NETDEV_NUMHOOKS
+};
+
enum {
NFPROTO_UNSPEC = 0,
NFPROTO_INET = 1,
NFPROTO_IPV4 = 2,
NFPROTO_ARP = 3,
+ NFPROTO_NETDEV = 5,
NFPROTO_BRIDGE = 7,
NFPROTO_IPV6 = 10,
NFPROTO_DECNET = 12,
diff --git a/net/Kconfig b/net/Kconfig
index 44dd578..57a7c5a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -45,6 +45,9 @@
Newly written code should NEVER need this option but do
compat-independent messages instead!
+config NET_INGRESS
+ bool
+
menu "Networking options"
source "net/packet/Kconfig"
diff --git a/net/core/dev.c b/net/core/dev.c
index af54906..29f0d6e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,6 +135,7 @@
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
+#include <linux/netfilter_ingress.h>
#include "net-sysfs.h"
@@ -1630,7 +1631,7 @@
}
EXPORT_SYMBOL(call_netdevice_notifiers);
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;
void net_inc_ingress_queue(void)
@@ -3666,6 +3667,13 @@
return skb;
}
+#else
+static inline struct sk_buff *handle_ing(struct sk_buff *skb,
+ struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+ return skb;
+}
#endif
/**
@@ -3739,6 +3747,28 @@
}
}
+#ifdef CONFIG_NETFILTER_INGRESS
+static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+ if (nf_hook_ingress_active(skb)) {
+ if (*pt_prev) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ return nf_hook_ingress(skb);
+ }
+ return 0;
+}
+#else
+static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
+ int *ret, struct net_device *orig_dev)
+{
+ return 0;
+}
+#endif
+
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
struct packet_type *ptype, *pt_prev;
@@ -3798,13 +3828,17 @@
}
skip_taps:
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
if (static_key_false(&ingress_needed)) {
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto unlock;
- }
+ if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+ goto unlock;
+ }
+#endif
+#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = 0;
ncls:
#endif
@@ -6967,6 +7001,9 @@
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
+
+ nf_hook_ingress_init(dev);
+
return dev;
free_all:
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f70e34a..db1c674 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1,6 +1,13 @@
menu "Core Netfilter Configuration"
depends on NET && INET && NETFILTER
+config NETFILTER_INGRESS
+ bool "Netfilter ingress support"
+ select NET_INGRESS
+ help
+ This allows you to classify packets from ingress using the Netfilter
+ infrastructure.
+
config NETFILTER_NETLINK
tristate
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index e616301..653e32e 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -64,10 +64,27 @@
int nf_register_hook(struct nf_hook_ops *reg)
{
+ struct list_head *nf_hook_list;
struct nf_hook_ops *elem;
mutex_lock(&nf_hook_mutex);
- list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
+ switch (reg->pf) {
+ case NFPROTO_NETDEV:
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (reg->hooknum == NF_NETDEV_INGRESS) {
+ BUG_ON(reg->dev == NULL);
+ nf_hook_list = ®->dev->nf_hooks_ingress;
+ net_inc_ingress_queue();
+ break;
+ }
+#endif
+ /* Fall through. */
+ default:
+ nf_hook_list = &nf_hooks[reg->pf][reg->hooknum];
+ break;
+ }
+
+ list_for_each_entry(elem, nf_hook_list, list) {
if (reg->priority < elem->priority)
break;
}
@@ -85,6 +102,18 @@
mutex_lock(&nf_hook_mutex);
list_del_rcu(®->list);
mutex_unlock(&nf_hook_mutex);
+ switch (reg->pf) {
+ case NFPROTO_NETDEV:
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (reg->hooknum == NF_NETDEV_INGRESS) {
+ net_dec_ingress_queue();
+ break;
+ }
+ break;
+#endif
+ default:
+ break;
+ }
#ifdef HAVE_JUMP_LABEL
static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
#endif
@@ -166,11 +195,9 @@
/* We may already have this, but read-locks nest anyway */
rcu_read_lock();
- elem = list_entry_rcu(&nf_hooks[state->pf][state->hook],
- struct nf_hook_ops, list);
+ elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list);
next_hook:
- verdict = nf_iterate(&nf_hooks[state->pf][state->hook], skb, state,
- &elem);
+ verdict = nf_iterate(state->hook_list, skb, state, &elem);
if (verdict == NF_ACCEPT || verdict == NF_STOP) {
ret = 1;
} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 5fd1c2f..daa3343 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -312,6 +312,7 @@
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
+ select NET_INGRESS
---help---
Say Y here if you want to use classifiers for incoming packets.
If unsure, say Y.