/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
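
/*
 * Usage sketch (illustrative only, not a caller in this file): under the
 * rules above, a pure reader can walk the device list inside an RCU
 * section instead of taking dev_base_lock; net and dev here are locals
 * of the hypothetical caller.
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(net, dev)
 *		pr_info("saw device %s\n", dev->name);
 *	rcu_read_unlock();
 */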

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that all
 *	CPUs that are in the middle of receiving packets will see the new
 *	packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
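
/*
 * Usage sketch (illustrative only; my_pt and my_rcv are hypothetical
 * names, not defined in this file): a tap that wants to see every
 * received frame declares a &packet_type and registers it here.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_pt __read_mostly = {
 *		.type	= htons(ETH_P_ALL),
 *		.func	= my_rcv,
 *	};
 *
 *	dev_add_pack(&my_pt);		(typically from module init)
 *	...
 *	dev_remove_pack(&my_pt);	(and from module exit)
 */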

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that all
 *	CPUs that are in the middle of receiving packets will see the new
 *	offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;

	spin_lock(&offload_lock);
	list_add_rcu(&po->list, head);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(__dev_remove_offload);

/**
 *	dev_remove_offload - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
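
/*
 * Usage sketch (illustrative only): the refcounted and the RCU flavours
 * of the name lookup; "eth0" is just an example name and net/dev are
 * locals of the hypothetical caller.
 *
 *	dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		...	use dev, sleeping is allowed	...
 *		dev_put(dev);
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		...	use dev, no dev_put(), no sleeping	...
 *	rcu_read_unlock();
 */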

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
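
/*
 * Usage sketch (illustrative only): look up an Ethernet device by MAC
 * address; mac is a hypothetical ETH_ALEN byte array supplied by the
 * caller, which must already hold RCU (or RTNL).
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
 *	if (dev)
 *		netdev_info(dev, "owns that hardware address\n");
 *	rcu_read_unlock();
 */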

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
				    unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
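
/*
 * Usage sketch (illustrative only): a driver that registers devices
 * named "foo%d" can let this helper pick the first free unit number
 * before register_netdev(); "foo%d" is a made-up format string.
 *
 *	err = dev_alloc_name(dev, "foo%d");
 *	if (err < 0)
 *		goto out_free;
 *	(dev->name is now e.g. "foo0")
 */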

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 *	Generate traffic such that interested network peers are aware of
 *	@dev, such as by generating a gratuitous ARP. This may be used when
 *	a device wants to inform the rest of the network about some sort of
 *	reconfiguration such as a failover event or virtual machine
 *	migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
Or Gerlitzc1da4ac2008-06-13 18:12:00 -07001228
Patrick McHardybd380812010-02-26 06:34:53 +00001229static int __dev_open(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001230{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001231 const struct net_device_ops *ops = dev->netdev_ops;
Johannes Berg3b8bcfd2009-05-30 01:39:53 +02001232 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001233
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001234 ASSERT_RTNL();
1235
Linus Torvalds1da177e2005-04-16 15:20:36 -07001236 if (!netif_device_present(dev))
1237 return -ENODEV;
1238
Neil Hormanca99ca12013-02-05 08:05:43 +00001239 /* Block netpoll from trying to do any rx path servicing.
1240 * If we don't do this there is a chance ndo_poll_controller
1241 * or ndo_poll may be running while we open the device
1242 */
dingtianhongda6e3782013-05-27 19:53:31 +00001243 netpoll_rx_disable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001244
Johannes Berg3b8bcfd2009-05-30 01:39:53 +02001245 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1246 ret = notifier_to_errno(ret);
1247 if (ret)
1248 return ret;
1249
Linus Torvalds1da177e2005-04-16 15:20:36 -07001250 set_bit(__LINK_STATE_START, &dev->state);
Jeff Garzikbada3392007-10-23 20:19:37 -07001251
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001252 if (ops->ndo_validate_addr)
1253 ret = ops->ndo_validate_addr(dev);
Jeff Garzikbada3392007-10-23 20:19:37 -07001254
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001255 if (!ret && ops->ndo_open)
1256 ret = ops->ndo_open(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001257
Neil Hormanca99ca12013-02-05 08:05:43 +00001258 netpoll_rx_enable(dev);
1259
Jeff Garzikbada3392007-10-23 20:19:37 -07001260 if (ret)
1261 clear_bit(__LINK_STATE_START, &dev->state);
1262 else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001263 dev->flags |= IFF_UP;
David S. Millerb4bd07c2009-02-06 22:06:43 -08001264 net_dmaengine_get();
Patrick McHardy4417da62007-06-27 01:28:10 -07001265 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001266 dev_activate(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04001267 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001268 }
Jeff Garzikbada3392007-10-23 20:19:37 -07001269
Linus Torvalds1da177e2005-04-16 15:20:36 -07001270 return ret;
1271}
Patrick McHardybd380812010-02-26 06:34:53 +00001272
1273/**
1274 * dev_open - prepare an interface for use.
1275 * @dev: device to open
1276 *
1277 * Takes a device from down to up state. The device's private open
1278 * function is invoked and then the multicast lists are loaded. Finally
1279 * the device is moved into the up state and a %NETDEV_UP message is
1280 * sent to the netdev notifier chain.
1281 *
1282 * Calling this function on an active interface is a nop. On a failure
1283 * a negative errno code is returned.
1284 */
1285int dev_open(struct net_device *dev)
1286{
1287 int ret;
1288
Patrick McHardybd380812010-02-26 06:34:53 +00001289 if (dev->flags & IFF_UP)
1290 return 0;
1291
Patrick McHardybd380812010-02-26 06:34:53 +00001292 ret = __dev_open(dev);
1293 if (ret < 0)
1294 return ret;
1295
Alexei Starovoitov7f294052013-10-23 16:02:42 -07001296 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
Patrick McHardybd380812010-02-26 06:34:53 +00001297 call_netdevice_notifiers(NETDEV_UP, dev);
1298
1299 return ret;
1300}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001301EXPORT_SYMBOL(dev_open);
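/*
 * Illustrative sketch, not part of dev.c: bringing an interface up from
 * kernel code.  dev_open() must be called with the RTNL held; it returns 0
 * if the device is already up.  example_bring_up() is a made-up helper and
 * assumes <linux/netdevice.h> / <linux/rtnetlink.h>.
 */
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);
	rtnl_unlock();

	return err;
}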
Linus Torvalds1da177e2005-04-16 15:20:36 -07001302
Octavian Purdila44345722010-12-13 12:44:07 +00001303static int __dev_close_many(struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001304{
Octavian Purdila44345722010-12-13 12:44:07 +00001305 struct net_device *dev;
Patrick McHardybd380812010-02-26 06:34:53 +00001306
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001307 ASSERT_RTNL();
David S. Miller9d5010d2007-09-12 14:33:25 +02001308 might_sleep();
1309
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001310 list_for_each_entry(dev, head, close_list) {
Octavian Purdila44345722010-12-13 12:44:07 +00001311 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001312
Octavian Purdila44345722010-12-13 12:44:07 +00001313 clear_bit(__LINK_STATE_START, &dev->state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001314
Octavian Purdila44345722010-12-13 12:44:07 +00001315		/* Synchronize to scheduled poll. We cannot touch the poll list; it
 1316		 * may even be on a different CPU. So just clear netif_running().
1317 *
 1318		 * dev->stop() will invoke napi_disable() on all of its
1319 * napi_struct instances on this device.
1320 */
1321 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1322 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001323
Octavian Purdila44345722010-12-13 12:44:07 +00001324 dev_deactivate_many(head);
1325
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001326 list_for_each_entry(dev, head, close_list) {
Octavian Purdila44345722010-12-13 12:44:07 +00001327 const struct net_device_ops *ops = dev->netdev_ops;
1328
1329 /*
 1330		 *	Call the device-specific close. This cannot fail.
 1331		 *	It is only done if the device is UP.
1332 *
1333 * We allow it to be called even after a DETACH hot-plug
1334 * event.
1335 */
1336 if (ops->ndo_stop)
1337 ops->ndo_stop(dev);
1338
Octavian Purdila44345722010-12-13 12:44:07 +00001339 dev->flags &= ~IFF_UP;
Octavian Purdila44345722010-12-13 12:44:07 +00001340 net_dmaengine_put();
1341 }
1342
1343 return 0;
1344}
1345
1346static int __dev_close(struct net_device *dev)
1347{
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001348 int retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001349 LIST_HEAD(single);
1350
Neil Hormanca99ca12013-02-05 08:05:43 +00001351 /* Temporarily disable netpoll until the interface is down */
dingtianhongda6e3782013-05-27 19:53:31 +00001352 netpoll_rx_disable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001353
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001354 list_add(&dev->close_list, &single);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001355 retval = __dev_close_many(&single);
1356 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001357
1358 netpoll_rx_enable(dev);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001359 return retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001360}
1361
Eric Dumazet3fbd8752011-01-19 21:23:22 +00001362static int dev_close_many(struct list_head *head)
Octavian Purdila44345722010-12-13 12:44:07 +00001363{
1364 struct net_device *dev, *tmp;
Octavian Purdila44345722010-12-13 12:44:07 +00001365
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001366 /* Remove the devices that don't need to be closed */
1367 list_for_each_entry_safe(dev, tmp, head, close_list)
Octavian Purdila44345722010-12-13 12:44:07 +00001368 if (!(dev->flags & IFF_UP))
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001369 list_del_init(&dev->close_list);
Octavian Purdila44345722010-12-13 12:44:07 +00001370
1371 __dev_close_many(head);
Matti Linnanvuorid8b2a4d2008-02-12 23:10:11 -08001372
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001373 list_for_each_entry_safe(dev, tmp, head, close_list) {
Alexei Starovoitov7f294052013-10-23 16:02:42 -07001374 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
Octavian Purdila44345722010-12-13 12:44:07 +00001375 call_netdevice_notifiers(NETDEV_DOWN, dev);
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001376 list_del_init(&dev->close_list);
Octavian Purdila44345722010-12-13 12:44:07 +00001377 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001378
Linus Torvalds1da177e2005-04-16 15:20:36 -07001379 return 0;
1380}
Patrick McHardybd380812010-02-26 06:34:53 +00001381
1382/**
1383 * dev_close - shutdown an interface.
1384 * @dev: device to shutdown
1385 *
1386 * This function moves an active device into down state. A
1387 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1388 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1389 * chain.
1390 */
1391int dev_close(struct net_device *dev)
1392{
Eric Dumazete14a5992011-05-10 12:26:06 -07001393 if (dev->flags & IFF_UP) {
1394 LIST_HEAD(single);
Patrick McHardybd380812010-02-26 06:34:53 +00001395
Neil Hormanca99ca12013-02-05 08:05:43 +00001396 /* Block netpoll rx while the interface is going down */
dingtianhongda6e3782013-05-27 19:53:31 +00001397 netpoll_rx_disable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001398
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001399 list_add(&dev->close_list, &single);
Eric Dumazete14a5992011-05-10 12:26:06 -07001400 dev_close_many(&single);
1401 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001402
1403 netpoll_rx_enable(dev);
Eric Dumazete14a5992011-05-10 12:26:06 -07001404 }
dingtianhongda6e3782013-05-27 19:53:31 +00001405 return 0;
Patrick McHardybd380812010-02-26 06:34:53 +00001406}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001407EXPORT_SYMBOL(dev_close);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001408
1409
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001410/**
1411 * dev_disable_lro - disable Large Receive Offload on a device
1412 * @dev: device
1413 *
1414 * Disable Large Receive Offload (LRO) on a net device. Must be
1415 * called under RTNL. This is needed if received packets may be
1416 * forwarded to another interface.
1417 */
1418void dev_disable_lro(struct net_device *dev)
1419{
Neil Hormanf11970e2011-05-24 08:31:09 +00001420 /*
 1421	 * If we're trying to disable LRO on a vlan device,
 1422	 * use the underlying physical device instead.
1423 */
1424 if (is_vlan_dev(dev))
1425 dev = vlan_dev_real_dev(dev);
1426
Michał Mirosławbc5787c62011-11-15 15:29:55 +00001427 dev->wanted_features &= ~NETIF_F_LRO;
1428 netdev_update_features(dev);
Michał Mirosław27660512011-03-18 16:56:34 +00001429
Michał Mirosław22d59692011-04-21 12:42:15 +00001430 if (unlikely(dev->features & NETIF_F_LRO))
1431 netdev_WARN(dev, "failed to disable LRO!\n");
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001432}
1433EXPORT_SYMBOL(dev_disable_lro);
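/*
 * Illustrative sketch, not part of dev.c: a hypothetical stacked device
 * (bridge/bond style) disabling LRO on a lower device whose packets it may
 * forward.  The caller is assumed to hold the RTNL already, as required by
 * dev_disable_lro(); example_enslave() is a made-up function.
 */
static int example_enslave(struct net_device *upper, struct net_device *lower)
{
	ASSERT_RTNL();

	/* LRO-merged frames must not be forwarded, so turn LRO off. */
	dev_disable_lro(lower);

	/* ... remaining (hypothetical) enslave steps would follow ... */
	return 0;
}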
1434
Jiri Pirko351638e2013-05-28 01:30:21 +00001435static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1436 struct net_device *dev)
1437{
1438 struct netdev_notifier_info info;
1439
1440 netdev_notifier_info_init(&info, dev);
1441 return nb->notifier_call(nb, val, &info);
1442}
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001443
Eric W. Biederman881d9662007-09-17 11:56:21 -07001444static int dev_boot_phase = 1;
1445
Linus Torvalds1da177e2005-04-16 15:20:36 -07001446/**
1447 * register_netdevice_notifier - register a network notifier block
1448 * @nb: notifier
1449 *
1450 * Register a notifier to be called when network device events occur.
1451 * The notifier passed is linked into the kernel structures and must
1452 * not be reused until it has been unregistered. A negative errno code
1453 * is returned on a failure.
1454 *
 1455 * When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001456 * to the new notifier to allow the device to have a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001457 * view of the network device list.
1458 */
1459
1460int register_netdevice_notifier(struct notifier_block *nb)
1461{
1462 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001463 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001464 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001465 int err;
1466
1467 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001468 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001469 if (err)
1470 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001471 if (dev_boot_phase)
1472 goto unlock;
1473 for_each_net(net) {
1474 for_each_netdev(net, dev) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001475 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001476 err = notifier_to_errno(err);
1477 if (err)
1478 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479
Eric W. Biederman881d9662007-09-17 11:56:21 -07001480 if (!(dev->flags & IFF_UP))
1481 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001482
Jiri Pirko351638e2013-05-28 01:30:21 +00001483 call_netdevice_notifier(nb, NETDEV_UP, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001484 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001485 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001486
1487unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001488 rtnl_unlock();
1489 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001490
1491rollback:
1492 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001493 for_each_net(net) {
1494 for_each_netdev(net, dev) {
1495 if (dev == last)
RongQing.Li8f891482011-11-30 23:43:07 -05001496 goto outroll;
Herbert Xufcc5a032007-07-30 17:03:38 -07001497
Eric W. Biederman881d9662007-09-17 11:56:21 -07001498 if (dev->flags & IFF_UP) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001499 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1500 dev);
1501 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001502 }
Jiri Pirko351638e2013-05-28 01:30:21 +00001503 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001504 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001505 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001506
RongQing.Li8f891482011-11-30 23:43:07 -05001507outroll:
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001508 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001509 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001510}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001511EXPORT_SYMBOL(register_netdevice_notifier);
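/*
 * Illustrative sketch, not part of dev.c: a minimal netdevice notifier.
 * As documented above, NETDEV_REGISTER/NETDEV_UP events are replayed for
 * existing devices at registration time, so the callback sees every device.
 * example_netdev_event/example_nb are made-up names for this sketch.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_netdev_event,
};

/* In module init/exit:  register_netdevice_notifier(&example_nb);   */
/*                       unregister_netdevice_notifier(&example_nb); */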
Linus Torvalds1da177e2005-04-16 15:20:36 -07001512
1513/**
1514 * unregister_netdevice_notifier - unregister a network notifier block
1515 * @nb: notifier
1516 *
1517 * Unregister a notifier previously registered by
 1518 * register_netdevice_notifier(). The notifier is unlinked from the
1519 * kernel structures and may then be reused. A negative errno code
1520 * is returned on a failure.
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001521 *
 1522 * After unregistering, unregister and down device events are synthesized
 1523 * for all devices on the device list and sent to the removed notifier,
 1524 * removing the need for special-case cleanup code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525 */
1526
1527int unregister_netdevice_notifier(struct notifier_block *nb)
1528{
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001529 struct net_device *dev;
1530 struct net *net;
Herbert Xu9f514952006-03-25 01:24:25 -08001531 int err;
1532
1533 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001534 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001535 if (err)
1536 goto unlock;
1537
1538 for_each_net(net) {
1539 for_each_netdev(net, dev) {
1540 if (dev->flags & IFF_UP) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001541 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1542 dev);
1543 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001544 }
Jiri Pirko351638e2013-05-28 01:30:21 +00001545 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001546 }
1547 }
1548unlock:
Herbert Xu9f514952006-03-25 01:24:25 -08001549 rtnl_unlock();
1550 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001551}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001552EXPORT_SYMBOL(unregister_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001553
1554/**
Jiri Pirko351638e2013-05-28 01:30:21 +00001555 * call_netdevice_notifiers_info - call all network notifier blocks
1556 * @val: value passed unmodified to notifier function
1557 * @dev: net_device pointer passed unmodified to notifier function
1558 * @info: notifier information data
1559 *
1560 * Call all network notifier blocks. Parameters and return value
1561 * are as for raw_notifier_call_chain().
1562 */
1563
1564int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1565 struct netdev_notifier_info *info)
1566{
1567 ASSERT_RTNL();
1568 netdev_notifier_info_init(info, dev);
1569 return raw_notifier_call_chain(&netdev_chain, val, info);
1570}
1571EXPORT_SYMBOL(call_netdevice_notifiers_info);
1572
1573/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001574 * call_netdevice_notifiers - call all network notifier blocks
1575 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001576 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001577 *
1578 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001579 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001580 */
1581
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001582int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001583{
Jiri Pirko351638e2013-05-28 01:30:21 +00001584 struct netdev_notifier_info info;
1585
1586 return call_netdevice_notifiers_info(val, dev, &info);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001587}
stephen hemmingeredf947f2011-03-24 13:24:01 +00001588EXPORT_SYMBOL(call_netdevice_notifiers);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589
Ingo Molnarc5905af2012-02-24 08:31:31 +01001590static struct static_key netstamp_needed __read_mostly;
Eric Dumazetb90e5792011-11-28 11:16:50 +00001591#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01001592/* We are not allowed to call static_key_slow_dec() from irq context
Eric Dumazetb90e5792011-11-28 11:16:50 +00001593 * If net_disable_timestamp() is called from irq context, defer the
Ingo Molnarc5905af2012-02-24 08:31:31 +01001594 * static_key_slow_dec() calls.
Eric Dumazetb90e5792011-11-28 11:16:50 +00001595 */
1596static atomic_t netstamp_needed_deferred;
1597#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598
1599void net_enable_timestamp(void)
1600{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001601#ifdef HAVE_JUMP_LABEL
1602 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1603
1604 if (deferred) {
1605 while (--deferred)
Ingo Molnarc5905af2012-02-24 08:31:31 +01001606 static_key_slow_dec(&netstamp_needed);
Eric Dumazetb90e5792011-11-28 11:16:50 +00001607 return;
1608 }
1609#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001610 static_key_slow_inc(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001611}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001612EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001613
1614void net_disable_timestamp(void)
1615{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001616#ifdef HAVE_JUMP_LABEL
1617 if (in_interrupt()) {
1618 atomic_inc(&netstamp_needed_deferred);
1619 return;
1620 }
1621#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001622 static_key_slow_dec(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001623}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001624EXPORT_SYMBOL(net_disable_timestamp);
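/*
 * Illustrative sketch, not part of dev.c: the enable/disable calls flip a
 * static key and must stay balanced.  A hypothetical feature that needs RX
 * timestamps only while it is active could bracket its lifetime like this;
 * example_feature_start()/example_feature_stop() are made up.
 */
static void example_feature_start(void)
{
	net_enable_timestamp();		/* one increment per active user */
}

static void example_feature_stop(void)
{
	net_disable_timestamp();	/* matching decrement (deferred in IRQ context) */
}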
Linus Torvalds1da177e2005-04-16 15:20:36 -07001625
Eric Dumazet3b098e22010-05-15 23:57:10 -07001626static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001627{
Eric Dumazet588f0332011-11-15 04:12:55 +00001628 skb->tstamp.tv64 = 0;
Ingo Molnarc5905af2012-02-24 08:31:31 +01001629 if (static_key_false(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001630 __net_timestamp(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001631}
1632
Eric Dumazet588f0332011-11-15 04:12:55 +00001633#define net_timestamp_check(COND, SKB) \
Ingo Molnarc5905af2012-02-24 08:31:31 +01001634 if (static_key_false(&netstamp_needed)) { \
Eric Dumazet588f0332011-11-15 04:12:55 +00001635 if ((COND) && !(SKB)->tstamp.tv64) \
1636 __net_timestamp(SKB); \
1637 } \
Eric Dumazet3b098e22010-05-15 23:57:10 -07001638
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001639static inline bool is_skb_forwardable(struct net_device *dev,
1640 struct sk_buff *skb)
1641{
1642 unsigned int len;
1643
1644 if (!(dev->flags & IFF_UP))
1645 return false;
1646
1647 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1648 if (skb->len <= len)
1649 return true;
1650
1651 /* if TSO is enabled, we don't care about the length as the packet
 1652	 * could be forwarded without being segmented beforehand
1653 */
1654 if (skb_is_gso(skb))
1655 return true;
1656
1657 return false;
1658}
1659
Arnd Bergmann44540962009-11-26 06:07:08 +00001660/**
1661 * dev_forward_skb - loopback an skb to another netif
1662 *
1663 * @dev: destination network device
1664 * @skb: buffer to forward
1665 *
1666 * return values:
1667 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001668 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001669 *
1670 * dev_forward_skb can be used for injecting an skb from the
1671 * start_xmit function of one device into the receive queue
1672 * of another device.
1673 *
1674 * The receiving device may be in another namespace, so
1675 * we have to clear all information in the skb that could
1676 * impact namespace isolation.
1677 */
1678int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1679{
Michael S. Tsirkin48c83012011-08-31 08:03:29 +00001680 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1681 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1682 atomic_long_inc(&dev->rx_dropped);
1683 kfree_skb(skb);
1684 return NET_RX_DROP;
1685 }
1686 }
1687
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001688 if (unlikely(!is_skb_forwardable(dev, skb))) {
Eric Dumazetcaf586e2010-09-30 21:06:55 +00001689 atomic_long_inc(&dev->rx_dropped);
Eric Dumazet6ec82562010-05-06 00:53:53 -07001690 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001691 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001692 }
Isaku Yamahata06a23fe2013-07-02 20:30:10 +09001693
Nicolas Dichtel8b27f272013-09-02 15:34:56 +02001694 skb_scrub_packet(skb, true);
Alexei Starovoitov81b9eab2013-11-12 14:39:13 -08001695 skb->protocol = eth_type_trans(skb, dev);
Isaku Yamahata06a23fe2013-07-02 20:30:10 +09001696
Arnd Bergmann44540962009-11-26 06:07:08 +00001697 return netif_rx(skb);
1698}
1699EXPORT_SYMBOL_GPL(dev_forward_skb);
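/*
 * Illustrative sketch, not part of dev.c: a veth-style pair driver using
 * dev_forward_skb() in its ndo_start_xmit to inject frames into the peer
 * device's receive path.  struct example_priv and example_xmit() are made
 * up; dev_forward_skb() always consumes the skb (it is freed on drop).
 */
struct example_priv {
	struct net_device *peer;
};

static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);
	unsigned int len = skb->len;	/* record before the skb is handed off */

	if (dev_forward_skb(priv->peer, skb) == NET_RX_SUCCESS) {
		dev->stats.tx_packets++;
		dev->stats.tx_bytes += len;
	} else {
		dev->stats.tx_dropped++;
	}
	return NETDEV_TX_OK;
}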
1700
Changli Gao71d9dec2010-12-15 19:57:25 +00001701static inline int deliver_skb(struct sk_buff *skb,
1702 struct packet_type *pt_prev,
1703 struct net_device *orig_dev)
1704{
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00001705 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1706 return -ENOMEM;
Changli Gao71d9dec2010-12-15 19:57:25 +00001707 atomic_inc(&skb->users);
1708 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1709}
1710
Eric Leblondc0de08d2012-08-16 22:02:58 +00001711static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1712{
Eric Leblonda3d744e2012-11-06 02:10:10 +00001713 if (!ptype->af_packet_priv || !skb->sk)
Eric Leblondc0de08d2012-08-16 22:02:58 +00001714 return false;
1715
1716 if (ptype->id_match)
1717 return ptype->id_match(ptype, skb->sk);
1718 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1719 return true;
1720
1721 return false;
1722}
1723
Linus Torvalds1da177e2005-04-16 15:20:36 -07001724/*
1725 * Support routine. Sends outgoing frames to any network
1726 * taps currently in use.
1727 */
1728
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001729static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001730{
1731 struct packet_type *ptype;
Changli Gao71d9dec2010-12-15 19:57:25 +00001732 struct sk_buff *skb2 = NULL;
1733 struct packet_type *pt_prev = NULL;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001734
Linus Torvalds1da177e2005-04-16 15:20:36 -07001735 rcu_read_lock();
1736 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1737 /* Never send packets back to the socket
1738 * they originated from - MvS (miquels@drinkel.ow.org)
1739 */
1740 if ((ptype->dev == dev || !ptype->dev) &&
Eric Leblondc0de08d2012-08-16 22:02:58 +00001741 (!skb_loop_sk(ptype, skb))) {
Changli Gao71d9dec2010-12-15 19:57:25 +00001742 if (pt_prev) {
1743 deliver_skb(skb2, pt_prev, skb->dev);
1744 pt_prev = ptype;
1745 continue;
1746 }
1747
1748 skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001749 if (!skb2)
1750 break;
1751
Eric Dumazet70978182010-12-20 21:22:51 +00001752 net_timestamp_set(skb2);
1753
Linus Torvalds1da177e2005-04-16 15:20:36 -07001754			/* skb->nh should be correctly
 1755			   set by the sender, so that the second statement is
1756 just protection against buggy protocols.
1757 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001758 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001759
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001760 if (skb_network_header(skb2) < skb2->data ||
Simon Hormanced14f62013-05-28 20:34:25 +00001761 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
Joe Perchese87cc472012-05-13 21:56:26 +00001762 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1763 ntohs(skb2->protocol),
1764 dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001765 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001766 }
1767
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001768 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001769 skb2->pkt_type = PACKET_OUTGOING;
Changli Gao71d9dec2010-12-15 19:57:25 +00001770 pt_prev = ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001771 }
1772 }
Changli Gao71d9dec2010-12-15 19:57:25 +00001773 if (pt_prev)
1774 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001775 rcu_read_unlock();
1776}
1777
Ben Hutchings2c530402012-07-10 10:55:09 +00001778/**
1779 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
John Fastabend4f57c082011-01-17 08:06:04 +00001780 * @dev: Network device
1781 * @txq: number of queues available
1782 *
 1783 * If real_num_tx_queues is changed the tc mappings may no longer be
 1784 * valid. To resolve this verify that each tc mapping remains valid and,
 1785 * if not, reset the mapping to TC0. With no priorities mapping to an
 1786 * offset/count pair it will no longer be used. In the worst case, if TC0
 1787 * itself is invalid, nothing can be done, so priority mappings are
 1788 * disabled entirely. It is expected that drivers will fix this mapping
 1789 * if they can before calling netif_set_real_num_tx_queues.
1790 */
Eric Dumazetbb134d22011-01-20 19:18:08 +00001791static void netif_setup_tc(struct net_device *dev, unsigned int txq)
John Fastabend4f57c082011-01-17 08:06:04 +00001792{
1793 int i;
1794 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1795
1796 /* If TC0 is invalidated disable TC mapping */
1797 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001798 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
John Fastabend4f57c082011-01-17 08:06:04 +00001799 dev->num_tc = 0;
1800 return;
1801 }
1802
1803 /* Invalidated prio to tc mappings set to TC0 */
1804 for (i = 1; i < TC_BITMASK + 1; i++) {
1805 int q = netdev_get_prio_tc_map(dev, i);
1806
1807 tc = &dev->tc_to_txq[q];
1808 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001809 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1810 i, q);
John Fastabend4f57c082011-01-17 08:06:04 +00001811 netdev_set_prio_tc_map(dev, i, 0);
1812 }
1813 }
1814}
1815
Alexander Duyck537c00d2013-01-10 08:57:02 +00001816#ifdef CONFIG_XPS
1817static DEFINE_MUTEX(xps_map_mutex);
1818#define xmap_dereference(P) \
1819 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1820
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001821static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1822 int cpu, u16 index)
1823{
1824 struct xps_map *map = NULL;
1825 int pos;
1826
1827 if (dev_maps)
1828 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1829
1830 for (pos = 0; map && pos < map->len; pos++) {
1831 if (map->queues[pos] == index) {
1832 if (map->len > 1) {
1833 map->queues[pos] = map->queues[--map->len];
1834 } else {
1835 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1836 kfree_rcu(map, rcu);
1837 map = NULL;
1838 }
1839 break;
1840 }
1841 }
1842
1843 return map;
1844}
1845
Alexander Duyck024e9672013-01-10 08:57:46 +00001846static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
Alexander Duyck537c00d2013-01-10 08:57:02 +00001847{
1848 struct xps_dev_maps *dev_maps;
Alexander Duyck024e9672013-01-10 08:57:46 +00001849 int cpu, i;
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001850 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001851
1852 mutex_lock(&xps_map_mutex);
1853 dev_maps = xmap_dereference(dev->xps_maps);
1854
1855 if (!dev_maps)
1856 goto out_no_maps;
1857
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001858 for_each_possible_cpu(cpu) {
Alexander Duyck024e9672013-01-10 08:57:46 +00001859 for (i = index; i < dev->num_tx_queues; i++) {
1860 if (!remove_xps_queue(dev_maps, cpu, i))
1861 break;
1862 }
1863 if (i == dev->num_tx_queues)
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001864 active = true;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001865 }
1866
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001867 if (!active) {
Alexander Duyck537c00d2013-01-10 08:57:02 +00001868 RCU_INIT_POINTER(dev->xps_maps, NULL);
1869 kfree_rcu(dev_maps, rcu);
1870 }
1871
Alexander Duyck024e9672013-01-10 08:57:46 +00001872 for (i = index; i < dev->num_tx_queues; i++)
1873 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1874 NUMA_NO_NODE);
1875
Alexander Duyck537c00d2013-01-10 08:57:02 +00001876out_no_maps:
1877 mutex_unlock(&xps_map_mutex);
1878}
1879
Alexander Duyck01c5f862013-01-10 08:57:35 +00001880static struct xps_map *expand_xps_map(struct xps_map *map,
1881 int cpu, u16 index)
1882{
1883 struct xps_map *new_map;
1884 int alloc_len = XPS_MIN_MAP_ALLOC;
1885 int i, pos;
1886
1887 for (pos = 0; map && pos < map->len; pos++) {
1888 if (map->queues[pos] != index)
1889 continue;
1890 return map;
1891 }
1892
1893 /* Need to add queue to this CPU's existing map */
1894 if (map) {
1895 if (pos < map->alloc_len)
1896 return map;
1897
1898 alloc_len = map->alloc_len * 2;
1899 }
1900
1901 /* Need to allocate new map to store queue on this CPU's map */
1902 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1903 cpu_to_node(cpu));
1904 if (!new_map)
1905 return NULL;
1906
1907 for (i = 0; i < pos; i++)
1908 new_map->queues[i] = map->queues[i];
1909 new_map->alloc_len = alloc_len;
1910 new_map->len = pos;
1911
1912 return new_map;
1913}
1914
Michael S. Tsirkin35735402013-10-02 09:14:06 +03001915int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1916 u16 index)
Alexander Duyck537c00d2013-01-10 08:57:02 +00001917{
Alexander Duyck01c5f862013-01-10 08:57:35 +00001918 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001919 struct xps_map *map, *new_map;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001920 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001921 int cpu, numa_node_id = -2;
1922 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001923
1924 mutex_lock(&xps_map_mutex);
1925
1926 dev_maps = xmap_dereference(dev->xps_maps);
1927
Alexander Duyck01c5f862013-01-10 08:57:35 +00001928 /* allocate memory for queue storage */
1929 for_each_online_cpu(cpu) {
1930 if (!cpumask_test_cpu(cpu, mask))
1931 continue;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001932
Alexander Duyck01c5f862013-01-10 08:57:35 +00001933 if (!new_dev_maps)
1934 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00001935 if (!new_dev_maps) {
1936 mutex_unlock(&xps_map_mutex);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001937 return -ENOMEM;
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00001938 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00001939
1940 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1941 NULL;
1942
1943 map = expand_xps_map(map, cpu, index);
1944 if (!map)
1945 goto error;
1946
1947 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1948 }
1949
1950 if (!new_dev_maps)
1951 goto out_no_new_maps;
1952
1953 for_each_possible_cpu(cpu) {
1954 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1955 /* add queue to CPU maps */
1956 int pos = 0;
1957
1958 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1959 while ((pos < map->len) && (map->queues[pos] != index))
1960 pos++;
1961
1962 if (pos == map->len)
1963 map->queues[map->len++] = index;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001964#ifdef CONFIG_NUMA
Alexander Duyck537c00d2013-01-10 08:57:02 +00001965 if (numa_node_id == -2)
1966 numa_node_id = cpu_to_node(cpu);
1967 else if (numa_node_id != cpu_to_node(cpu))
1968 numa_node_id = -1;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001969#endif
Alexander Duyck01c5f862013-01-10 08:57:35 +00001970 } else if (dev_maps) {
1971 /* fill in the new device map from the old device map */
1972 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1973 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
Alexander Duyck537c00d2013-01-10 08:57:02 +00001974 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00001975
Alexander Duyck537c00d2013-01-10 08:57:02 +00001976 }
1977
Alexander Duyck01c5f862013-01-10 08:57:35 +00001978 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1979
Alexander Duyck537c00d2013-01-10 08:57:02 +00001980 /* Cleanup old maps */
Alexander Duyck01c5f862013-01-10 08:57:35 +00001981 if (dev_maps) {
1982 for_each_possible_cpu(cpu) {
1983 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1984 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1985 if (map && map != new_map)
1986 kfree_rcu(map, rcu);
1987 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001988
Alexander Duyck537c00d2013-01-10 08:57:02 +00001989 kfree_rcu(dev_maps, rcu);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001990 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001991
Alexander Duyck01c5f862013-01-10 08:57:35 +00001992 dev_maps = new_dev_maps;
1993 active = true;
1994
1995out_no_new_maps:
1996 /* update Tx queue numa node */
Alexander Duyck537c00d2013-01-10 08:57:02 +00001997 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1998 (numa_node_id >= 0) ? numa_node_id :
1999 NUMA_NO_NODE);
2000
Alexander Duyck01c5f862013-01-10 08:57:35 +00002001 if (!dev_maps)
2002 goto out_no_maps;
2003
2004 /* removes queue from unused CPUs */
2005 for_each_possible_cpu(cpu) {
2006 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2007 continue;
2008
2009 if (remove_xps_queue(dev_maps, cpu, index))
2010 active = true;
2011 }
2012
2013 /* free map if not active */
2014 if (!active) {
2015 RCU_INIT_POINTER(dev->xps_maps, NULL);
2016 kfree_rcu(dev_maps, rcu);
2017 }
2018
2019out_no_maps:
Alexander Duyck537c00d2013-01-10 08:57:02 +00002020 mutex_unlock(&xps_map_mutex);
2021
2022 return 0;
2023error:
Alexander Duyck01c5f862013-01-10 08:57:35 +00002024 /* remove any maps that we added */
2025 for_each_possible_cpu(cpu) {
2026 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2027 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2028 NULL;
2029 if (new_map && new_map != map)
2030 kfree(new_map);
2031 }
2032
Alexander Duyck537c00d2013-01-10 08:57:02 +00002033 mutex_unlock(&xps_map_mutex);
2034
Alexander Duyck537c00d2013-01-10 08:57:02 +00002035 kfree(new_dev_maps);
2036 return -ENOMEM;
2037}
2038EXPORT_SYMBOL(netif_set_xps_queue);
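/*
 * Illustrative sketch, not part of dev.c: a multiqueue driver spreading its
 * TX queues across online CPUs with netif_set_xps_queue(), typically once
 * the real queue count is known (e.g. at open time).  Error handling and
 * CPU-hotplug races are ignored; example_setup_xps() is a made-up helper.
 */
static void example_setup_xps(struct net_device *dev)
{
	unsigned int cpu = cpumask_first(cpu_online_mask);
	u16 q;

	for (q = 0; q < dev->real_num_tx_queues; q++) {
		netif_set_xps_queue(dev, cpumask_of(cpu), q);
		cpu = cpumask_next(cpu, cpu_online_mask);
		if (cpu >= nr_cpu_ids)
			cpu = cpumask_first(cpu_online_mask);
	}
}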
2039
2040#endif
John Fastabendf0796d52010-07-01 13:21:57 +00002041/*
2042 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2043 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2044 */
Tom Herberte6484932010-10-18 18:04:39 +00002045int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
John Fastabendf0796d52010-07-01 13:21:57 +00002046{
Tom Herbert1d24eb42010-11-21 13:17:27 +00002047 int rc;
2048
Tom Herberte6484932010-10-18 18:04:39 +00002049 if (txq < 1 || txq > dev->num_tx_queues)
2050 return -EINVAL;
John Fastabendf0796d52010-07-01 13:21:57 +00002051
Ben Hutchings5c565802011-02-15 19:39:21 +00002052 if (dev->reg_state == NETREG_REGISTERED ||
2053 dev->reg_state == NETREG_UNREGISTERING) {
Tom Herberte6484932010-10-18 18:04:39 +00002054 ASSERT_RTNL();
2055
Tom Herbert1d24eb42010-11-21 13:17:27 +00002056 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2057 txq);
Tom Herbertbf264142010-11-26 08:36:09 +00002058 if (rc)
2059 return rc;
2060
John Fastabend4f57c082011-01-17 08:06:04 +00002061 if (dev->num_tc)
2062 netif_setup_tc(dev, txq);
2063
Alexander Duyck024e9672013-01-10 08:57:46 +00002064 if (txq < dev->real_num_tx_queues) {
Tom Herberte6484932010-10-18 18:04:39 +00002065 qdisc_reset_all_tx_gt(dev, txq);
Alexander Duyck024e9672013-01-10 08:57:46 +00002066#ifdef CONFIG_XPS
2067 netif_reset_xps_queues_gt(dev, txq);
2068#endif
2069 }
John Fastabendf0796d52010-07-01 13:21:57 +00002070 }
Tom Herberte6484932010-10-18 18:04:39 +00002071
2072 dev->real_num_tx_queues = txq;
2073 return 0;
John Fastabendf0796d52010-07-01 13:21:57 +00002074}
2075EXPORT_SYMBOL(netif_set_real_num_tx_queues);
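/*
 * Illustrative sketch, not part of dev.c: a driver that registered with its
 * maximum queue count shrinking the number of active TX queues at runtime,
 * e.g. after discovering fewer usable hardware rings.  Must hold the RTNL
 * once the device is registered; example_shrink_tx() is a made-up helper.
 */
static int example_shrink_tx(struct net_device *dev, unsigned int usable)
{
	int err;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(dev, usable);
	rtnl_unlock();

	return err;
}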
Denis Vlasenko56079432006-03-29 15:57:29 -08002076
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002077#ifdef CONFIG_RPS
2078/**
2079 * netif_set_real_num_rx_queues - set actual number of RX queues used
2080 * @dev: Network device
2081 * @rxq: Actual number of RX queues
2082 *
2083 * This must be called either with the rtnl_lock held or before
2084 * registration of the net device. Returns 0 on success, or a
Ben Hutchings4e7f7952010-10-08 10:33:39 -07002085 * negative error code. If called before registration, it always
2086 * succeeds.
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002087 */
2088int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2089{
2090 int rc;
2091
Tom Herbertbd25fa72010-10-18 18:00:16 +00002092 if (rxq < 1 || rxq > dev->num_rx_queues)
2093 return -EINVAL;
2094
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002095 if (dev->reg_state == NETREG_REGISTERED) {
2096 ASSERT_RTNL();
2097
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002098 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2099 rxq);
2100 if (rc)
2101 return rc;
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002102 }
2103
2104 dev->real_num_rx_queues = rxq;
2105 return 0;
2106}
2107EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2108#endif
2109
Ben Hutchings2c530402012-07-10 10:55:09 +00002110/**
2111 * netif_get_num_default_rss_queues - default number of RSS queues
Yuval Mintz16917b82012-07-01 03:18:50 +00002112 *
 2113 * This routine returns the default upper limit on the number of RSS
 2114 * queues that multiqueue devices should use.
2115 */
Ben Hutchingsa55b1382012-07-10 10:54:38 +00002116int netif_get_num_default_rss_queues(void)
Yuval Mintz16917b82012-07-01 03:18:50 +00002117{
2118 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2119}
2120EXPORT_SYMBOL(netif_get_num_default_rss_queues);
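/*
 * Illustrative sketch, not part of dev.c: capping the queue count a driver
 * asks for at allocation time with netif_get_num_default_rss_queues().
 * EXAMPLE_MAX_QUEUES and example_alloc_netdev() are made up; assumes
 * <linux/etherdevice.h> for alloc_etherdev_mqs().
 */
#define EXAMPLE_MAX_QUEUES 16

static struct net_device *example_alloc_netdev(unsigned int priv_size)
{
	unsigned int n = min_t(unsigned int, EXAMPLE_MAX_QUEUES,
			       netif_get_num_default_rss_queues());

	return alloc_etherdev_mqs(priv_size, n, n);
}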
2121
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002122static inline void __netif_reschedule(struct Qdisc *q)
2123{
2124 struct softnet_data *sd;
2125 unsigned long flags;
2126
2127 local_irq_save(flags);
2128 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00002129 q->next_sched = NULL;
2130 *sd->output_queue_tailp = q;
2131 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002132 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2133 local_irq_restore(flags);
2134}
2135
David S. Miller37437bb2008-07-16 02:15:04 -07002136void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08002137{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002138 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2139 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08002140}
2141EXPORT_SYMBOL(__netif_schedule);
2142
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002143void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08002144{
David S. Miller3578b0c2010-08-03 00:24:04 -07002145 if (atomic_dec_and_test(&skb->users)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002146 struct softnet_data *sd;
2147 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08002148
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002149 local_irq_save(flags);
2150 sd = &__get_cpu_var(softnet_data);
2151 skb->next = sd->completion_queue;
2152 sd->completion_queue = skb;
2153 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2154 local_irq_restore(flags);
2155 }
Denis Vlasenko56079432006-03-29 15:57:29 -08002156}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002157EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08002158
2159void dev_kfree_skb_any(struct sk_buff *skb)
2160{
2161 if (in_irq() || irqs_disabled())
2162 dev_kfree_skb_irq(skb);
2163 else
2164 dev_kfree_skb(skb);
2165}
2166EXPORT_SYMBOL(dev_kfree_skb_any);
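/*
 * Illustrative sketch, not part of dev.c: a TX completion handler that may
 * run either from the device's hard IRQ or from process context during
 * teardown, so it frees with dev_kfree_skb_any() rather than dev_kfree_skb().
 * example_tx_complete() is a made-up function.
 */
static void example_tx_complete(struct sk_buff *skb)
{
	/* Picks dev_kfree_skb_irq() or dev_kfree_skb() based on context. */
	dev_kfree_skb_any(skb);
}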
2167
2168
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002169/**
2170 * netif_device_detach - mark device as removed
2171 * @dev: network device
2172 *
 2173 * Mark the device as removed from the system and therefore no longer available.
2174 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002175void netif_device_detach(struct net_device *dev)
2176{
2177 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2178 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002179 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002180 }
2181}
2182EXPORT_SYMBOL(netif_device_detach);
2183
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002184/**
2185 * netif_device_attach - mark device as attached
2186 * @dev: network device
2187 *
 2188 * Mark the device as attached to the system and restart it if needed.
2189 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002190void netif_device_attach(struct net_device *dev)
2191{
2192 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2193 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002194 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002195 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002196 }
2197}
2198EXPORT_SYMBOL(netif_device_attach);
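/*
 * Illustrative sketch, not part of dev.c: suspend/resume hooks of a
 * hypothetical NIC driver using netif_device_detach()/netif_device_attach()
 * so the stack stops queueing packets while the hardware is powered down.
 * example_hw_suspend()/example_hw_resume() are made-up stubs.
 */
static void example_hw_suspend(struct net_device *dev) { /* power the NIC down */ }
static void example_hw_resume(struct net_device *dev)  { /* power the NIC up */ }

static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if running */
	example_hw_suspend(dev);
	return 0;
}

static int example_resume(struct net_device *dev)
{
	example_hw_resume(dev);
	netif_device_attach(dev);	/* restarts queues and the watchdog */
	return 0;
}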
2199
Ben Hutchings36c92472012-01-17 07:57:56 +00002200static void skb_warn_bad_offload(const struct sk_buff *skb)
2201{
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002202 static const netdev_features_t null_features = 0;
Ben Hutchings36c92472012-01-17 07:57:56 +00002203 struct net_device *dev = skb->dev;
2204 const char *driver = "";
2205
Ben Greearc846ad92013-04-19 10:45:52 +00002206 if (!net_ratelimit())
2207 return;
2208
Ben Hutchings36c92472012-01-17 07:57:56 +00002209 if (dev && dev->dev.parent)
2210 driver = dev_driver_string(dev->dev.parent);
2211
2212 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2213 "gso_type=%d ip_summed=%d\n",
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002214 driver, dev ? &dev->features : &null_features,
2215 skb->sk ? &skb->sk->sk_route_caps : &null_features,
Ben Hutchings36c92472012-01-17 07:57:56 +00002216 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2217 skb_shinfo(skb)->gso_type, skb->ip_summed);
2218}
2219
Linus Torvalds1da177e2005-04-16 15:20:36 -07002220/*
2221 * Invalidate hardware checksum when packet is to be mangled, and
2222 * complete checksum manually on outgoing path.
2223 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07002224int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002225{
Al Virod3bc23e2006-11-14 21:24:49 -08002226 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07002227 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228
Patrick McHardy84fa7932006-08-29 16:44:56 -07002229 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07002230 goto out_set_summed;
2231
2232 if (unlikely(skb_shinfo(skb)->gso_size)) {
Ben Hutchings36c92472012-01-17 07:57:56 +00002233 skb_warn_bad_offload(skb);
2234 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002235 }
2236
Eric Dumazetcef401d2013-01-25 20:34:37 +00002237 /* Before computing a checksum, we should make sure no frag could
2238 * be modified by an external entity : checksum could be wrong.
2239 */
2240 if (skb_has_shared_frag(skb)) {
2241 ret = __skb_linearize(skb);
2242 if (ret)
2243 goto out;
2244 }
2245
Michał Mirosław55508d62010-12-14 15:24:08 +00002246 offset = skb_checksum_start_offset(skb);
Herbert Xua0308472007-10-15 01:47:15 -07002247 BUG_ON(offset >= skb_headlen(skb));
2248 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2249
2250 offset += skb->csum_offset;
2251 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2252
2253 if (skb_cloned(skb) &&
2254 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002255 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2256 if (ret)
2257 goto out;
2258 }
2259
Herbert Xua0308472007-10-15 01:47:15 -07002260 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07002261out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002262 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002263out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264 return ret;
2265}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002266EXPORT_SYMBOL(skb_checksum_help);
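/*
 * Illustrative sketch, not part of dev.c: a driver whose hardware cannot
 * checksum a particular frame falling back to software checksumming in its
 * xmit path.  example_tx_prepare_csum() is a made-up helper; the caller is
 * expected to drop the skb on error.
 */
static int example_tx_prepare_csum(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
		return -EIO;
	return 0;
}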
Linus Torvalds1da177e2005-04-16 15:20:36 -07002267
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002268__be16 skb_network_protocol(struct sk_buff *skb)
2269{
2270 __be16 type = skb->protocol;
David S. Miller61816592013-03-20 12:46:26 -04002271 int vlan_depth = ETH_HLEN;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002272
Pravin B Shelar19acc322013-05-07 20:41:07 +00002273 /* Tunnel gso handlers can set protocol to ethernet. */
2274 if (type == htons(ETH_P_TEB)) {
2275 struct ethhdr *eth;
2276
2277 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2278 return 0;
2279
2280 eth = (struct ethhdr *)skb_mac_header(skb);
2281 type = eth->h_proto;
2282 }
2283
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002284 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002285 struct vlan_hdr *vh;
2286
2287 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2288 return 0;
2289
2290 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2291 type = vh->h_vlan_encapsulated_proto;
2292 vlan_depth += VLAN_HLEN;
2293 }
2294
2295 return type;
2296}
2297
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002298/**
2299 * skb_mac_gso_segment - mac layer segmentation handler.
2300 * @skb: buffer to segment
2301 * @features: features for the output path (see dev->features)
2302 */
2303struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2304 netdev_features_t features)
2305{
2306 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2307 struct packet_offload *ptype;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002308 __be16 type = skb_network_protocol(skb);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002309
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002310 if (unlikely(!type))
2311 return ERR_PTR(-EINVAL);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002312
2313 __skb_pull(skb, skb->mac_len);
2314
2315 rcu_read_lock();
2316 list_for_each_entry_rcu(ptype, &offload_base, list) {
2317 if (ptype->type == type && ptype->callbacks.gso_segment) {
2318 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2319 int err;
2320
2321 err = ptype->callbacks.gso_send_check(skb);
2322 segs = ERR_PTR(err);
2323 if (err || skb_gso_ok(skb, features))
2324 break;
2325 __skb_push(skb, (skb->data -
2326 skb_network_header(skb)));
2327 }
2328 segs = ptype->callbacks.gso_segment(skb, features);
2329 break;
2330 }
2331 }
2332 rcu_read_unlock();
2333
2334 __skb_push(skb, skb->data - skb_mac_header(skb));
2335
2336 return segs;
2337}
2338EXPORT_SYMBOL(skb_mac_gso_segment);
2339
2340
Cong Wang12b00042013-02-05 16:36:38 +00002341/* openvswitch calls this on rx path, so we need a different check.
2342 */
2343static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2344{
2345 if (tx_path)
2346 return skb->ip_summed != CHECKSUM_PARTIAL;
2347 else
2348 return skb->ip_summed == CHECKSUM_NONE;
2349}
2350
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002351/**
Cong Wang12b00042013-02-05 16:36:38 +00002352 * __skb_gso_segment - Perform segmentation on skb.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002353 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07002354 * @features: features for the output path (see dev->features)
Cong Wang12b00042013-02-05 16:36:38 +00002355 * @tx_path: whether it is called in TX path
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002356 *
2357 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07002358 *
2359 * It may return NULL if the skb requires no segmentation. This is
2360 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002361 */
Cong Wang12b00042013-02-05 16:36:38 +00002362struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2363 netdev_features_t features, bool tx_path)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002364{
Cong Wang12b00042013-02-05 16:36:38 +00002365 if (unlikely(skb_needs_check(skb, tx_path))) {
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002366 int err;
2367
Ben Hutchings36c92472012-01-17 07:57:56 +00002368 skb_warn_bad_offload(skb);
Herbert Xu67fd1a72009-01-19 16:26:44 -08002369
Herbert Xua430a432006-07-08 13:34:56 -07002370 if (skb_header_cloned(skb) &&
2371 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2372 return ERR_PTR(err);
2373 }
2374
Pravin B Shelar68c33162013-02-14 14:02:41 +00002375 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
Eric Dumazet3347c962013-10-19 11:42:56 -07002376 SKB_GSO_CB(skb)->encap_level = 0;
2377
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002378 skb_reset_mac_header(skb);
2379 skb_reset_mac_len(skb);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002380
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002381 return skb_mac_gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002382}
Cong Wang12b00042013-02-05 16:36:38 +00002383EXPORT_SYMBOL(__skb_gso_segment);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002384
Herbert Xufb286bb2005-11-10 13:01:24 -08002385/* Take action when hardware reception checksum errors are detected. */
2386#ifdef CONFIG_BUG
2387void netdev_rx_csum_fault(struct net_device *dev)
2388{
2389 if (net_ratelimit()) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00002390 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08002391 dump_stack();
2392 }
2393}
2394EXPORT_SYMBOL(netdev_rx_csum_fault);
2395#endif
2396
Linus Torvalds1da177e2005-04-16 15:20:36 -07002397/* Actually, we should eliminate this check as soon as we know that:
 2398 * 1. An IOMMU is present and can map all of the memory.
2399 * 2. No high memory really exists on this machine.
2400 */
2401
Eric Dumazet9092c652010-04-02 13:34:49 -07002402static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002403{
Herbert Xu3d3a8532006-06-27 13:33:10 -07002404#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07002405 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002406 if (!(dev->features & NETIF_F_HIGHDMA)) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002407 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2408 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2409 if (PageHighMem(skb_frag_page(frag)))
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002410 return 1;
Ian Campbellea2ab692011-08-22 23:44:58 +00002411 }
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002412 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002413
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002414 if (PCI_DMA_BUS_IS_PHYS) {
2415 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002416
Eric Dumazet9092c652010-04-02 13:34:49 -07002417 if (!pdev)
2418 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002419 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002420 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2421 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002422 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2423 return 1;
2424 }
2425 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07002426#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002427 return 0;
2428}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002429
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002430struct dev_gso_cb {
2431 void (*destructor)(struct sk_buff *skb);
2432};
2433
2434#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2435
2436static void dev_gso_skb_destructor(struct sk_buff *skb)
2437{
2438 struct dev_gso_cb *cb;
2439
2440 do {
2441 struct sk_buff *nskb = skb->next;
2442
2443 skb->next = nskb->next;
2444 nskb->next = NULL;
2445 kfree_skb(nskb);
2446 } while (skb->next);
2447
2448 cb = DEV_GSO_CB(skb);
2449 if (cb->destructor)
2450 cb->destructor(skb);
2451}
2452
2453/**
2454 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2455 * @skb: buffer to segment
Jesse Gross91ecb632011-01-09 06:23:33 +00002456 * @features: device features as applicable to this skb
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002457 *
2458 * This function segments the given skb and stores the list of segments
2459 * in skb->next.
2460 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002461static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002462{
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002463 struct sk_buff *segs;
2464
Herbert Xu576a30e2006-06-27 13:22:38 -07002465 segs = skb_gso_segment(skb, features);
2466
2467 /* Verifying header integrity only. */
2468 if (!segs)
2469 return 0;
2470
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07002471 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002472 return PTR_ERR(segs);
2473
2474 skb->next = segs;
2475 DEV_GSO_CB(skb)->destructor = skb->destructor;
2476 skb->destructor = dev_gso_skb_destructor;
2477
2478 return 0;
2479}
2480
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002481static netdev_features_t harmonize_features(struct sk_buff *skb,
Alexander Duyckcdbaa0b2013-07-10 17:05:06 -07002482 netdev_features_t features)
Jesse Grossf01a5232011-01-09 06:23:31 +00002483{
Ed Cashinc0d680e2012-09-19 15:49:00 +00002484 if (skb->ip_summed != CHECKSUM_NONE &&
Alexander Duyckcdbaa0b2013-07-10 17:05:06 -07002485 !can_checksum_protocol(features, skb_network_protocol(skb))) {
Jesse Grossf01a5232011-01-09 06:23:31 +00002486 features &= ~NETIF_F_ALL_CSUM;
Jesse Grossf01a5232011-01-09 06:23:31 +00002487 } else if (illegal_highdma(skb->dev, skb)) {
2488 features &= ~NETIF_F_SG;
2489 }
2490
2491 return features;
2492}
2493
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002494netdev_features_t netif_skb_features(struct sk_buff *skb)
Jesse Gross58e998c2010-10-29 12:14:55 +00002495{
2496 __be16 protocol = skb->protocol;
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002497 netdev_features_t features = skb->dev->features;
Jesse Gross58e998c2010-10-29 12:14:55 +00002498
Ben Hutchings30b678d2012-07-30 15:57:00 +00002499 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2500 features &= ~NETIF_F_GSO_MASK;
2501
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002502 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
Jesse Gross58e998c2010-10-29 12:14:55 +00002503 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2504 protocol = veh->h_vlan_encapsulated_proto;
Jesse Grossf01a5232011-01-09 06:23:31 +00002505 } else if (!vlan_tx_tag_present(skb)) {
Alexander Duyckcdbaa0b2013-07-10 17:05:06 -07002506 return harmonize_features(skb, features);
Jesse Grossf01a5232011-01-09 06:23:31 +00002507 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002508
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002509 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2510 NETIF_F_HW_VLAN_STAG_TX);
Jesse Grossf01a5232011-01-09 06:23:31 +00002511
Alexander Duyckcdbaa0b2013-07-10 17:05:06 -07002512 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
Jesse Grossf01a5232011-01-09 06:23:31 +00002513 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002514 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2515 NETIF_F_HW_VLAN_STAG_TX;
Alexander Duyckcdbaa0b2013-07-10 17:05:06 -07002516
2517 return harmonize_features(skb, features);
Jesse Gross58e998c2010-10-29 12:14:55 +00002518}
Jesse Grossf01a5232011-01-09 06:23:31 +00002519EXPORT_SYMBOL(netif_skb_features);
Jesse Gross58e998c2010-10-29 12:14:55 +00002520
John Fastabend6afff0c2010-06-16 14:18:12 +00002521/*
2522 * Returns true if either:
2523 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
Rami Rosend1a53df2012-08-27 23:39:24 +00002524 * 2. skb is fragmented and the device does not support SG.
John Fastabend6afff0c2010-06-16 14:18:12 +00002525 */
2526static inline int skb_needs_linearize(struct sk_buff *skb,
Patrick McHardy6708c9e2013-05-01 22:36:49 +00002527 netdev_features_t features)
John Fastabend6afff0c2010-06-16 14:18:12 +00002528{
Jesse Gross02932ce2011-01-09 06:23:34 +00002529 return skb_is_nonlinear(skb) &&
2530 ((skb_has_frag_list(skb) &&
2531 !(features & NETIF_F_FRAGLIST)) ||
Jesse Grosse1e78db2010-10-29 12:14:53 +00002532 (skb_shinfo(skb)->nr_frags &&
Jesse Gross02932ce2011-01-09 06:23:34 +00002533 !(features & NETIF_F_SG)));
John Fastabend6afff0c2010-06-16 14:18:12 +00002534}
2535
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002536int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
John Fastabenda6cc0cf2013-11-06 09:54:46 -08002537 struct netdev_queue *txq, void *accel_priv)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002538{
Stephen Hemminger00829822008-11-20 20:14:53 -08002539 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00002540 int rc = NETDEV_TX_OK;
Koki Sanagiec764bf2011-05-30 21:48:34 +00002541 unsigned int skb_len;
Stephen Hemminger00829822008-11-20 20:14:53 -08002542
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002543 if (likely(!skb->next)) {
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002544 netdev_features_t features;
Jesse Grossfc741212011-01-09 06:23:32 +00002545
Eric Dumazet93f154b2009-05-18 22:19:19 -07002546 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03002547		 * If the device doesn't need skb->dst, release it right now while
Eric Dumazet93f154b2009-05-18 22:19:19 -07002548		 * it's still hot in this CPU's cache
2549 */
Eric Dumazetadf30902009-06-02 05:19:30 +00002550 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2551 skb_dst_drop(skb);
2552
Jesse Grossfc741212011-01-09 06:23:32 +00002553 features = netif_skb_features(skb);
2554
Jesse Gross7b9c6092010-10-20 13:56:04 +00002555 if (vlan_tx_tag_present(skb) &&
Patrick McHardy86a9bad2013-04-19 02:04:30 +00002556 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2557 skb = __vlan_put_tag(skb, skb->vlan_proto,
2558 vlan_tx_tag_get(skb));
Jesse Gross7b9c6092010-10-20 13:56:04 +00002559 if (unlikely(!skb))
2560 goto out;
2561
2562 skb->vlan_tci = 0;
2563 }
2564
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002565		/* If this is an encapsulation offload request, verify that we are
2566		 * testing hardware encapsulation features instead of the standard
2567		 * features for the netdev
2568 */
2569 if (skb->encapsulation)
2570 features &= dev->hw_enc_features;
2571
Jesse Grossfc741212011-01-09 06:23:32 +00002572 if (netif_needs_gso(skb, features)) {
Jesse Gross91ecb632011-01-09 06:23:33 +00002573 if (unlikely(dev_gso_segment(skb, features)))
David S. Miller9ccb8972010-04-22 01:02:07 -07002574 goto out_kfree_skb;
2575 if (skb->next)
2576 goto gso;
John Fastabend6afff0c2010-06-16 14:18:12 +00002577 } else {
Jesse Gross02932ce2011-01-09 06:23:34 +00002578 if (skb_needs_linearize(skb, features) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002579 __skb_linearize(skb))
2580 goto out_kfree_skb;
2581
2582 /* If packet is not checksummed and device does not
2583 * support checksumming for this protocol, complete
2584 * checksumming here.
2585 */
2586 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002587 if (skb->encapsulation)
2588 skb_set_inner_transport_header(skb,
2589 skb_checksum_start_offset(skb));
2590 else
2591 skb_set_transport_header(skb,
2592 skb_checksum_start_offset(skb));
Jesse Gross03634662011-01-09 06:23:35 +00002593 if (!(features & NETIF_F_ALL_CSUM) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002594 skb_checksum_help(skb))
2595 goto out_kfree_skb;
2596 }
David S. Miller9ccb8972010-04-22 01:02:07 -07002597 }
2598
Eric Dumazetb40863c2012-09-18 20:44:49 +00002599 if (!list_empty(&ptype_all))
2600 dev_queue_xmit_nit(skb, dev);
2601
Koki Sanagiec764bf2011-05-30 21:48:34 +00002602 skb_len = skb->len;
John Fastabenda6cc0cf2013-11-06 09:54:46 -08002603 if (accel_priv)
2604 rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv);
2605 else
2606 rc = ops->ndo_start_xmit(skb, dev);
2607
Koki Sanagiec764bf2011-05-30 21:48:34 +00002608 trace_net_dev_xmit(skb, rc, dev, skb_len);
John Fastabenda6cc0cf2013-11-06 09:54:46 -08002609 if (rc == NETDEV_TX_OK && txq)
Eric Dumazet08baf562009-05-25 22:58:01 -07002610 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00002611 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002612 }
2613
Herbert Xu576a30e2006-06-27 13:22:38 -07002614gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002615 do {
2616 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002617
2618 skb->next = nskb->next;
2619 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00002620
Eric Dumazetb40863c2012-09-18 20:44:49 +00002621 if (!list_empty(&ptype_all))
2622 dev_queue_xmit_nit(nskb, dev);
2623
Koki Sanagiec764bf2011-05-30 21:48:34 +00002624 skb_len = nskb->len;
John Fastabenda6cc0cf2013-11-06 09:54:46 -08002625 if (accel_priv)
2626 rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv);
2627 else
2628 rc = ops->ndo_start_xmit(nskb, dev);
Koki Sanagiec764bf2011-05-30 21:48:34 +00002629 trace_net_dev_xmit(nskb, rc, dev, skb_len);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002630 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002631 if (rc & ~NETDEV_TX_MASK)
2632 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07002633 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002634 skb->next = nskb;
2635 return rc;
2636 }
Eric Dumazet08baf562009-05-25 22:58:01 -07002637 txq_trans_update(txq);
Tom Herbert734664982011-11-28 16:32:44 +00002638 if (unlikely(netif_xmit_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07002639 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002640 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002641
Patrick McHardy572a9d72009-11-10 06:14:14 +00002642out_kfree_gso_skb:
Sridhar Samudrala0c772152013-04-29 13:02:42 +00002643 if (likely(skb->next == NULL)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002644 skb->destructor = DEV_GSO_CB(skb)->destructor;
Sridhar Samudrala0c772152013-04-29 13:02:42 +00002645 consume_skb(skb);
2646 return rc;
2647 }
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002648out_kfree_skb:
2649 kfree_skb(skb);
Jesse Gross7b9c6092010-10-20 13:56:04 +00002650out:
Patrick McHardy572a9d72009-11-10 06:14:14 +00002651 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002652}
John Fastabenda6cc0cf2013-11-06 09:54:46 -08002653EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002654
Eric Dumazet1def9232013-01-10 12:36:42 +00002655static void qdisc_pkt_len_init(struct sk_buff *skb)
2656{
2657 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2658
2659 qdisc_skb_cb(skb)->pkt_len = skb->len;
2660
2661	/* To get a more precise estimate of the bytes sent on the wire,
2662	 * we add the header size of all segments to pkt_len
2663 */
2664 if (shinfo->gso_size) {
Eric Dumazet757b8b12013-01-15 21:14:21 -08002665 unsigned int hdr_len;
Jason Wang15e5a032013-03-25 20:19:59 +00002666 u16 gso_segs = shinfo->gso_segs;
Eric Dumazet1def9232013-01-10 12:36:42 +00002667
Eric Dumazet757b8b12013-01-15 21:14:21 -08002668 /* mac layer + network layer */
2669 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2670
2671 /* + transport layer */
Eric Dumazet1def9232013-01-10 12:36:42 +00002672 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2673 hdr_len += tcp_hdrlen(skb);
2674 else
2675 hdr_len += sizeof(struct udphdr);
Jason Wang15e5a032013-03-25 20:19:59 +00002676
2677 if (shinfo->gso_type & SKB_GSO_DODGY)
2678 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2679 shinfo->gso_size);
2680
2681 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
Eric Dumazet1def9232013-01-10 12:36:42 +00002682 }
2683}
2684
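/* Editor's illustrative example (not part of the original file): a worked
 * instance of the estimate above, using hypothetical numbers. Take a TCP GSO
 * skb with skb->len = 4410, gso_size = 1448 and hdr_len = 66 (14 bytes
 * Ethernet + 20 bytes IPv4 + 32 bytes TCP with timestamps), so gso_segs = 3.
 * qdisc_pkt_len_init() then computes
 *
 *	pkt_len = 4410 + (3 - 1) * 66 = 4542
 *
 * which matches the three 1514-byte frames (66 + 1448 each) that actually
 * reach the wire, rather than the 4410 bytes of the unsegmented skb.
 */
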
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002685static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2686 struct net_device *dev,
2687 struct netdev_queue *txq)
2688{
2689 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002690 bool contended;
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002691 int rc;
2692
Eric Dumazet1def9232013-01-10 12:36:42 +00002693 qdisc_pkt_len_init(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002694 qdisc_calculate_pkt_len(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002695 /*
2696 * Heuristic to force contended enqueues to serialize on a
2697	 * separate lock before trying to get the qdisc main lock.
2698	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2699 * and dequeue packets faster.
2700 */
Eric Dumazeta2da5702011-01-20 03:48:19 +00002701 contended = qdisc_is_running(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002702 if (unlikely(contended))
2703 spin_lock(&q->busylock);
2704
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002705 spin_lock(root_lock);
2706 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2707 kfree_skb(skb);
2708 rc = NET_XMIT_DROP;
2709 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07002710 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002711 /*
2712 * This is a work-conserving queue; there are no old skbs
2713 * waiting to be sent out; and the qdisc is not running -
2714 * xmit the skb directly.
2715 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002716 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2717 skb_dst_force(skb);
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002718
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002719 qdisc_bstats_update(q, skb);
2720
Eric Dumazet79640a42010-06-02 05:09:29 -07002721 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2722 if (unlikely(contended)) {
2723 spin_unlock(&q->busylock);
2724 contended = false;
2725 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002726 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002727 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07002728 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002729
2730 rc = NET_XMIT_SUCCESS;
2731 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002732 skb_dst_force(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002733 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
Eric Dumazet79640a42010-06-02 05:09:29 -07002734 if (qdisc_run_begin(q)) {
2735 if (unlikely(contended)) {
2736 spin_unlock(&q->busylock);
2737 contended = false;
2738 }
2739 __qdisc_run(q);
2740 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002741 }
2742 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07002743 if (unlikely(contended))
2744 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002745 return rc;
2746}
2747
Neil Horman5bc14212011-11-22 05:10:51 +00002748#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2749static void skb_update_prio(struct sk_buff *skb)
2750{
Igor Maravic6977a792011-11-25 07:44:54 +00002751 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
Neil Horman5bc14212011-11-22 05:10:51 +00002752
Eric Dumazet91c68ce2012-07-08 21:45:10 +00002753 if (!skb->priority && skb->sk && map) {
2754 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2755
2756 if (prioidx < map->priomap_len)
2757 skb->priority = map->priomap[prioidx];
2758 }
Neil Horman5bc14212011-11-22 05:10:51 +00002759}
2760#else
2761#define skb_update_prio(skb)
2762#endif
2763
Eric Dumazet745e20f2010-09-29 13:23:09 -07002764static DEFINE_PER_CPU(int, xmit_recursion);
David S. Miller11a766c2010-10-25 12:51:55 -07002765#define RECURSION_LIMIT 10
Eric Dumazet745e20f2010-09-29 13:23:09 -07002766
Dave Jonesd29f7492008-07-22 14:09:06 -07002767/**
Michel Machado95603e22012-06-12 10:16:35 +00002768 * dev_loopback_xmit - loop back @skb
2769 * @skb: buffer to transmit
2770 */
2771int dev_loopback_xmit(struct sk_buff *skb)
2772{
2773 skb_reset_mac_header(skb);
2774 __skb_pull(skb, skb_network_offset(skb));
2775 skb->pkt_type = PACKET_LOOPBACK;
2776 skb->ip_summed = CHECKSUM_UNNECESSARY;
2777 WARN_ON(!skb_dst(skb));
2778 skb_dst_force(skb);
2779 netif_rx_ni(skb);
2780 return 0;
2781}
2782EXPORT_SYMBOL(dev_loopback_xmit);
2783
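/* Editor's illustrative sketch (not part of the original file): one way a
 * caller that wants a local copy of an outgoing packet might use
 * dev_loopback_xmit(). The function below is hypothetical; in the tree the
 * real users are the IPv4/IPv6 output paths for looped-back multicast.
 */
static int example_xmit_with_local_copy(struct sk_buff *skb)
{
	/* dev_loopback_xmit() expects skb->dev and a dst to be set; the
	 * clone inherits both from the original skb.
	 */
	struct sk_buff *copy = skb_clone(skb, GFP_ATOMIC);

	if (copy)
		dev_loopback_xmit(copy);	/* feed the copy back via netif_rx_ni() */

	return dev_queue_xmit(skb);		/* the original continues towards the wire */
}
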
2784/**
Dave Jonesd29f7492008-07-22 14:09:06 -07002785 * dev_queue_xmit - transmit a buffer
2786 * @skb: buffer to transmit
2787 *
2788 * Queue a buffer for transmission to a network device. The caller must
2789 * have set the device and priority and built the buffer before calling
2790 * this function. The function can be called from an interrupt.
2791 *
2792 * A negative errno code is returned on a failure. A success does not
2793 * guarantee the frame will be transmitted as it may be dropped due
2794 * to congestion or traffic shaping.
2795 *
2796 * -----------------------------------------------------------------------------------
2797 * I notice this method can also return errors from the queue disciplines,
2798 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2799 * be positive.
2800 *
2801 * Regardless of the return value, the skb is consumed, so it is currently
2802 * difficult to retry a send to this method. (You can bump the ref count
2803 * before sending to hold a reference for retry if you are careful.)
2804 *
2805 * When calling this method, interrupts MUST be enabled. This is because
2806 * the BH enable code must have IRQs enabled so that it will not deadlock.
2807 * --BLG
2808 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002809int dev_queue_xmit(struct sk_buff *skb)
2810{
2811 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002812 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002813 struct Qdisc *q;
2814 int rc = -ENOMEM;
2815
Eric Dumazet6d1ccff2013-02-05 20:22:20 +00002816 skb_reset_mac_header(skb);
2817
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002818 /* Disable soft irqs for various locks below. Also
2819 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002820 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002821 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002822
Neil Horman5bc14212011-11-22 05:10:51 +00002823 skb_update_prio(skb);
2824
Amerigo Wang8c4c49d2012-09-17 20:16:31 +00002825 txq = netdev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002826 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002827
Linus Torvalds1da177e2005-04-16 15:20:36 -07002828#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002829 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002830#endif
Koki Sanagicf66ba52010-08-23 18:45:02 +09002831 trace_net_dev_queue(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002832 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002833 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002834 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002835 }
2836
2837 /* The device has no queue. Common case for software devices:
2838	   loopback and all sorts of tunnels...
2839
Herbert Xu932ff272006-06-09 12:20:56 -07002840 Really, it is unlikely that netif_tx_lock protection is necessary
2841 here. (f.e. loopback and IP tunnels are clean ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002842 counters.)
2843	   However, it is possible that they rely on the protection
2844	   provided by us here.
2845
2846	   Check this and shoot the lock: it is not prone to deadlocks.
2847	   Or shoot the noqueue qdisc; that is even simpler 8)
2848 */
2849 if (dev->flags & IFF_UP) {
2850 int cpu = smp_processor_id(); /* ok because BHs are off */
2851
David S. Millerc773e842008-07-08 23:13:53 -07002852 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002853
Eric Dumazet745e20f2010-09-29 13:23:09 -07002854 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2855 goto recursion_alert;
2856
David S. Millerc773e842008-07-08 23:13:53 -07002857 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002858
Tom Herbert734664982011-11-28 16:32:44 +00002859 if (!netif_xmit_stopped(txq)) {
Eric Dumazet745e20f2010-09-29 13:23:09 -07002860 __this_cpu_inc(xmit_recursion);
John Fastabenda6cc0cf2013-11-06 09:54:46 -08002861 rc = dev_hard_start_xmit(skb, dev, txq, NULL);
Eric Dumazet745e20f2010-09-29 13:23:09 -07002862 __this_cpu_dec(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002863 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002864 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002865 goto out;
2866 }
2867 }
David S. Millerc773e842008-07-08 23:13:53 -07002868 HARD_TX_UNLOCK(dev, txq);
Joe Perchese87cc472012-05-13 21:56:26 +00002869 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2870 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002871 } else {
2872 /* Recursion is detected! It is possible,
Eric Dumazet745e20f2010-09-29 13:23:09 -07002873 * unfortunately
2874 */
2875recursion_alert:
Joe Perchese87cc472012-05-13 21:56:26 +00002876 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2877 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002878 }
2879 }
2880
2881 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002882 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002883
Linus Torvalds1da177e2005-04-16 15:20:36 -07002884 kfree_skb(skb);
2885 return rc;
2886out:
Herbert Xud4828d82006-06-22 02:28:18 -07002887 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002888 return rc;
2889}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002890EXPORT_SYMBOL(dev_queue_xmit);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002891
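/* Editor's illustrative sketch (not part of the original file): a minimal
 * transmit of a pre-built Ethernet frame through dev_queue_xmit(), roughly as
 * a kernel module might do it. The function name is hypothetical; the caller
 * is assumed to hold a reference on @dev, to pass a complete Ethernet frame
 * (len >= ETH_HLEN) and to run with interrupts enabled, as required above.
 */
static int example_xmit_raw_frame(struct net_device *dev,
				  const void *frame, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), frame, len);

	skb->dev = dev;
	skb->protocol = ((struct ethhdr *)skb->data)->h_proto;
	skb_reset_mac_header(skb);
	skb_set_network_header(skb, ETH_HLEN);

	/* The skb is consumed whatever happens; the return value may be a
	 * negative errno or a positive NET_XMIT_* code from the qdisc.
	 */
	return dev_queue_xmit(skb);
}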
2892
2893/*=======================================================================
2894 Receiver routines
2895 =======================================================================*/
2896
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002897int netdev_max_backlog __read_mostly = 1000;
Eric Dumazetc9e6bc62012-09-27 19:29:05 +00002898EXPORT_SYMBOL(netdev_max_backlog);
2899
Eric Dumazet3b098e22010-05-15 23:57:10 -07002900int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002901int netdev_budget __read_mostly = 300;
2902int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002903
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002904/* Called with irq disabled */
2905static inline void ____napi_schedule(struct softnet_data *sd,
2906 struct napi_struct *napi)
2907{
2908 list_add_tail(&napi->poll_list, &sd->poll_list);
2909 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2910}
2911
Eric Dumazetdf334542010-03-24 19:13:54 +00002912#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002913
2914/* One global table that all flow-based protocols share. */
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002915struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002916EXPORT_SYMBOL(rps_sock_flow_table);
2917
Ingo Molnarc5905af2012-02-24 08:31:31 +01002918struct static_key rps_needed __read_mostly;
Eric Dumazetadc93002011-11-17 03:13:26 +00002919
Ben Hutchingsc4454772011-01-19 11:03:53 +00002920static struct rps_dev_flow *
2921set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2922 struct rps_dev_flow *rflow, u16 next_cpu)
2923{
Ben Hutchings09994d12011-10-03 04:42:46 +00002924 if (next_cpu != RPS_NO_CPU) {
Ben Hutchingsc4454772011-01-19 11:03:53 +00002925#ifdef CONFIG_RFS_ACCEL
2926 struct netdev_rx_queue *rxqueue;
2927 struct rps_dev_flow_table *flow_table;
2928 struct rps_dev_flow *old_rflow;
2929 u32 flow_id;
2930 u16 rxq_index;
2931 int rc;
2932
2933 /* Should we steer this flow to a different hardware queue? */
Ben Hutchings69a19ee2011-02-15 20:32:04 +00002934 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2935 !(dev->features & NETIF_F_NTUPLE))
Ben Hutchingsc4454772011-01-19 11:03:53 +00002936 goto out;
2937 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2938 if (rxq_index == skb_get_rx_queue(skb))
2939 goto out;
2940
2941 rxqueue = dev->_rx + rxq_index;
2942 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2943 if (!flow_table)
2944 goto out;
2945 flow_id = skb->rxhash & flow_table->mask;
2946 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2947 rxq_index, flow_id);
2948 if (rc < 0)
2949 goto out;
2950 old_rflow = rflow;
2951 rflow = &flow_table->flows[flow_id];
Ben Hutchingsc4454772011-01-19 11:03:53 +00002952 rflow->filter = rc;
2953 if (old_rflow->filter == rflow->filter)
2954 old_rflow->filter = RPS_NO_FILTER;
2955 out:
2956#endif
2957 rflow->last_qtail =
Ben Hutchings09994d12011-10-03 04:42:46 +00002958 per_cpu(softnet_data, next_cpu).input_queue_head;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002959 }
2960
Ben Hutchings09994d12011-10-03 04:42:46 +00002961 rflow->cpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002962 return rflow;
2963}
2964
Tom Herbert0a9627f2010-03-16 08:03:29 +00002965/*
2966 * get_rps_cpu is called from netif_receive_skb and returns the target
2967 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002968 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002969 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002970static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2971 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002972{
Tom Herbert0a9627f2010-03-16 08:03:29 +00002973 struct netdev_rx_queue *rxqueue;
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002974 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002975 struct rps_dev_flow_table *flow_table;
2976 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002977 int cpu = -1;
Tom Herbertfec5e652010-04-16 16:01:27 -07002978 u16 tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002979
Tom Herbert0a9627f2010-03-16 08:03:29 +00002980 if (skb_rx_queue_recorded(skb)) {
2981 u16 index = skb_get_rx_queue(skb);
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002982 if (unlikely(index >= dev->real_num_rx_queues)) {
2983 WARN_ONCE(dev->real_num_rx_queues > 1,
2984 "%s received packet on queue %u, but number "
2985 "of RX queues is %u\n",
2986 dev->name, index, dev->real_num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002987 goto done;
2988 }
2989 rxqueue = dev->_rx + index;
2990 } else
2991 rxqueue = dev->_rx;
2992
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002993 map = rcu_dereference(rxqueue->rps_map);
2994 if (map) {
Tom Herbert85875232011-01-31 16:23:42 -08002995 if (map->len == 1 &&
Eric Dumazet33d480c2011-08-11 19:30:52 +00002996 !rcu_access_pointer(rxqueue->rps_flow_table)) {
Changli Gao6febfca2010-09-03 23:12:37 +00002997 tcpu = map->cpus[0];
2998 if (cpu_online(tcpu))
2999 cpu = tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003000 goto done;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00003001 }
Eric Dumazet33d480c2011-08-11 19:30:52 +00003002 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00003003 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003004 }
3005
Changli Gao2d47b452010-08-17 19:00:56 +00003006 skb_reset_network_header(skb);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00003007 if (!skb_get_rxhash(skb))
Tom Herbert0a9627f2010-03-16 08:03:29 +00003008 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003009
Tom Herbertfec5e652010-04-16 16:01:27 -07003010 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3011 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3012 if (flow_table && sock_flow_table) {
3013 u16 next_cpu;
3014 struct rps_dev_flow *rflow;
3015
3016 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3017 tcpu = rflow->cpu;
3018
3019 next_cpu = sock_flow_table->ents[skb->rxhash &
3020 sock_flow_table->mask];
3021
3022 /*
3023 * If the desired CPU (where last recvmsg was done) is
3024 * different from current CPU (one in the rx-queue flow
3025 * table entry), switch if one of the following holds:
3026 * - Current CPU is unset (equal to RPS_NO_CPU).
3027 * - Current CPU is offline.
3028 * - The current CPU's queue tail has advanced beyond the
3029 * last packet that was enqueued using this table entry.
3030 * This guarantees that all previous packets for the flow
3031		 * have been dequeued, thus preserving in-order delivery.
3032 */
3033 if (unlikely(tcpu != next_cpu) &&
3034 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3035 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
Tom Herbertbaefa312012-11-16 09:04:15 +00003036 rflow->last_qtail)) >= 0)) {
3037 tcpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00003038 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
Tom Herbertbaefa312012-11-16 09:04:15 +00003039 }
Ben Hutchingsc4454772011-01-19 11:03:53 +00003040
Tom Herbertfec5e652010-04-16 16:01:27 -07003041 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3042 *rflowp = rflow;
3043 cpu = tcpu;
3044 goto done;
3045 }
3046 }
3047
Tom Herbert0a9627f2010-03-16 08:03:29 +00003048 if (map) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003049 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00003050
3051 if (cpu_online(tcpu)) {
3052 cpu = tcpu;
3053 goto done;
3054 }
3055 }
3056
3057done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00003058 return cpu;
3059}
3060
Ben Hutchingsc4454772011-01-19 11:03:53 +00003061#ifdef CONFIG_RFS_ACCEL
3062
3063/**
3064 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3065 * @dev: Device on which the filter was set
3066 * @rxq_index: RX queue index
3067 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3068 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3069 *
3070 * Drivers that implement ndo_rx_flow_steer() should periodically call
3071 * this function for each installed filter and remove the filters for
3072 * which it returns %true.
3073 */
3074bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3075 u32 flow_id, u16 filter_id)
3076{
3077 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3078 struct rps_dev_flow_table *flow_table;
3079 struct rps_dev_flow *rflow;
3080 bool expire = true;
3081 int cpu;
3082
3083 rcu_read_lock();
3084 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3085 if (flow_table && flow_id <= flow_table->mask) {
3086 rflow = &flow_table->flows[flow_id];
3087 cpu = ACCESS_ONCE(rflow->cpu);
3088 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3089 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3090 rflow->last_qtail) <
3091 (int)(10 * flow_table->mask)))
3092 expire = false;
3093 }
3094 rcu_read_unlock();
3095 return expire;
3096}
3097EXPORT_SYMBOL(rps_may_expire_flow);
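
/* Editor's illustrative sketch (not part of the original file): the kind of
 * periodic scan a driver implementing ndo_rx_flow_steer() might run over its
 * installed filters, as suggested by the kerneldoc above. The structure and
 * function names are hypothetical; real drivers keep this bookkeeping in
 * their own private state and remove the hardware filter themselves.
 */
struct example_rfs_filter {
	u16 rxq_index;	/* RX queue the flow was steered to */
	u32 flow_id;	/* flow ID passed to ndo_rx_flow_steer() */
	u16 filter_id;	/* ID returned by ndo_rx_flow_steer() */
	bool installed;
};

static void example_expire_rfs_filters(struct net_device *dev,
				       struct example_rfs_filter *filters,
				       unsigned int count)
{
	unsigned int i;

	for (i = 0; i < count; i++) {
		if (!filters[i].installed)
			continue;
		if (rps_may_expire_flow(dev, filters[i].rxq_index,
					filters[i].flow_id,
					filters[i].filter_id)) {
			/* remove the corresponding hardware filter here, then: */
			filters[i].installed = false;
		}
	}
}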
3098
3099#endif /* CONFIG_RFS_ACCEL */
3100
Tom Herbert0a9627f2010-03-16 08:03:29 +00003101/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003102static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003103{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003104 struct softnet_data *sd = data;
3105
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003106 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00003107 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003108}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003109
Tom Herbertfec5e652010-04-16 16:01:27 -07003110#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00003111
3112/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003113 * Check if this softnet_data structure belongs to another CPU.
3114 * If yes, queue it to our IPI list and return 1
3115 * If no, return 0
3116 */
3117static int rps_ipi_queued(struct softnet_data *sd)
3118{
3119#ifdef CONFIG_RPS
3120 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3121
3122 if (sd != mysd) {
3123 sd->rps_ipi_next = mysd->rps_ipi_list;
3124 mysd->rps_ipi_list = sd;
3125
3126 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3127 return 1;
3128 }
3129#endif /* CONFIG_RPS */
3130 return 0;
3131}
3132
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003133#ifdef CONFIG_NET_FLOW_LIMIT
3134int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3135#endif
3136
3137static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3138{
3139#ifdef CONFIG_NET_FLOW_LIMIT
3140 struct sd_flow_limit *fl;
3141 struct softnet_data *sd;
3142 unsigned int old_flow, new_flow;
3143
3144 if (qlen < (netdev_max_backlog >> 1))
3145 return false;
3146
3147 sd = &__get_cpu_var(softnet_data);
3148
3149 rcu_read_lock();
3150 fl = rcu_dereference(sd->flow_limit);
3151 if (fl) {
3152 new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3153 old_flow = fl->history[fl->history_head];
3154 fl->history[fl->history_head] = new_flow;
3155
3156 fl->history_head++;
3157 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3158
3159 if (likely(fl->buckets[old_flow]))
3160 fl->buckets[old_flow]--;
3161
3162 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3163 fl->count++;
3164 rcu_read_unlock();
3165 return true;
3166 }
3167 }
3168 rcu_read_unlock();
3169#endif
3170 return false;
3171}
3172
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003173/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00003174 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3175 * queue (may be a remote CPU queue).
3176 */
Tom Herbertfec5e652010-04-16 16:01:27 -07003177static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3178 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003179{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003180 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003181 unsigned long flags;
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003182 unsigned int qlen;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003183
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003184 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003185
3186 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003187
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003188 rps_lock(sd);
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003189 qlen = skb_queue_len(&sd->input_pkt_queue);
3190 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
Changli Gao6e7676c2010-04-27 15:07:33 -07003191 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00003192enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003193 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003194 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003195 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00003196 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003197 return NET_RX_SUCCESS;
3198 }
3199
Eric Dumazetebda37c22010-05-06 23:51:21 +00003200 /* Schedule NAPI for backlog device
3201		 * We can use a non-atomic operation since we own the queue lock
3202 */
3203 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003204 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003205 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003206 }
3207 goto enqueue;
3208 }
3209
Changli Gaodee42872010-05-02 05:42:16 +00003210 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003211 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003212
Tom Herbert0a9627f2010-03-16 08:03:29 +00003213 local_irq_restore(flags);
3214
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003215 atomic_long_inc(&skb->dev->rx_dropped);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003216 kfree_skb(skb);
3217 return NET_RX_DROP;
3218}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003219
Linus Torvalds1da177e2005-04-16 15:20:36 -07003220/**
3221 * netif_rx - post buffer to the network code
3222 * @skb: buffer to post
3223 *
3224 * This function receives a packet from a device driver and queues it for
3225 * the upper (protocol) levels to process. It always succeeds. The buffer
3226 * may be dropped during processing for congestion control or by the
3227 * protocol layers.
3228 *
3229 * return values:
3230 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003231 * NET_RX_DROP (packet was dropped)
3232 *
3233 */
3234
3235int netif_rx(struct sk_buff *skb)
3236{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003237 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003238
3239 /* if netpoll wants it, pretend we never saw it */
3240 if (netpoll_rx(skb))
3241 return NET_RX_DROP;
3242
Eric Dumazet588f0332011-11-15 04:12:55 +00003243 net_timestamp_check(netdev_tstamp_prequeue, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003244
Koki Sanagicf66ba52010-08-23 18:45:02 +09003245 trace_netif_rx(skb);
Eric Dumazetdf334542010-03-24 19:13:54 +00003246#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003247 if (static_key_false(&rps_needed)) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003248 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003249 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003250
Changli Gaocece1942010-08-07 20:35:43 -07003251 preempt_disable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003252 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003253
3254 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003255 if (cpu < 0)
3256 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07003257
3258 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3259
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003260 rcu_read_unlock();
Changli Gaocece1942010-08-07 20:35:43 -07003261 preempt_enable();
Eric Dumazetadc93002011-11-17 03:13:26 +00003262 } else
3263#endif
Tom Herbertfec5e652010-04-16 16:01:27 -07003264 {
3265 unsigned int qtail;
3266 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3267 put_cpu();
3268 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003269 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003270}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003271EXPORT_SYMBOL(netif_rx);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003272
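/* Editor's illustrative sketch (not part of the original file): how a simple
 * non-NAPI driver might feed a received frame into netif_rx() from its
 * interrupt handler. The function name and the way the frame data is obtained
 * are hypothetical driver details.
 */
static void example_legacy_rx(struct net_device *dev,
			      const void *data, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), data, len);

	/* eth_type_trans() sets skb->dev, skb->pkt_type and skb->protocol
	 * and pulls the Ethernet header.
	 */
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);
}
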
3273int netif_rx_ni(struct sk_buff *skb)
3274{
3275 int err;
3276
3277 preempt_disable();
3278 err = netif_rx(skb);
3279 if (local_softirq_pending())
3280 do_softirq();
3281 preempt_enable();
3282
3283 return err;
3284}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003285EXPORT_SYMBOL(netif_rx_ni);
3286
Linus Torvalds1da177e2005-04-16 15:20:36 -07003287static void net_tx_action(struct softirq_action *h)
3288{
3289 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3290
3291 if (sd->completion_queue) {
3292 struct sk_buff *clist;
3293
3294 local_irq_disable();
3295 clist = sd->completion_queue;
3296 sd->completion_queue = NULL;
3297 local_irq_enable();
3298
3299 while (clist) {
3300 struct sk_buff *skb = clist;
3301 clist = clist->next;
3302
Ilpo Järvinen547b7922008-07-25 21:43:18 -07003303 WARN_ON(atomic_read(&skb->users));
Koki Sanagi07dc22e2010-08-23 18:46:12 +09003304 trace_kfree_skb(skb, net_tx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003305 __kfree_skb(skb);
3306 }
3307 }
3308
3309 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07003310 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003311
3312 local_irq_disable();
3313 head = sd->output_queue;
3314 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00003315 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003316 local_irq_enable();
3317
3318 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07003319 struct Qdisc *q = head;
3320 spinlock_t *root_lock;
3321
Linus Torvalds1da177e2005-04-16 15:20:36 -07003322 head = head->next_sched;
3323
David S. Miller5fb66222008-08-02 20:02:43 -07003324 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07003325 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07003326 smp_mb__before_clear_bit();
3327 clear_bit(__QDISC_STATE_SCHED,
3328 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07003329 qdisc_run(q);
3330 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003331 } else {
David S. Miller195648b2008-08-19 04:00:36 -07003332 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003333 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07003334 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003335 } else {
3336 smp_mb__before_clear_bit();
3337 clear_bit(__QDISC_STATE_SCHED,
3338 &q->state);
3339 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003340 }
3341 }
3342 }
3343}
3344
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003345#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3346 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
Michał Mirosławda678292009-06-05 05:35:28 +00003347/* This hook is defined here for ATM LANE */
3348int (*br_fdb_test_addr_hook)(struct net_device *dev,
3349 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07003350EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00003351#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003352
Linus Torvalds1da177e2005-04-16 15:20:36 -07003353#ifdef CONFIG_NET_CLS_ACT
3354/* TODO: Maybe we should just force sch_ingress to be compiled in
3355 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3356 * instructions (a compare and two extra stores) when we don't have
3357 * it on but do have CONFIG_NET_CLS_ACT.
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003358 * NOTE: This doesn't stop any functionality; if you don't have
3359 * the ingress scheduler, you just can't add policies on ingress.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003360 *
3361 */
Eric Dumazet24824a02010-10-02 06:11:55 +00003362static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003363{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003364 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003365 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07003366 int result = TC_ACT_OK;
3367 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003368
Stephen Hemmingerde384832010-08-01 00:33:23 -07003369 if (unlikely(MAX_RED_LOOP < ttl++)) {
Joe Perchese87cc472012-05-13 21:56:26 +00003370 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3371 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07003372 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003373 }
3374
Herbert Xuf697c3e2007-10-14 00:38:47 -07003375 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3376 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3377
David S. Miller83874002008-07-17 00:53:03 -07003378 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07003379 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07003380 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07003381 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3382 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07003383 spin_unlock(qdisc_lock(q));
3384 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07003385
Linus Torvalds1da177e2005-04-16 15:20:36 -07003386 return result;
3387}
Herbert Xuf697c3e2007-10-14 00:38:47 -07003388
3389static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3390 struct packet_type **pt_prev,
3391 int *ret, struct net_device *orig_dev)
3392{
Eric Dumazet24824a02010-10-02 06:11:55 +00003393 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3394
3395 if (!rxq || rxq->qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07003396 goto out;
3397
3398 if (*pt_prev) {
3399 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3400 *pt_prev = NULL;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003401 }
3402
Eric Dumazet24824a02010-10-02 06:11:55 +00003403 switch (ing_filter(skb, rxq)) {
Herbert Xuf697c3e2007-10-14 00:38:47 -07003404 case TC_ACT_SHOT:
3405 case TC_ACT_STOLEN:
3406 kfree_skb(skb);
3407 return NULL;
3408 }
3409
3410out:
3411 skb->tc_verd = 0;
3412 return skb;
3413}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003414#endif
3415
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003416/**
3417 * netdev_rx_handler_register - register receive handler
3418 * @dev: device to register a handler for
3419 * @rx_handler: receive handler to register
Jiri Pirko93e2c322010-06-10 03:34:59 +00003420 * @rx_handler_data: data pointer that is used by rx handler
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003421 *
3422 * Register a receive hander for a device. This handler will then be
3423 * called from __netif_receive_skb. A negative errno code is returned
3424 * on a failure.
3425 *
3426 * The caller must hold the rtnl_mutex.
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003427 *
3428 * For a general description of rx_handler, see enum rx_handler_result.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003429 */
3430int netdev_rx_handler_register(struct net_device *dev,
Jiri Pirko93e2c322010-06-10 03:34:59 +00003431 rx_handler_func_t *rx_handler,
3432 void *rx_handler_data)
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003433{
3434 ASSERT_RTNL();
3435
3436 if (dev->rx_handler)
3437 return -EBUSY;
3438
Eric Dumazet00cfec32013-03-29 03:01:22 +00003439 /* Note: rx_handler_data must be set before rx_handler */
Jiri Pirko93e2c322010-06-10 03:34:59 +00003440 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003441 rcu_assign_pointer(dev->rx_handler, rx_handler);
3442
3443 return 0;
3444}
3445EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3446
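/* Editor's illustrative sketch (not part of the original file): the shape of
 * an rx_handler as an upper device (bridge, bonding, macvlan, ...) might
 * register it. All names below are hypothetical; only the calls into the
 * rx_handler API and the rtnl locking are real.
 */
static rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct net_device *upper = rcu_dereference(skb->dev->rx_handler_data);

	if (!upper)
		return RX_HANDLER_PASS;

	/* A real upper device would typically retarget the skb and ask for
	 * another pass of the receive path:
	 *	skb->dev = upper;
	 *	return RX_HANDLER_ANOTHER;
	 * Here we simply let normal delivery continue.
	 */
	return RX_HANDLER_PASS;
}

static int example_attach_upper(struct net_device *lower,
				struct net_device *upper)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(lower, example_rx_handler, upper);
	rtnl_unlock();

	return err;
}
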
3447/**
3448 * netdev_rx_handler_unregister - unregister receive handler
3449 * @dev: device to unregister a handler from
3450 *
Kusanagi Kouichi166ec362013-03-18 02:59:52 +00003451 * Unregister a receive handler from a device.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003452 *
3453 * The caller must hold the rtnl_mutex.
3454 */
3455void netdev_rx_handler_unregister(struct net_device *dev)
3456{
3457
3458 ASSERT_RTNL();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003459 RCU_INIT_POINTER(dev->rx_handler, NULL);
Eric Dumazet00cfec32013-03-29 03:01:22 +00003460	/* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3461	 * section is guaranteed to see a non-NULL rx_handler_data
3462 * as well.
3463 */
3464 synchronize_net();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003465 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003466}
3467EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3468
Mel Gormanb4b9e352012-07-31 16:44:26 -07003469/*
3470 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3471 * the special handling of PFMEMALLOC skbs.
3472 */
3473static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3474{
3475 switch (skb->protocol) {
3476 case __constant_htons(ETH_P_ARP):
3477 case __constant_htons(ETH_P_IP):
3478 case __constant_htons(ETH_P_IPV6):
3479 case __constant_htons(ETH_P_8021Q):
Patrick McHardy8ad227f2013-04-19 02:04:31 +00003480 case __constant_htons(ETH_P_8021AD):
Mel Gormanb4b9e352012-07-31 16:44:26 -07003481 return true;
3482 default:
3483 return false;
3484 }
3485}
3486
David S. Miller9754e292013-02-14 15:57:38 -05003487static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003488{
3489 struct packet_type *ptype, *pt_prev;
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003490 rx_handler_func_t *rx_handler;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003491 struct net_device *orig_dev;
David S. Miller63d8ea72011-02-28 10:48:59 -08003492 struct net_device *null_or_dev;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003493 bool deliver_exact = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003494 int ret = NET_RX_DROP;
Al Viro252e3342006-11-14 20:48:11 -08003495 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003496
Eric Dumazet588f0332011-11-15 04:12:55 +00003497 net_timestamp_check(!netdev_tstamp_prequeue, skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07003498
Koki Sanagicf66ba52010-08-23 18:45:02 +09003499 trace_netif_receive_skb(skb);
Patrick McHardy9b22ea52008-11-04 14:49:57 -08003500
Linus Torvalds1da177e2005-04-16 15:20:36 -07003501 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003502 if (netpoll_receive_skb(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003503 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003504
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07003505 orig_dev = skb->dev;
Jiri Pirko1765a572011-02-12 06:48:36 +00003506
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003507 skb_reset_network_header(skb);
Eric Dumazetfda55ec2013-01-07 09:28:21 +00003508 if (!skb_transport_header_was_set(skb))
3509 skb_reset_transport_header(skb);
Jiri Pirko0b5c9db2011-06-10 06:56:58 +00003510 skb_reset_mac_len(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003511
3512 pt_prev = NULL;
3513
3514 rcu_read_lock();
3515
David S. Miller63d8ea72011-02-28 10:48:59 -08003516another_round:
David S. Millerb6858172012-07-23 16:27:54 -07003517 skb->skb_iif = skb->dev->ifindex;
David S. Miller63d8ea72011-02-28 10:48:59 -08003518
3519 __this_cpu_inc(softnet_data.processed);
3520
Patrick McHardy8ad227f2013-04-19 02:04:31 +00003521 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3522 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003523 skb = vlan_untag(skb);
3524 if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003525 goto unlock;
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003526 }
3527
Linus Torvalds1da177e2005-04-16 15:20:36 -07003528#ifdef CONFIG_NET_CLS_ACT
3529 if (skb->tc_verd & TC_NCLS) {
3530 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3531 goto ncls;
3532 }
3533#endif
3534
David S. Miller9754e292013-02-14 15:57:38 -05003535 if (pfmemalloc)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003536 goto skip_taps;
3537
Linus Torvalds1da177e2005-04-16 15:20:36 -07003538 list_for_each_entry_rcu(ptype, &ptype_all, list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003539 if (!ptype->dev || ptype->dev == skb->dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003540 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003541 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003542 pt_prev = ptype;
3543 }
3544 }
3545
Mel Gormanb4b9e352012-07-31 16:44:26 -07003546skip_taps:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003547#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07003548 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3549 if (!skb)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003550 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003551ncls:
3552#endif
3553
David S. Miller9754e292013-02-14 15:57:38 -05003554 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003555 goto drop;
3556
John Fastabend24257172011-10-10 09:16:41 +00003557 if (vlan_tx_tag_present(skb)) {
3558 if (pt_prev) {
3559 ret = deliver_skb(skb, pt_prev, orig_dev);
3560 pt_prev = NULL;
3561 }
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003562 if (vlan_do_receive(&skb))
John Fastabend24257172011-10-10 09:16:41 +00003563 goto another_round;
3564 else if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003565 goto unlock;
John Fastabend24257172011-10-10 09:16:41 +00003566 }
3567
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003568 rx_handler = rcu_dereference(skb->dev->rx_handler);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003569 if (rx_handler) {
3570 if (pt_prev) {
3571 ret = deliver_skb(skb, pt_prev, orig_dev);
3572 pt_prev = NULL;
3573 }
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003574 switch (rx_handler(&skb)) {
3575 case RX_HANDLER_CONSUMED:
Cristian Bercaru3bc1b1a2013-03-08 07:03:38 +00003576 ret = NET_RX_SUCCESS;
Mel Gormanb4b9e352012-07-31 16:44:26 -07003577 goto unlock;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003578 case RX_HANDLER_ANOTHER:
David S. Miller63d8ea72011-02-28 10:48:59 -08003579 goto another_round;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003580 case RX_HANDLER_EXACT:
3581 deliver_exact = true;
3582 case RX_HANDLER_PASS:
3583 break;
3584 default:
3585 BUG();
3586 }
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003587 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003588
Eric Dumazetd4b812d2013-07-18 07:19:26 -07003589 if (unlikely(vlan_tx_tag_present(skb))) {
3590 if (vlan_tx_tag_get_id(skb))
3591 skb->pkt_type = PACKET_OTHERHOST;
3592 /* Note: we might in the future use prio bits
3593		 * and set skb->priority like in vlan_do_receive().
3594		 * For the time being, just ignore the Priority Code Point.
3595 */
3596 skb->vlan_tci = 0;
3597 }
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003598
David S. Miller63d8ea72011-02-28 10:48:59 -08003599 /* deliver only exact match when indicated */
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003600 null_or_dev = deliver_exact ? skb->dev : NULL;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003601
Linus Torvalds1da177e2005-04-16 15:20:36 -07003602 type = skb->protocol;
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003603 list_for_each_entry_rcu(ptype,
3604 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003605 if (ptype->type == type &&
Jiri Pirkoe3f48d32011-02-28 20:26:31 +00003606 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3607 ptype->dev == orig_dev)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003608 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003609 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003610 pt_prev = ptype;
3611 }
3612 }
3613
3614 if (pt_prev) {
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003615 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
Michael S. Tsirkin0e698bf2012-09-15 22:44:16 +00003616 goto drop;
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003617 else
3618 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003619 } else {
Mel Gormanb4b9e352012-07-31 16:44:26 -07003620drop:
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003621 atomic_long_inc(&skb->dev->rx_dropped);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003622 kfree_skb(skb);
3623		/* Jamal, now you will not be able to escape explaining
3624		 * to me how you were going to use this. :-)
3625 */
3626 ret = NET_RX_DROP;
3627 }
3628
Mel Gormanb4b9e352012-07-31 16:44:26 -07003629unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003630 rcu_read_unlock();
Mel Gormanb4b9e352012-07-31 16:44:26 -07003631out:
David S. Miller9754e292013-02-14 15:57:38 -05003632 return ret;
3633}
3634
3635static int __netif_receive_skb(struct sk_buff *skb)
3636{
3637 int ret;
3638
3639 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3640 unsigned long pflags = current->flags;
3641
3642 /*
3643 * PFMEMALLOC skbs are special, they should
3644 * - be delivered to SOCK_MEMALLOC sockets only
3645 * - stay away from userspace
3646 * - have bounded memory usage
3647 *
3648 * Use PF_MEMALLOC as this saves us from propagating the allocation
3649 * context down to all allocation sites.
3650 */
3651 current->flags |= PF_MEMALLOC;
3652 ret = __netif_receive_skb_core(skb, true);
3653 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3654 } else
3655 ret = __netif_receive_skb_core(skb, false);
3656
Linus Torvalds1da177e2005-04-16 15:20:36 -07003657 return ret;
3658}
Tom Herbert0a9627f2010-03-16 08:03:29 +00003659
3660/**
3661 * netif_receive_skb - process receive buffer from network
3662 * @skb: buffer to process
3663 *
3664 * netif_receive_skb() is the main receive data processing function.
3665 * It always succeeds. The buffer may be dropped during processing
3666 * for congestion control or by the protocol layers.
3667 *
3668 * This function may only be called from softirq context and interrupts
3669 * should be enabled.
3670 *
3671 * Return values (usually ignored):
3672 * NET_RX_SUCCESS: no congestion
3673 * NET_RX_DROP: packet was dropped
3674 */
3675int netif_receive_skb(struct sk_buff *skb)
3676{
Eric Dumazet588f0332011-11-15 04:12:55 +00003677 net_timestamp_check(netdev_tstamp_prequeue, skb);
Eric Dumazet3b098e22010-05-15 23:57:10 -07003678
Richard Cochranc1f19b52010-07-17 08:49:36 +00003679 if (skb_defer_rx_timestamp(skb))
3680 return NET_RX_SUCCESS;
3681
Eric Dumazetdf334542010-03-24 19:13:54 +00003682#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003683 if (static_key_false(&rps_needed)) {
Eric Dumazet3b098e22010-05-15 23:57:10 -07003684 struct rps_dev_flow voidflow, *rflow = &voidflow;
3685 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003686
Eric Dumazet3b098e22010-05-15 23:57:10 -07003687 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00003688
Eric Dumazet3b098e22010-05-15 23:57:10 -07003689 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07003690
Eric Dumazet3b098e22010-05-15 23:57:10 -07003691 if (cpu >= 0) {
3692 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3693 rcu_read_unlock();
Eric Dumazetadc93002011-11-17 03:13:26 +00003694 return ret;
Eric Dumazet3b098e22010-05-15 23:57:10 -07003695 }
Eric Dumazetadc93002011-11-17 03:13:26 +00003696 rcu_read_unlock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003697 }
Tom Herbert1e94d722010-03-18 17:45:44 -07003698#endif
Eric Dumazetadc93002011-11-17 03:13:26 +00003699 return __netif_receive_skb(skb);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003700}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003701EXPORT_SYMBOL(netif_receive_skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003702
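/* Editor's illustrative sketch (not part of the original file): where
 * netif_receive_skb() sits in a NAPI poll routine. A real driver pulls
 * completed buffers off its RX ring; here a queued list of already-built
 * skbs (with skb->dev set) stands in for that, and all names are
 * hypothetical.
 */
struct example_rx_priv {
	struct napi_struct napi;
	struct sk_buff_head done_ring;	/* skbs already pulled off the RX ring */
};

static int example_napi_poll(struct napi_struct *napi, int budget)
{
	struct example_rx_priv *priv =
		container_of(napi, struct example_rx_priv, napi);
	struct sk_buff *skb;
	int work_done = 0;

	while (work_done < budget &&
	       (skb = __skb_dequeue(&priv->done_ring)) != NULL) {
		skb->protocol = eth_type_trans(skb, skb->dev);
		netif_receive_skb(skb);
		work_done++;
	}

	/* Only leave polling mode once the budget was not exhausted */
	if (work_done < budget)
		napi_complete(napi);

	return work_done;
}
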
Eric Dumazet88751272010-04-19 05:07:33 +00003703/* Network device is going away, flush any packets still pending
3704 * Called with irqs disabled.
3705 */
Changli Gao152102c2010-03-30 20:16:22 +00003706static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003707{
Changli Gao152102c2010-03-30 20:16:22 +00003708 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003709 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003710 struct sk_buff *skb, *tmp;
3711
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003712 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003713 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003714 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003715 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003716 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003717 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003718 }
Changli Gao6e7676c2010-04-27 15:07:33 -07003719 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003720 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003721
3722 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3723 if (skb->dev == dev) {
3724 __skb_unlink(skb, &sd->process_queue);
3725 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003726 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003727 }
3728 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003729}
3730
Herbert Xud565b0a2008-12-15 23:38:52 -08003731static int napi_gro_complete(struct sk_buff *skb)
3732{
Vlad Yasevich22061d82012-11-15 08:49:11 +00003733 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003734 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003735 struct list_head *head = &offload_base;
Herbert Xud565b0a2008-12-15 23:38:52 -08003736 int err = -ENOENT;
3737
Eric Dumazetc3c7c252012-12-06 13:54:59 +00003738 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3739
Herbert Xufc59f9a2009-04-14 15:11:06 -07003740 if (NAPI_GRO_CB(skb)->count == 1) {
3741 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003742 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07003743 }
Herbert Xud565b0a2008-12-15 23:38:52 -08003744
3745 rcu_read_lock();
3746 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003747 if (ptype->type != type || !ptype->callbacks.gro_complete)
Herbert Xud565b0a2008-12-15 23:38:52 -08003748 continue;
3749
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003750 err = ptype->callbacks.gro_complete(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003751 break;
3752 }
3753 rcu_read_unlock();
3754
3755 if (err) {
3756 WARN_ON(&ptype->list == head);
3757 kfree_skb(skb);
3758 return NET_RX_SUCCESS;
3759 }
3760
3761out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003762 return netif_receive_skb(skb);
3763}
3764
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003765/* napi->gro_list contains packets ordered by age.
3766 * The youngest packets are at its head.
3767 * Complete skbs in reverse order to reduce latencies.
3768 */
3769void napi_gro_flush(struct napi_struct *napi, bool flush_old)
Herbert Xud565b0a2008-12-15 23:38:52 -08003770{
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003771 struct sk_buff *skb, *prev = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003772
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003773 /* scan list and build reverse chain */
3774 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3775 skb->prev = prev;
3776 prev = skb;
Herbert Xud565b0a2008-12-15 23:38:52 -08003777 }
3778
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003779 for (skb = prev; skb; skb = prev) {
3780 skb->next = NULL;
3781
3782 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3783 return;
3784
3785 prev = skb->prev;
3786 napi_gro_complete(skb);
3787 napi->gro_count--;
3788 }
3789
Herbert Xud565b0a2008-12-15 23:38:52 -08003790 napi->gro_list = NULL;
3791}
Eric Dumazet86cac582010-08-31 18:25:32 +00003792EXPORT_SYMBOL(napi_gro_flush);
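/* Illustrative sketch only, not part of this file: napi_gro_flush() pushes
 * held GRO packets out to the stack.  napi_complete() already performs a
 * full flush, so an explicit call is mainly useful when a poll routine stays
 * scheduled but still wants to bound GRO latency, much like net_rx_action()
 * does with flush_old set.  The helper names (my_poll, my_clean_rx) are
 * hypothetical.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = my_clean_rx(napi, budget);
 *
 *		if (work == budget) {
 *			// still busy: stay scheduled, but push out aged skbs
 *			napi_gro_flush(napi, HZ >= 1000);
 *			return work;
 *		}
 *		napi_complete(napi);	// flushes the rest, clears SCHED
 *		return work;
 *	}
 */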
Herbert Xud565b0a2008-12-15 23:38:52 -08003793
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003794static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3795{
3796 struct sk_buff *p;
3797 unsigned int maclen = skb->dev->hard_header_len;
3798
3799 for (p = napi->gro_list; p; p = p->next) {
3800 unsigned long diffs;
3801
3802 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3803 diffs |= p->vlan_tci ^ skb->vlan_tci;
3804 if (maclen == ETH_HLEN)
3805 diffs |= compare_ether_header(skb_mac_header(p),
3806 skb_gro_mac_header(skb));
3807 else if (!diffs)
3808 diffs = memcmp(skb_mac_header(p),
3809 skb_gro_mac_header(skb),
3810 maclen);
3811 NAPI_GRO_CB(p)->same_flow = !diffs;
3812 NAPI_GRO_CB(p)->flush = 0;
3813 }
3814}
3815
Rami Rosenbb728822012-11-28 21:55:25 +00003816static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003817{
3818 struct sk_buff **pp = NULL;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003819 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003820 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003821 struct list_head *head = &offload_base;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003822 int same_flow;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003823 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003824
Jarek Poplawskice9e76c2010-08-05 01:19:11 +00003825 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
Herbert Xud565b0a2008-12-15 23:38:52 -08003826 goto normal;
3827
David S. Miller21dc3302010-08-23 00:13:46 -07003828 if (skb_is_gso(skb) || skb_has_frag_list(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003829 goto normal;
3830
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003831 gro_list_prepare(napi, skb);
3832
Herbert Xud565b0a2008-12-15 23:38:52 -08003833 rcu_read_lock();
3834 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003835 if (ptype->type != type || !ptype->callbacks.gro_receive)
Herbert Xud565b0a2008-12-15 23:38:52 -08003836 continue;
3837
Herbert Xu86911732009-01-29 14:19:50 +00003838 skb_set_network_header(skb, skb_gro_offset(skb));
Eric Dumazetefd94502013-02-14 17:31:48 +00003839 skb_reset_mac_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003840 NAPI_GRO_CB(skb)->same_flow = 0;
3841 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003842 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003843
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003844 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003845 break;
3846 }
3847 rcu_read_unlock();
3848
3849 if (&ptype->list == head)
3850 goto normal;
3851
Herbert Xu0da2afd52008-12-26 14:57:42 -08003852 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003853 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003854
Herbert Xud565b0a2008-12-15 23:38:52 -08003855 if (pp) {
3856 struct sk_buff *nskb = *pp;
3857
3858 *pp = nskb->next;
3859 nskb->next = NULL;
3860 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003861 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003862 }
3863
Herbert Xu0da2afd52008-12-26 14:57:42 -08003864 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003865 goto ok;
3866
Herbert Xu4ae55442009-02-08 18:00:36 +00003867 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003868 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003869
Herbert Xu4ae55442009-02-08 18:00:36 +00003870 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003871 NAPI_GRO_CB(skb)->count = 1;
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003872 NAPI_GRO_CB(skb)->age = jiffies;
Herbert Xu86911732009-01-29 14:19:50 +00003873 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003874 skb->next = napi->gro_list;
3875 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003876 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003877
Herbert Xuad0f9902009-02-01 01:24:55 -08003878pull:
Herbert Xucb189782009-05-26 18:50:31 +00003879 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3880 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3881
3882 BUG_ON(skb->end - skb->tail < grow);
3883
3884 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3885
3886 skb->tail += grow;
3887 skb->data_len -= grow;
3888
3889 skb_shinfo(skb)->frags[0].page_offset += grow;
Eric Dumazet9e903e02011-10-18 21:00:24 +00003890 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
Herbert Xucb189782009-05-26 18:50:31 +00003891
Eric Dumazet9e903e02011-10-18 21:00:24 +00003892 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
Ian Campbellea2ab692011-08-22 23:44:58 +00003893 skb_frag_unref(skb, 0);
Herbert Xucb189782009-05-26 18:50:31 +00003894 memmove(skb_shinfo(skb)->frags,
3895 skb_shinfo(skb)->frags + 1,
Jarek Poplawskie5093ae2010-08-11 02:02:10 +00003896 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
Herbert Xucb189782009-05-26 18:50:31 +00003897 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003898 }
3899
Herbert Xud565b0a2008-12-15 23:38:52 -08003900ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003901 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003902
3903normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003904 ret = GRO_NORMAL;
3905 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003906}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003907
Herbert Xu96e93ea2009-01-06 10:49:34 -08003908
Rami Rosenbb728822012-11-28 21:55:25 +00003909static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003910{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003911 switch (ret) {
3912 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003913 if (netif_receive_skb(skb))
3914 ret = GRO_DROP;
3915 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003916
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003917 case GRO_DROP:
Herbert Xu5d38a072009-01-04 16:13:40 -08003918 kfree_skb(skb);
3919 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003920
Eric Dumazetdaa86542012-04-19 07:07:40 +00003921 case GRO_MERGED_FREE:
Eric Dumazetd7e88832012-04-30 08:10:34 +00003922 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3923 kmem_cache_free(skbuff_head_cache, skb);
3924 else
3925 __kfree_skb(skb);
Eric Dumazetdaa86542012-04-19 07:07:40 +00003926 break;
3927
Ben Hutchings5b252f02009-10-29 07:17:09 +00003928 case GRO_HELD:
3929 case GRO_MERGED:
3930 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003931 }
3932
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003933 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003934}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003935
Eric Dumazetca07e432012-10-06 22:28:06 +00003936static void skb_gro_reset_offset(struct sk_buff *skb)
Herbert Xu78a478d2009-05-26 18:50:21 +00003937{
Eric Dumazetca07e432012-10-06 22:28:06 +00003938 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3939 const skb_frag_t *frag0 = &pinfo->frags[0];
3940
Herbert Xu78a478d2009-05-26 18:50:21 +00003941 NAPI_GRO_CB(skb)->data_offset = 0;
3942 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003943 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003944
Simon Hormanced14f62013-05-28 20:34:25 +00003945 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
Eric Dumazetca07e432012-10-06 22:28:06 +00003946 pinfo->nr_frags &&
3947 !PageHighMem(skb_frag_page(frag0))) {
3948 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3949 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
Herbert Xu74895942009-05-26 18:50:27 +00003950 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003951}
Herbert Xu78a478d2009-05-26 18:50:21 +00003952
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003953gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003954{
Herbert Xu86911732009-01-29 14:19:50 +00003955 skb_gro_reset_offset(skb);
3956
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003957 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003958}
3959EXPORT_SYMBOL(napi_gro_receive);
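/* Illustrative sketch only: the usual driver RX path hands each completed
 * skb to GRO instead of calling netif_receive_skb() directly, so that
 * same-flow segments can be merged before they reach the stack.  The names
 * my_build_skb, ring and priv are hypothetical.
 *
 *	struct sk_buff *skb = my_build_skb(ring, desc);
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	skb_record_rx_queue(skb, ring->queue_index);
 *	napi_gro_receive(&priv->napi, skb);
 */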
3960
stephen hemmingerd0c2b0d2010-10-19 07:12:10 +00003961static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003962{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003963 __skb_pull(skb, skb_headlen(skb));
Eric Dumazet2a2a4592012-03-21 06:58:03 +00003964 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3965 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
Jesse Gross3701e512010-10-20 13:56:06 +00003966 skb->vlan_tci = 0;
Herbert Xu66c46d72011-01-29 20:44:54 -08003967 skb->dev = napi->dev;
Andy Gospodarek6d152e22011-02-02 14:53:25 -08003968 skb->skb_iif = 0;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003969
3970 napi->skb = skb;
3971}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003972
Herbert Xu76620aa2009-04-16 02:02:07 -07003973struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003974{
Herbert Xu5d38a072009-01-04 16:13:40 -08003975 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003976
3977 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003978 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3979 if (skb)
3980 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003981 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003982 return skb;
3983}
Herbert Xu76620aa2009-04-16 02:02:07 -07003984EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003985
Rami Rosenbb728822012-11-28 21:55:25 +00003986static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003987 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003988{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003989 switch (ret) {
3990 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003991 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003992 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003993
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003994 if (ret == GRO_HELD)
3995 skb_gro_pull(skb, -ETH_HLEN);
3996 else if (netif_receive_skb(skb))
3997 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003998 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003999
4000 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004001 case GRO_MERGED_FREE:
4002 napi_reuse_skb(napi, skb);
4003 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00004004
4005 case GRO_MERGED:
4006 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004007 }
4008
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004009 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004010}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004011
Eric Dumazet4adb9c42012-05-18 20:49:06 +00004012static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08004013{
Herbert Xu76620aa2009-04-16 02:02:07 -07004014 struct sk_buff *skb = napi->skb;
4015 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00004016 unsigned int hlen;
4017 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07004018
4019 napi->skb = NULL;
4020
4021 skb_reset_mac_header(skb);
4022 skb_gro_reset_offset(skb);
4023
Herbert Xua5b1cf22009-05-26 18:50:28 +00004024 off = skb_gro_offset(skb);
4025 hlen = off + sizeof(*eth);
4026 eth = skb_gro_header_fast(skb, off);
4027 if (skb_gro_header_hard(skb, hlen)) {
4028 eth = skb_gro_header_slow(skb, hlen, off);
4029 if (unlikely(!eth)) {
4030 napi_reuse_skb(napi, skb);
4031 skb = NULL;
4032 goto out;
4033 }
Herbert Xu76620aa2009-04-16 02:02:07 -07004034 }
4035
4036 skb_gro_pull(skb, sizeof(*eth));
4037
4038 /*
4039 * This works because the only protocols we care about don't require
4040 * special handling. We'll fix it up properly at the end.
4041 */
4042 skb->protocol = eth->h_proto;
4043
4044out:
4045 return skb;
4046}
Herbert Xu76620aa2009-04-16 02:02:07 -07004047
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004048gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07004049{
4050 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08004051
4052 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004053 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08004054
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004055 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08004056}
4057EXPORT_SYMBOL(napi_gro_frags);
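/* Illustrative sketch only: drivers that receive directly into pages use the
 * napi_get_frags()/napi_gro_frags() pair instead of napi_gro_receive().  The
 * skb returned by napi_get_frags() carries no data; the driver attaches its
 * page fragments and lets napi_gro_frags() parse the Ethernet header (see
 * napi_frags_skb() above).  Names such as my_rx_page are hypothetical and
 * the size accounting is driver specific.
 *
 *	struct sk_buff *skb = napi_get_frags(&priv->napi);
 *
 *	if (!skb)
 *		return;		// out of memory, recycle the hardware buffer
 *	skb_fill_page_desc(skb, 0, my_rx_page, offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += PAGE_SIZE;
 *	napi_gro_frags(&priv->napi);
 */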
4058
Eric Dumazete326bed2010-04-22 00:22:45 -07004059/*
 4060 * net_rps_action sends any pending IPIs for RPS.
4061 * Note: called with local irq disabled, but exits with local irq enabled.
4062 */
4063static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4064{
4065#ifdef CONFIG_RPS
4066 struct softnet_data *remsd = sd->rps_ipi_list;
4067
4068 if (remsd) {
4069 sd->rps_ipi_list = NULL;
4070
4071 local_irq_enable();
4072
 4073 /* Send pending IPIs to kick RPS processing on remote cpus. */
4074 while (remsd) {
4075 struct softnet_data *next = remsd->rps_ipi_next;
4076
4077 if (cpu_online(remsd->cpu))
4078 __smp_call_function_single(remsd->cpu,
4079 &remsd->csd, 0);
4080 remsd = next;
4081 }
4082 } else
4083#endif
4084 local_irq_enable();
4085}
4086
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004087static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004088{
4089 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004090 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004091
Eric Dumazete326bed2010-04-22 00:22:45 -07004092#ifdef CONFIG_RPS
4093 /* Check if we have pending ipi, its better to send them now,
4094 * not waiting net_rx_action() end.
4095 */
4096 if (sd->rps_ipi_list) {
4097 local_irq_disable();
4098 net_rps_action_and_irq_enable(sd);
4099 }
4100#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004101 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07004102 local_irq_disable();
4103 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004104 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07004105 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004106
Changli Gao6e7676c2010-04-27 15:07:33 -07004107 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07004108 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07004109 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07004110 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00004111 input_queue_head_incr(sd);
4112 if (++work >= quota) {
4113 local_irq_enable();
4114 return work;
4115 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004116 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004117
Changli Gao6e7676c2010-04-27 15:07:33 -07004118 rps_lock(sd);
4119 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004120 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07004121 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4122 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004123
Changli Gao6e7676c2010-04-27 15:07:33 -07004124 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004125 /*
4126 * Inline a custom version of __napi_complete().
 4127 * Only the current cpu owns and manipulates this napi,
 4128 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
 4129 * so we can use a plain write instead of clear_bit(),
 4130 * and we don't need an smp_mb() memory barrier.
4131 */
4132 list_del(&napi->poll_list);
4133 napi->state = 0;
4134
Changli Gao6e7676c2010-04-27 15:07:33 -07004135 quota = work + qlen;
4136 }
4137 rps_unlock(sd);
4138 }
4139 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004140
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004141 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004142}
4143
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004144/**
4145 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004146 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004147 *
4148 * The entry's receive function will be scheduled to run
4149 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08004150void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004151{
4152 unsigned long flags;
4153
4154 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004155 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004156 local_irq_restore(flags);
4157}
4158EXPORT_SYMBOL(__napi_schedule);
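/* Illustrative sketch only: a typical device interrupt handler masks its RX
 * interrupt and defers the real work to NAPI, ending up here in
 * __napi_schedule().  my_irq, my_priv and my_mask_rx_irq are hypothetical.
 *
 *	static irqreturn_t my_irq(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			my_mask_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */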
4159
Herbert Xud565b0a2008-12-15 23:38:52 -08004160void __napi_complete(struct napi_struct *n)
4161{
4162 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4163 BUG_ON(n->gro_list);
4164
4165 list_del(&n->poll_list);
4166 smp_mb__before_clear_bit();
4167 clear_bit(NAPI_STATE_SCHED, &n->state);
4168}
4169EXPORT_SYMBOL(__napi_complete);
4170
4171void napi_complete(struct napi_struct *n)
4172{
4173 unsigned long flags;
4174
4175 /*
4176 * don't let napi dequeue from the cpu poll list
 4177 * just in case it's running on a different cpu.
4178 */
4179 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4180 return;
4181
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004182 napi_gro_flush(n, false);
Herbert Xud565b0a2008-12-15 23:38:52 -08004183 local_irq_save(flags);
4184 __napi_complete(n);
4185 local_irq_restore(flags);
4186}
4187EXPORT_SYMBOL(napi_complete);
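/* Illustrative sketch only: the canonical poll routine processes at most
 * @budget packets and only calls napi_complete() (and re-enables the device
 * interrupt) once the ring ran dry, i.e. work < budget.  All my_* names are
 * hypothetical.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct my_priv *priv = container_of(napi, struct my_priv, napi);
 *		int work = my_clean_rx(priv, budget);
 *
 *		if (work < budget) {
 *			napi_complete(napi);
 *			my_unmask_rx_irq(priv);
 *		}
 *		return work;
 *	}
 */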
4188
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004189/* must be called under rcu_read_lock(), as we dont take a reference */
4190struct napi_struct *napi_by_id(unsigned int napi_id)
4191{
4192 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4193 struct napi_struct *napi;
4194
4195 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4196 if (napi->napi_id == napi_id)
4197 return napi;
4198
4199 return NULL;
4200}
4201EXPORT_SYMBOL_GPL(napi_by_id);
4202
4203void napi_hash_add(struct napi_struct *napi)
4204{
4205 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4206
4207 spin_lock(&napi_hash_lock);
4208
 4209 /* 0 is not a valid id; we also skip an id that is already taken.
 4210 * We expect both events to be extremely rare.
4211 */
4212 napi->napi_id = 0;
4213 while (!napi->napi_id) {
4214 napi->napi_id = ++napi_gen_id;
4215 if (napi_by_id(napi->napi_id))
4216 napi->napi_id = 0;
4217 }
4218
4219 hlist_add_head_rcu(&napi->napi_hash_node,
4220 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4221
4222 spin_unlock(&napi_hash_lock);
4223 }
4224}
4225EXPORT_SYMBOL_GPL(napi_hash_add);
4226
 4227/* Warning: the caller is responsible for making sure an rcu grace period
 4228 * has elapsed before freeing the memory containing @napi
4229 */
4230void napi_hash_del(struct napi_struct *napi)
4231{
4232 spin_lock(&napi_hash_lock);
4233
4234 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4235 hlist_del_rcu(&napi->napi_hash_node);
4236
4237 spin_unlock(&napi_hash_lock);
4238}
4239EXPORT_SYMBOL_GPL(napi_hash_del);
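/* Illustrative sketch only: drivers that want their queues visible to busy
 * polling hash each NAPI instance after registering it and unhash it on
 * teardown.  Since napi_by_id() is RCU based, an RCU grace period must pass
 * between napi_hash_del() and freeing the containing structure.  The qv
 * (queue vector) structure and my_poll are hypothetical.
 *
 *	netif_napi_add(dev, &qv->napi, my_poll, NAPI_POLL_WEIGHT);
 *	napi_hash_add(&qv->napi);
 *	...
 *	napi_hash_del(&qv->napi);
 *	netif_napi_del(&qv->napi);
 *	synchronize_rcu();	// or free qv via kfree_rcu()
 *	kfree(qv);
 */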
4240
Herbert Xud565b0a2008-12-15 23:38:52 -08004241void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4242 int (*poll)(struct napi_struct *, int), int weight)
4243{
4244 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00004245 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004246 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08004247 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08004248 napi->poll = poll;
Eric Dumazet82dc3c62013-03-05 15:57:22 +00004249 if (weight > NAPI_POLL_WEIGHT)
4250 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4251 weight, dev->name);
Herbert Xud565b0a2008-12-15 23:38:52 -08004252 napi->weight = weight;
4253 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08004254 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08004255#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08004256 spin_lock_init(&napi->poll_lock);
4257 napi->poll_owner = -1;
4258#endif
4259 set_bit(NAPI_STATE_SCHED, &napi->state);
4260}
4261EXPORT_SYMBOL(netif_napi_add);
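/* Illustrative sketch only: netif_napi_add() is normally called once per RX
 * queue during driver probe, before register_netdev(), pairing each queue
 * with its poll routine and a weight (usually NAPI_POLL_WEIGHT).  my_probe
 * and my_poll are hypothetical.
 *
 *	static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		...
 *		netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *		return register_netdev(netdev);
 *	}
 */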
4262
4263void netif_napi_del(struct napi_struct *napi)
4264{
4265 struct sk_buff *skb, *next;
4266
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08004267 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07004268 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08004269
4270 for (skb = napi->gro_list; skb; skb = next) {
4271 next = skb->next;
4272 skb->next = NULL;
4273 kfree_skb(skb);
4274 }
4275
4276 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00004277 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004278}
4279EXPORT_SYMBOL(netif_napi_del);
4280
Linus Torvalds1da177e2005-04-16 15:20:36 -07004281static void net_rx_action(struct softirq_action *h)
4282{
Eric Dumazete326bed2010-04-22 00:22:45 -07004283 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004284 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07004285 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07004286 void *have;
4287
Linus Torvalds1da177e2005-04-16 15:20:36 -07004288 local_irq_disable();
4289
Eric Dumazete326bed2010-04-22 00:22:45 -07004290 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004291 struct napi_struct *n;
4292 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004293
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004294 /* If the softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004295 * Allow this to run for 2 jiffies, which will allow
4296 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004297 */
Eric Dumazetd1f41b62013-03-05 07:15:13 +00004298 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004299 goto softnet_break;
4300
4301 local_irq_enable();
4302
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004303 /* Even though interrupts have been re-enabled, this
4304 * access is safe because interrupts can only add new
4305 * entries to the tail of this list, and only ->poll()
4306 * calls can remove this head entry from the list.
4307 */
Eric Dumazete326bed2010-04-22 00:22:45 -07004308 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004309
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004310 have = netpoll_poll_lock(n);
4311
4312 weight = n->weight;
4313
David S. Miller0a7606c2007-10-29 21:28:47 -07004314 /* This NAPI_STATE_SCHED test is for avoiding a race
4315 * with netpoll's poll_napi(). Only the entity which
4316 * obtains the lock and sees NAPI_STATE_SCHED set will
4317 * actually make the ->poll() call. Therefore we avoid
Lucas De Marchi25985ed2011-03-30 22:57:33 -03004318 * accidentally calling ->poll() when NAPI is not scheduled.
David S. Miller0a7606c2007-10-29 21:28:47 -07004319 */
4320 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00004321 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07004322 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00004323 trace_napi_poll(n);
4324 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004325
4326 WARN_ON_ONCE(work > weight);
4327
4328 budget -= work;
4329
4330 local_irq_disable();
4331
4332 /* Drivers must not modify the NAPI state if they
4333 * consume the entire weight. In such cases this code
4334 * still "owns" the NAPI instance and therefore can
4335 * move the instance around on the list at-will.
4336 */
David S. Millerfed17f32008-01-07 21:00:40 -08004337 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07004338 if (unlikely(napi_disable_pending(n))) {
4339 local_irq_enable();
4340 napi_complete(n);
4341 local_irq_disable();
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004342 } else {
4343 if (n->gro_list) {
4344 /* flush too old packets
4345 * If HZ < 1000, flush all packets.
4346 */
4347 local_irq_enable();
4348 napi_gro_flush(n, HZ >= 1000);
4349 local_irq_disable();
4350 }
Eric Dumazete326bed2010-04-22 00:22:45 -07004351 list_move_tail(&n->poll_list, &sd->poll_list);
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004352 }
David S. Millerfed17f32008-01-07 21:00:40 -08004353 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004354
4355 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004356 }
4357out:
Eric Dumazete326bed2010-04-22 00:22:45 -07004358 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00004359
Chris Leechdb217332006-06-17 21:24:58 -07004360#ifdef CONFIG_NET_DMA
4361 /*
4362 * There may not be any more sk_buffs coming right now, so push
4363 * any pending DMA copies to hardware
4364 */
Dan Williams2ba05622009-01-06 11:38:14 -07004365 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07004366#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004367
Linus Torvalds1da177e2005-04-16 15:20:36 -07004368 return;
4369
4370softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00004371 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004372 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4373 goto out;
4374}
4375
Veaceslav Falicoaa9d8562013-08-28 23:25:04 +02004376struct netdev_adjacent {
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004377 struct net_device *dev;
Veaceslav Falico5d261912013-08-28 23:25:05 +02004378
4379 /* upper master flag, there can only be one master device per list */
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004380 bool master;
Veaceslav Falico5d261912013-08-28 23:25:05 +02004381
Veaceslav Falico5d261912013-08-28 23:25:05 +02004382 /* counter for the number of times this device was added to us */
4383 u16 ref_nr;
4384
Veaceslav Falico402dae92013-09-25 09:20:09 +02004385 /* private field for the users */
4386 void *private;
4387
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004388 struct list_head list;
4389 struct rcu_head rcu;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004390};
4391
Veaceslav Falico5249dec2013-09-25 09:20:08 +02004392static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev,
4393 struct net_device *adj_dev,
4394 struct list_head *adj_list)
4395{
4396 struct netdev_adjacent *adj;
4397
4398 list_for_each_entry_rcu(adj, adj_list, list) {
4399 if (adj->dev == adj_dev)
4400 return adj;
4401 }
4402 return NULL;
4403}
4404
Veaceslav Falico5d261912013-08-28 23:25:05 +02004405static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4406 struct net_device *adj_dev,
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004407 struct list_head *adj_list)
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004408{
Veaceslav Falico5d261912013-08-28 23:25:05 +02004409 struct netdev_adjacent *adj;
Veaceslav Falico5d261912013-08-28 23:25:05 +02004410
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004411 list_for_each_entry(adj, adj_list, list) {
Veaceslav Falico5d261912013-08-28 23:25:05 +02004412 if (adj->dev == adj_dev)
4413 return adj;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004414 }
4415 return NULL;
4416}
4417
4418/**
4419 * netdev_has_upper_dev - Check if device is linked to an upper device
4420 * @dev: device
4421 * @upper_dev: upper device to check
4422 *
 4423 * Find out if a device is linked to the specified upper device and return true
 4424 * in case it is. Note that this checks only the immediate upper device,
 4425 * not the complete stack of devices. The caller must hold the RTNL lock.
4426 */
4427bool netdev_has_upper_dev(struct net_device *dev,
4428 struct net_device *upper_dev)
4429{
4430 ASSERT_RTNL();
4431
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004432 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004433}
4434EXPORT_SYMBOL(netdev_has_upper_dev);
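/* Illustrative sketch only: a stacking driver can use these checks under
 * RTNL before linking devices, e.g. to refuse configurations that would
 * create a loop or a second master.  The surrounding enslave logic is
 * hypothetical.
 *
 *	ASSERT_RTNL();
 *	if (netdev_has_upper_dev(upper_dev, dev))
 *		return -EBUSY;	// dev is already stacked above upper_dev
 *	if (netdev_master_upper_dev_get(dev))
 *		return -EBUSY;	// dev already has a master
 */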
4435
4436/**
4437 * netdev_has_any_upper_dev - Check if device is linked to some device
4438 * @dev: device
4439 *
4440 * Find out if a device is linked to an upper device and return true in case
4441 * it is. The caller must hold the RTNL lock.
4442 */
4443bool netdev_has_any_upper_dev(struct net_device *dev)
4444{
4445 ASSERT_RTNL();
4446
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004447 return !list_empty(&dev->all_adj_list.upper);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004448}
4449EXPORT_SYMBOL(netdev_has_any_upper_dev);
4450
4451/**
4452 * netdev_master_upper_dev_get - Get master upper device
4453 * @dev: device
4454 *
4455 * Find a master upper device and return pointer to it or NULL in case
4456 * it's not there. The caller must hold the RTNL lock.
4457 */
4458struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4459{
Veaceslav Falicoaa9d8562013-08-28 23:25:04 +02004460 struct netdev_adjacent *upper;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004461
4462 ASSERT_RTNL();
4463
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004464 if (list_empty(&dev->adj_list.upper))
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004465 return NULL;
4466
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004467 upper = list_first_entry(&dev->adj_list.upper,
Veaceslav Falicoaa9d8562013-08-28 23:25:04 +02004468 struct netdev_adjacent, list);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004469 if (likely(upper->master))
4470 return upper->dev;
4471 return NULL;
4472}
4473EXPORT_SYMBOL(netdev_master_upper_dev_get);
4474
Veaceslav Falicob6ccba42013-09-25 09:20:23 +02004475void *netdev_adjacent_get_private(struct list_head *adj_list)
4476{
4477 struct netdev_adjacent *adj;
4478
4479 adj = list_entry(adj_list, struct netdev_adjacent, list);
4480
4481 return adj->private;
4482}
4483EXPORT_SYMBOL(netdev_adjacent_get_private);
4484
Veaceslav Falico31088a12013-09-25 09:20:12 +02004485/**
4486 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
Veaceslav Falico48311f42013-08-28 23:25:07 +02004487 * @dev: device
4488 * @iter: list_head ** of the current position
4489 *
4490 * Gets the next device from the dev's upper list, starting from iter
4491 * position. The caller must hold RCU read lock.
4492 */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004493struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4494 struct list_head **iter)
Veaceslav Falico48311f42013-08-28 23:25:07 +02004495{
4496 struct netdev_adjacent *upper;
4497
4498 WARN_ON_ONCE(!rcu_read_lock_held());
4499
4500 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4501
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004502 if (&upper->list == &dev->all_adj_list.upper)
Veaceslav Falico48311f42013-08-28 23:25:07 +02004503 return NULL;
4504
4505 *iter = &upper->list;
4506
4507 return upper->dev;
4508}
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004509EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
Veaceslav Falico48311f42013-08-28 23:25:07 +02004510
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004511/**
Veaceslav Falico31088a12013-09-25 09:20:12 +02004512 * netdev_lower_get_next_private - Get the next ->private from the
4513 * lower neighbour list
4514 * @dev: device
4515 * @iter: list_head ** of the current position
4516 *
4517 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 4518 * list, starting from iter position. The caller must either hold the
 4519 * RTNL lock or its own locking that guarantees that the neighbour lower
 4520 * list will remain unchanged.
4521 */
4522void *netdev_lower_get_next_private(struct net_device *dev,
4523 struct list_head **iter)
4524{
4525 struct netdev_adjacent *lower;
4526
4527 lower = list_entry(*iter, struct netdev_adjacent, list);
4528
4529 if (&lower->list == &dev->adj_list.lower)
4530 return NULL;
4531
4532 if (iter)
4533 *iter = lower->list.next;
4534
4535 return lower->private;
4536}
4537EXPORT_SYMBOL(netdev_lower_get_next_private);
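/* Illustrative sketch only: a master driver that stored per-slave state via
 * netdev_master_upper_dev_link_private() can walk it back like this while
 * holding RTNL (so the plain, non-RCU iterator is enough).  struct my_slave
 * and my_handle_slave are hypothetical.
 *
 *	struct list_head *iter = master_dev->adj_list.lower.next;
 *	struct my_slave *s;
 *
 *	while ((s = netdev_lower_get_next_private(master_dev, &iter)) != NULL)
 *		my_handle_slave(s);
 */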
4538
4539/**
4540 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4541 * lower neighbour list, RCU
4542 * variant
4543 * @dev: device
4544 * @iter: list_head ** of the current position
4545 *
4546 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4547 * list, starting from iter position. The caller must hold RCU read lock.
4548 */
4549void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4550 struct list_head **iter)
4551{
4552 struct netdev_adjacent *lower;
4553
4554 WARN_ON_ONCE(!rcu_read_lock_held());
4555
4556 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4557
4558 if (&lower->list == &dev->adj_list.lower)
4559 return NULL;
4560
4561 if (iter)
4562 *iter = &lower->list;
4563
4564 return lower->private;
4565}
4566EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4567
4568/**
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004569 * netdev_master_upper_dev_get_rcu - Get master upper device
4570 * @dev: device
4571 *
4572 * Find a master upper device and return pointer to it or NULL in case
4573 * it's not there. The caller must hold the RCU read lock.
4574 */
4575struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4576{
Veaceslav Falicoaa9d8562013-08-28 23:25:04 +02004577 struct netdev_adjacent *upper;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004578
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004579 upper = list_first_or_null_rcu(&dev->adj_list.upper,
Veaceslav Falicoaa9d8562013-08-28 23:25:04 +02004580 struct netdev_adjacent, list);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004581 if (upper && likely(upper->master))
4582 return upper->dev;
4583 return NULL;
4584}
4585EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4586
Veaceslav Falico5d261912013-08-28 23:25:05 +02004587static int __netdev_adjacent_dev_insert(struct net_device *dev,
4588 struct net_device *adj_dev,
Veaceslav Falico7863c052013-09-25 09:20:06 +02004589 struct list_head *dev_list,
Veaceslav Falico402dae92013-09-25 09:20:09 +02004590 void *private, bool master)
Veaceslav Falico5d261912013-08-28 23:25:05 +02004591{
4592 struct netdev_adjacent *adj;
Veaceslav Falico5831d66e2013-09-25 09:20:32 +02004593 char linkname[IFNAMSIZ+7];
Veaceslav Falico842d67a2013-09-25 09:20:31 +02004594 int ret;
Veaceslav Falico5d261912013-08-28 23:25:05 +02004595
Veaceslav Falico7863c052013-09-25 09:20:06 +02004596 adj = __netdev_find_adj(dev, adj_dev, dev_list);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004597
4598 if (adj) {
Veaceslav Falico5d261912013-08-28 23:25:05 +02004599 adj->ref_nr++;
4600 return 0;
4601 }
4602
4603 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4604 if (!adj)
4605 return -ENOMEM;
4606
4607 adj->dev = adj_dev;
4608 adj->master = master;
Veaceslav Falico5d261912013-08-28 23:25:05 +02004609 adj->ref_nr = 1;
Veaceslav Falico402dae92013-09-25 09:20:09 +02004610 adj->private = private;
Veaceslav Falico5d261912013-08-28 23:25:05 +02004611 dev_hold(adj_dev);
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004612
4613 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4614 adj_dev->name, dev->name, adj_dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004615
Veaceslav Falico5831d66e2013-09-25 09:20:32 +02004616 if (dev_list == &dev->adj_list.lower) {
4617 sprintf(linkname, "lower_%s", adj_dev->name);
4618 ret = sysfs_create_link(&(dev->dev.kobj),
4619 &(adj_dev->dev.kobj), linkname);
4620 if (ret)
4621 goto free_adj;
4622 } else if (dev_list == &dev->adj_list.upper) {
4623 sprintf(linkname, "upper_%s", adj_dev->name);
4624 ret = sysfs_create_link(&(dev->dev.kobj),
4625 &(adj_dev->dev.kobj), linkname);
4626 if (ret)
4627 goto free_adj;
4628 }
4629
Veaceslav Falico7863c052013-09-25 09:20:06 +02004630 /* Ensure that master link is always the first item in list. */
Veaceslav Falico842d67a2013-09-25 09:20:31 +02004631 if (master) {
4632 ret = sysfs_create_link(&(dev->dev.kobj),
4633 &(adj_dev->dev.kobj), "master");
4634 if (ret)
Veaceslav Falico5831d66e2013-09-25 09:20:32 +02004635 goto remove_symlinks;
Veaceslav Falico842d67a2013-09-25 09:20:31 +02004636
Veaceslav Falico7863c052013-09-25 09:20:06 +02004637 list_add_rcu(&adj->list, dev_list);
Veaceslav Falico842d67a2013-09-25 09:20:31 +02004638 } else {
Veaceslav Falico7863c052013-09-25 09:20:06 +02004639 list_add_tail_rcu(&adj->list, dev_list);
Veaceslav Falico842d67a2013-09-25 09:20:31 +02004640 }
Veaceslav Falico5d261912013-08-28 23:25:05 +02004641
4642 return 0;
Veaceslav Falico842d67a2013-09-25 09:20:31 +02004643
Veaceslav Falico5831d66e2013-09-25 09:20:32 +02004644remove_symlinks:
4645 if (dev_list == &dev->adj_list.lower) {
4646 sprintf(linkname, "lower_%s", adj_dev->name);
4647 sysfs_remove_link(&(dev->dev.kobj), linkname);
4648 } else if (dev_list == &dev->adj_list.upper) {
4649 sprintf(linkname, "upper_%s", adj_dev->name);
4650 sysfs_remove_link(&(dev->dev.kobj), linkname);
4651 }
4652
Veaceslav Falico842d67a2013-09-25 09:20:31 +02004653free_adj:
4654 kfree(adj);
Nikolay Aleksandrov974daef2013-10-23 15:28:56 +02004655 dev_put(adj_dev);
Veaceslav Falico842d67a2013-09-25 09:20:31 +02004656
4657 return ret;
Veaceslav Falico5d261912013-08-28 23:25:05 +02004658}
4659
Veaceslav Falico5d261912013-08-28 23:25:05 +02004660void __netdev_adjacent_dev_remove(struct net_device *dev,
Veaceslav Falico7863c052013-09-25 09:20:06 +02004661 struct net_device *adj_dev,
4662 struct list_head *dev_list)
Veaceslav Falico5d261912013-08-28 23:25:05 +02004663{
4664 struct netdev_adjacent *adj;
Veaceslav Falico5831d66e2013-09-25 09:20:32 +02004665 char linkname[IFNAMSIZ+7];
Veaceslav Falico5d261912013-08-28 23:25:05 +02004666
Veaceslav Falico7863c052013-09-25 09:20:06 +02004667 adj = __netdev_find_adj(dev, adj_dev, dev_list);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004668
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004669 if (!adj) {
4670 pr_err("tried to remove device %s from %s\n",
4671 dev->name, adj_dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004672 BUG();
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004673 }
Veaceslav Falico5d261912013-08-28 23:25:05 +02004674
4675 if (adj->ref_nr > 1) {
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004676 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4677 adj->ref_nr-1);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004678 adj->ref_nr--;
4679 return;
4680 }
4681
Veaceslav Falico842d67a2013-09-25 09:20:31 +02004682 if (adj->master)
4683 sysfs_remove_link(&(dev->dev.kobj), "master");
4684
Veaceslav Falico5831d66e2013-09-25 09:20:32 +02004685 if (dev_list == &dev->adj_list.lower) {
4686 sprintf(linkname, "lower_%s", adj_dev->name);
4687 sysfs_remove_link(&(dev->dev.kobj), linkname);
4688 } else if (dev_list == &dev->adj_list.upper) {
4689 sprintf(linkname, "upper_%s", adj_dev->name);
4690 sysfs_remove_link(&(dev->dev.kobj), linkname);
4691 }
4692
Veaceslav Falico5d261912013-08-28 23:25:05 +02004693 list_del_rcu(&adj->list);
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004694 pr_debug("dev_put for %s, because link removed from %s to %s\n",
4695 adj_dev->name, dev->name, adj_dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004696 dev_put(adj_dev);
4697 kfree_rcu(adj, rcu);
4698}
4699
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004700int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4701 struct net_device *upper_dev,
4702 struct list_head *up_list,
4703 struct list_head *down_list,
Veaceslav Falico402dae92013-09-25 09:20:09 +02004704 void *private, bool master)
Veaceslav Falico5d261912013-08-28 23:25:05 +02004705{
4706 int ret;
4707
Veaceslav Falico402dae92013-09-25 09:20:09 +02004708 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4709 master);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004710 if (ret)
4711 return ret;
4712
Veaceslav Falico402dae92013-09-25 09:20:09 +02004713 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4714 false);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004715 if (ret) {
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004716 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004717 return ret;
4718 }
4719
4720 return 0;
4721}
4722
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004723int __netdev_adjacent_dev_link(struct net_device *dev,
4724 struct net_device *upper_dev)
Veaceslav Falico5d261912013-08-28 23:25:05 +02004725{
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004726 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4727 &dev->all_adj_list.upper,
4728 &upper_dev->all_adj_list.lower,
Veaceslav Falico402dae92013-09-25 09:20:09 +02004729 NULL, false);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004730}
4731
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004732void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
4733 struct net_device *upper_dev,
4734 struct list_head *up_list,
4735 struct list_head *down_list)
Veaceslav Falico5d261912013-08-28 23:25:05 +02004736{
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004737 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4738 __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004739}
4740
4741void __netdev_adjacent_dev_unlink(struct net_device *dev,
4742 struct net_device *upper_dev)
4743{
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004744 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4745 &dev->all_adj_list.upper,
4746 &upper_dev->all_adj_list.lower);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004747}
4748
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004749int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4750 struct net_device *upper_dev,
Veaceslav Falico402dae92013-09-25 09:20:09 +02004751 void *private, bool master)
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004752{
4753 int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4754
4755 if (ret)
4756 return ret;
4757
4758 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4759 &dev->adj_list.upper,
4760 &upper_dev->adj_list.lower,
Veaceslav Falico402dae92013-09-25 09:20:09 +02004761 private, master);
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004762 if (ret) {
4763 __netdev_adjacent_dev_unlink(dev, upper_dev);
4764 return ret;
4765 }
4766
4767 return 0;
4768}
4769
4770void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4771 struct net_device *upper_dev)
4772{
4773 __netdev_adjacent_dev_unlink(dev, upper_dev);
4774 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4775 &dev->adj_list.upper,
4776 &upper_dev->adj_list.lower);
4777}
Veaceslav Falico5d261912013-08-28 23:25:05 +02004778
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004779static int __netdev_upper_dev_link(struct net_device *dev,
Veaceslav Falico402dae92013-09-25 09:20:09 +02004780 struct net_device *upper_dev, bool master,
4781 void *private)
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004782{
Veaceslav Falico5d261912013-08-28 23:25:05 +02004783 struct netdev_adjacent *i, *j, *to_i, *to_j;
4784 int ret = 0;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004785
4786 ASSERT_RTNL();
4787
4788 if (dev == upper_dev)
4789 return -EBUSY;
4790
4791 /* To prevent loops, check if dev is not upper device to upper_dev. */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004792 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004793 return -EBUSY;
4794
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004795 if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004796 return -EEXIST;
4797
4798 if (master && netdev_master_upper_dev_get(dev))
4799 return -EBUSY;
4800
Veaceslav Falico402dae92013-09-25 09:20:09 +02004801 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
4802 master);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004803 if (ret)
4804 return ret;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004805
Veaceslav Falico5d261912013-08-28 23:25:05 +02004806 /* Now that we have linked these devs, make all the upper_dev's
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004807 * all_adj_list.upper visible to every dev's all_adj_list.lower and
Veaceslav Falico5d261912013-08-28 23:25:05 +02004808 * vice versa, and don't forget the devices themselves. All of these
4809 * links are non-neighbours.
4810 */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004811 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4812 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
4813 pr_debug("Interlinking %s with %s, non-neighbour\n",
4814 i->dev->name, j->dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004815 ret = __netdev_adjacent_dev_link(i->dev, j->dev);
4816 if (ret)
4817 goto rollback_mesh;
4818 }
4819 }
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004820
Veaceslav Falico5d261912013-08-28 23:25:05 +02004821 /* add dev to every upper_dev's upper device */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004822 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
4823 pr_debug("linking %s's upper device %s with %s\n",
4824 upper_dev->name, i->dev->name, dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004825 ret = __netdev_adjacent_dev_link(dev, i->dev);
4826 if (ret)
4827 goto rollback_upper_mesh;
4828 }
4829
4830 /* add upper_dev to every dev's lower device */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004831 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4832 pr_debug("linking %s's lower device %s with %s\n", dev->name,
4833 i->dev->name, upper_dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004834 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
4835 if (ret)
4836 goto rollback_lower_mesh;
4837 }
4838
Jiri Pirko42e52bf2013-05-25 04:12:10 +00004839 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004840 return 0;
Veaceslav Falico5d261912013-08-28 23:25:05 +02004841
4842rollback_lower_mesh:
4843 to_i = i;
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004844 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
Veaceslav Falico5d261912013-08-28 23:25:05 +02004845 if (i == to_i)
4846 break;
4847 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
4848 }
4849
4850 i = NULL;
4851
4852rollback_upper_mesh:
4853 to_i = i;
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004854 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
Veaceslav Falico5d261912013-08-28 23:25:05 +02004855 if (i == to_i)
4856 break;
4857 __netdev_adjacent_dev_unlink(dev, i->dev);
4858 }
4859
4860 i = j = NULL;
4861
4862rollback_mesh:
4863 to_i = i;
4864 to_j = j;
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004865 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4866 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
Veaceslav Falico5d261912013-08-28 23:25:05 +02004867 if (i == to_i && j == to_j)
4868 break;
4869 __netdev_adjacent_dev_unlink(i->dev, j->dev);
4870 }
4871 if (i == to_i)
4872 break;
4873 }
4874
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004875 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004876
4877 return ret;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004878}
4879
4880/**
4881 * netdev_upper_dev_link - Add a link to the upper device
4882 * @dev: device
4883 * @upper_dev: new upper device
4884 *
4885 * Adds a link to device which is upper to this one. The caller must hold
4886 * the RTNL lock. On a failure a negative errno code is returned.
4887 * On success the reference counts are adjusted and the function
4888 * returns zero.
4889 */
4890int netdev_upper_dev_link(struct net_device *dev,
4891 struct net_device *upper_dev)
4892{
Veaceslav Falico402dae92013-09-25 09:20:09 +02004893 return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004894}
4895EXPORT_SYMBOL(netdev_upper_dev_link);
4896
4897/**
4898 * netdev_master_upper_dev_link - Add a master link to the upper device
4899 * @dev: device
4900 * @upper_dev: new upper device
4901 *
4902 * Adds a link to device which is upper to this one. In this case, only
4903 * one master upper device can be linked, although other non-master devices
4904 * might be linked as well. The caller must hold the RTNL lock.
4905 * On a failure a negative errno code is returned. On success the reference
4906 * counts are adjusted and the function returns zero.
4907 */
4908int netdev_master_upper_dev_link(struct net_device *dev,
4909 struct net_device *upper_dev)
4910{
Veaceslav Falico402dae92013-09-25 09:20:09 +02004911 return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004912}
4913EXPORT_SYMBOL(netdev_master_upper_dev_link);
4914
Veaceslav Falico402dae92013-09-25 09:20:09 +02004915int netdev_master_upper_dev_link_private(struct net_device *dev,
4916 struct net_device *upper_dev,
4917 void *private)
4918{
4919 return __netdev_upper_dev_link(dev, upper_dev, true, private);
4920}
4921EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
4922
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004923/**
4924 * netdev_upper_dev_unlink - Removes a link to upper device
4925 * @dev: device
4926 * @upper_dev: new upper device
4927 *
4928 * Removes a link to device which is upper to this one. The caller must hold
4929 * the RTNL lock.
4930 */
4931void netdev_upper_dev_unlink(struct net_device *dev,
4932 struct net_device *upper_dev)
4933{
Veaceslav Falico5d261912013-08-28 23:25:05 +02004934 struct netdev_adjacent *i, *j;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004935 ASSERT_RTNL();
4936
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004937 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
Veaceslav Falico5d261912013-08-28 23:25:05 +02004938
4939 /* Here is the tricky part. We must remove all dev's lower
4940 * devices from all upper_dev's upper devices and vice
4941 * versa, to maintain the graph relationship.
4942 */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004943 list_for_each_entry(i, &dev->all_adj_list.lower, list)
4944 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
Veaceslav Falico5d261912013-08-28 23:25:05 +02004945 __netdev_adjacent_dev_unlink(i->dev, j->dev);
4946
 4947 /* also remove the devices themselves from the lower/upper device
 4948 * lists
4949 */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004950 list_for_each_entry(i, &dev->all_adj_list.lower, list)
Veaceslav Falico5d261912013-08-28 23:25:05 +02004951 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
4952
Veaceslav Falico2f268f12013-09-25 09:20:07 +02004953 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
Veaceslav Falico5d261912013-08-28 23:25:05 +02004954 __netdev_adjacent_dev_unlink(dev, i->dev);
4955
Jiri Pirko42e52bf2013-05-25 04:12:10 +00004956 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004957}
4958EXPORT_SYMBOL(netdev_upper_dev_unlink);
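/* Illustrative sketch only: a bonding/team style master links a slave when
 * it is enslaved and unlinks it again on release, both under RTNL.  Error
 * handling and the device names are hypothetical.
 *
 *	rtnl_lock();
 *	err = netdev_master_upper_dev_link(slave_dev, master_dev);
 *	rtnl_unlock();
 *	...
 *	// and on release:
 *	rtnl_lock();
 *	netdev_upper_dev_unlink(slave_dev, master_dev);
 *	rtnl_unlock();
 */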
4959
Veaceslav Falico402dae92013-09-25 09:20:09 +02004960void *netdev_lower_dev_get_private_rcu(struct net_device *dev,
4961 struct net_device *lower_dev)
4962{
4963 struct netdev_adjacent *lower;
4964
4965 if (!lower_dev)
4966 return NULL;
4967 lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower);
4968 if (!lower)
4969 return NULL;
4970
4971 return lower->private;
4972}
4973EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu);
4974
4975void *netdev_lower_dev_get_private(struct net_device *dev,
4976 struct net_device *lower_dev)
4977{
4978 struct netdev_adjacent *lower;
4979
4980 if (!lower_dev)
4981 return NULL;
4982 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
4983 if (!lower)
4984 return NULL;
4985
4986 return lower->private;
4987}
4988EXPORT_SYMBOL(netdev_lower_dev_get_private);
4989
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004990static void dev_change_rx_flags(struct net_device *dev, int flags)
4991{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004992 const struct net_device_ops *ops = dev->netdev_ops;
4993
4994 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4995 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004996}
4997
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02004998static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
Patrick McHardy4417da62007-06-27 01:28:10 -07004999{
Eric Dumazetb536db92011-11-30 21:42:26 +00005000 unsigned int old_flags = dev->flags;
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06005001 kuid_t uid;
5002 kgid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07005003
Patrick McHardy24023452007-07-14 18:51:31 -07005004 ASSERT_RTNL();
5005
Wang Chendad9b332008-06-18 01:48:28 -07005006 dev->flags |= IFF_PROMISC;
5007 dev->promiscuity += inc;
5008 if (dev->promiscuity == 0) {
5009 /*
5010 * Avoid overflow.
5011 * If inc causes overflow, untouch promisc and return error.
5012 */
5013 if (inc < 0)
5014 dev->flags &= ~IFF_PROMISC;
5015 else {
5016 dev->promiscuity -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005017 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5018 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07005019 return -EOVERFLOW;
5020 }
5021 }
Patrick McHardy4417da62007-06-27 01:28:10 -07005022 if (dev->flags != old_flags) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005023 pr_info("device %s %s promiscuous mode\n",
5024 dev->name,
5025 dev->flags & IFF_PROMISC ? "entered" : "left");
David Howells8192b0c2008-11-14 10:39:10 +11005026 if (audit_enabled) {
5027 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05005028 audit_log(current->audit_context, GFP_ATOMIC,
5029 AUDIT_ANOM_PROMISCUOUS,
5030 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5031 dev->name, (dev->flags & IFF_PROMISC),
5032 (old_flags & IFF_PROMISC),
Eric W. Biedermane1760bd2012-09-10 22:39:43 -07005033 from_kuid(&init_user_ns, audit_get_loginuid(current)),
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06005034 from_kuid(&init_user_ns, uid),
5035 from_kgid(&init_user_ns, gid),
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05005036 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11005037 }
Patrick McHardy24023452007-07-14 18:51:31 -07005038
Patrick McHardyb6c40d62008-10-07 15:26:48 -07005039 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07005040 }
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005041 if (notify)
5042 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
Wang Chendad9b332008-06-18 01:48:28 -07005043 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07005044}
5045
Linus Torvalds1da177e2005-04-16 15:20:36 -07005046/**
5047 * dev_set_promiscuity - update promiscuity count on a device
5048 * @dev: device
5049 * @inc: modifier
5050 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07005051 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005052 * remains above zero the interface remains promiscuous. Once it hits zero
5053 * the device reverts back to normal filtering operation. A negative inc
5054 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07005055 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005056 */
Wang Chendad9b332008-06-18 01:48:28 -07005057int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005058{
Eric Dumazetb536db92011-11-30 21:42:26 +00005059 unsigned int old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07005060 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005061
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005062 err = __dev_set_promiscuity(dev, inc, true);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07005063 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07005064 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07005065 if (dev->flags != old_flags)
5066 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07005067 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005068}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005069EXPORT_SYMBOL(dev_set_promiscuity);
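/*
 * Editor's note: a minimal usage sketch, not part of the original file.
 * A packet-capture style module that already holds a reference to a
 * registered net_device could bump the promiscuity count like this;
 * example_capture_start/_stop are hypothetical names, and the caller is
 * assumed to include <linux/netdevice.h> and <linux/rtnetlink.h>.
 *
 *	static int example_capture_start(struct net_device *dev)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = dev_set_promiscuity(dev, 1);
 *		rtnl_unlock();
 *		return err;
 *	}
 *
 *	static void example_capture_stop(struct net_device *dev)
 *	{
 *		rtnl_lock();
 *		dev_set_promiscuity(dev, -1);
 *		rtnl_unlock();
 *	}
 */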
Linus Torvalds1da177e2005-04-16 15:20:36 -07005070
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005071static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005072{
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005073 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005074
Patrick McHardy24023452007-07-14 18:51:31 -07005075 ASSERT_RTNL();
5076
Linus Torvalds1da177e2005-04-16 15:20:36 -07005077 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07005078 dev->allmulti += inc;
5079 if (dev->allmulti == 0) {
5080 /*
5081 * Avoid overflow.
 5082		 * If inc causes overflow, leave allmulti untouched and return an error.
5083 */
5084 if (inc < 0)
5085 dev->flags &= ~IFF_ALLMULTI;
5086 else {
5087 dev->allmulti -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005088 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5089 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07005090 return -EOVERFLOW;
5091 }
5092 }
Patrick McHardy24023452007-07-14 18:51:31 -07005093 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07005094 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07005095 dev_set_rx_mode(dev);
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005096 if (notify)
5097 __dev_notify_flags(dev, old_flags,
5098 dev->gflags ^ old_gflags);
Patrick McHardy24023452007-07-14 18:51:31 -07005099 }
Wang Chendad9b332008-06-18 01:48:28 -07005100 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07005101}
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005102
5103/**
5104 * dev_set_allmulti - update allmulti count on a device
5105 * @dev: device
5106 * @inc: modifier
5107 *
5108 * Add or remove reception of all multicast frames to a device. While the
5109 * count in the device remains above zero the interface remains listening
 5110 *	to all multicast frames. Once it hits zero the device reverts back to normal
5111 * filtering operation. A negative @inc value is used to drop the counter
5112 * when releasing a resource needing all multicasts.
5113 * Return 0 if successful or a negative errno code on error.
5114 */
5115
5116int dev_set_allmulti(struct net_device *dev, int inc)
5117{
5118 return __dev_set_allmulti(dev, inc, true);
5119}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005120EXPORT_SYMBOL(dev_set_allmulti);
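/*
 * Editor's note: a hedged sketch, not in the original source, of how a
 * protocol that needs every multicast frame (a routing daemon helper, say)
 * might pin and release all-multicast mode. example_need_allmulti is a
 * hypothetical helper; RTNL must be held around dev_set_allmulti().
 *
 *	static int example_need_allmulti(struct net_device *dev, bool on)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = dev_set_allmulti(dev, on ? 1 : -1);
 *		rtnl_unlock();
 *		return err;
 *	}
 */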
Patrick McHardy4417da62007-06-27 01:28:10 -07005121
5122/*
5123 * Upload unicast and multicast address lists to device and
5124 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08005125 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07005126 * are present.
5127 */
5128void __dev_set_rx_mode(struct net_device *dev)
5129{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005130 const struct net_device_ops *ops = dev->netdev_ops;
5131
Patrick McHardy4417da62007-06-27 01:28:10 -07005132 /* dev_open will call this function so the list will stay sane. */
5133 if (!(dev->flags&IFF_UP))
5134 return;
5135
5136 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09005137 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07005138
Jiri Pirko01789342011-08-16 06:29:00 +00005139 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
Patrick McHardy4417da62007-06-27 01:28:10 -07005140	/* Unicast address changes may only happen under the rtnl,
5141 * therefore calling __dev_set_promiscuity here is safe.
5142 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08005143 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005144 __dev_set_promiscuity(dev, 1, false);
Joe Perches2d348d12011-07-25 16:17:35 -07005145 dev->uc_promisc = true;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08005146 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005147 __dev_set_promiscuity(dev, -1, false);
Joe Perches2d348d12011-07-25 16:17:35 -07005148 dev->uc_promisc = false;
Patrick McHardy4417da62007-06-27 01:28:10 -07005149 }
Patrick McHardy4417da62007-06-27 01:28:10 -07005150 }
Jiri Pirko01789342011-08-16 06:29:00 +00005151
5152 if (ops->ndo_set_rx_mode)
5153 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07005154}
5155
5156void dev_set_rx_mode(struct net_device *dev)
5157{
David S. Millerb9e40852008-07-15 00:15:08 -07005158 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07005159 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07005160 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005161}
5162
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005163/**
5164 * dev_get_flags - get flags reported to userspace
5165 * @dev: device
5166 *
5167 * Get the combination of flag bits exported through APIs to userspace.
5168 */
Eric Dumazet95c96172012-04-15 05:58:06 +00005169unsigned int dev_get_flags(const struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005170{
Eric Dumazet95c96172012-04-15 05:58:06 +00005171 unsigned int flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005172
5173 flags = (dev->flags & ~(IFF_PROMISC |
5174 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08005175 IFF_RUNNING |
5176 IFF_LOWER_UP |
5177 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07005178 (dev->gflags & (IFF_PROMISC |
5179 IFF_ALLMULTI));
5180
Stefan Rompfb00055a2006-03-20 17:09:11 -08005181 if (netif_running(dev)) {
5182 if (netif_oper_up(dev))
5183 flags |= IFF_RUNNING;
5184 if (netif_carrier_ok(dev))
5185 flags |= IFF_LOWER_UP;
5186 if (netif_dormant(dev))
5187 flags |= IFF_DORMANT;
5188 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005189
5190 return flags;
5191}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005192EXPORT_SYMBOL(dev_get_flags);
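/*
 * Editor's note: illustrative only, not from the original file. The value
 * returned by dev_get_flags() mixes dev->flags with the IFF_RUNNING /
 * IFF_LOWER_UP / IFF_DORMANT bits derived from operstate, so a reader
 * might test link state like this (hypothetical helper name):
 *
 *	static bool example_link_is_up(const struct net_device *dev)
 *	{
 *		unsigned int flags = dev_get_flags(dev);
 *
 *		return (flags & (IFF_UP | IFF_RUNNING)) ==
 *		       (IFF_UP | IFF_RUNNING);
 *	}
 */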
Linus Torvalds1da177e2005-04-16 15:20:36 -07005193
Patrick McHardybd380812010-02-26 06:34:53 +00005194int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005195{
Eric Dumazetb536db92011-11-30 21:42:26 +00005196 unsigned int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00005197 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005198
Patrick McHardy24023452007-07-14 18:51:31 -07005199 ASSERT_RTNL();
5200
Linus Torvalds1da177e2005-04-16 15:20:36 -07005201 /*
5202 * Set the flags on our device.
5203 */
5204
5205 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5206 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5207 IFF_AUTOMEDIA)) |
5208 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5209 IFF_ALLMULTI));
5210
5211 /*
 5212	 * Load in the correct multicast list now that the flags have changed.
5213 */
5214
Patrick McHardyb6c40d62008-10-07 15:26:48 -07005215 if ((old_flags ^ flags) & IFF_MULTICAST)
5216 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07005217
Patrick McHardy4417da62007-06-27 01:28:10 -07005218 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005219
5220 /*
 5221	 * Have we downed the interface? We handle IFF_UP ourselves
5222 * according to user attempts to set it, rather than blindly
5223 * setting it.
5224 */
5225
5226 ret = 0;
5227 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00005228 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005229
5230 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07005231 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005232 }
5233
Linus Torvalds1da177e2005-04-16 15:20:36 -07005234 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005235 int inc = (flags & IFF_PROMISC) ? 1 : -1;
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005236 unsigned int old_flags = dev->flags;
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005237
Linus Torvalds1da177e2005-04-16 15:20:36 -07005238 dev->gflags ^= IFF_PROMISC;
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005239
5240 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5241 if (dev->flags != old_flags)
5242 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005243 }
5244
5245 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 5246	   is important. Some (broken) drivers set IFF_PROMISC when
 5247	   IFF_ALLMULTI is requested, without asking us and without reporting it.
5248 */
5249 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005250 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5251
Linus Torvalds1da177e2005-04-16 15:20:36 -07005252 dev->gflags ^= IFF_ALLMULTI;
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005253 __dev_set_allmulti(dev, inc, false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005254 }
5255
Patrick McHardybd380812010-02-26 06:34:53 +00005256 return ret;
5257}
5258
Nicolas Dichtela528c212013-09-25 12:02:44 +02005259void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5260 unsigned int gchanges)
Patrick McHardybd380812010-02-26 06:34:53 +00005261{
5262 unsigned int changes = dev->flags ^ old_flags;
5263
Nicolas Dichtela528c212013-09-25 12:02:44 +02005264 if (gchanges)
Alexei Starovoitov7f294052013-10-23 16:02:42 -07005265 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
Nicolas Dichtela528c212013-09-25 12:02:44 +02005266
Patrick McHardybd380812010-02-26 06:34:53 +00005267 if (changes & IFF_UP) {
5268 if (dev->flags & IFF_UP)
5269 call_netdevice_notifiers(NETDEV_UP, dev);
5270 else
5271 call_netdevice_notifiers(NETDEV_DOWN, dev);
5272 }
5273
5274 if (dev->flags & IFF_UP &&
Jiri Pirkobe9efd32013-05-28 01:30:22 +00005275 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5276 struct netdev_notifier_change_info change_info;
5277
5278 change_info.flags_changed = changes;
5279 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5280 &change_info.info);
5281 }
Patrick McHardybd380812010-02-26 06:34:53 +00005282}
5283
5284/**
5285 * dev_change_flags - change device settings
5286 * @dev: device
5287 * @flags: device state flags
5288 *
 5289 *	Change settings on device based on state flags. The flags are
5290 * in the userspace exported format.
5291 */
Eric Dumazetb536db92011-11-30 21:42:26 +00005292int dev_change_flags(struct net_device *dev, unsigned int flags)
Patrick McHardybd380812010-02-26 06:34:53 +00005293{
Eric Dumazetb536db92011-11-30 21:42:26 +00005294 int ret;
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005295 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
Patrick McHardybd380812010-02-26 06:34:53 +00005296
5297 ret = __dev_change_flags(dev, flags);
5298 if (ret < 0)
5299 return ret;
5300
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005301 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
Nicolas Dichtela528c212013-09-25 12:02:44 +02005302 __dev_notify_flags(dev, old_flags, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005303 return ret;
5304}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005305EXPORT_SYMBOL(dev_change_flags);
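/*
 * Editor's note: a hedged example, not part of the original source, of
 * bringing an interface administratively up the way the SIOCSIFFLAGS
 * ioctl path does: read the userspace-format flags, set IFF_UP, and write
 * them back under RTNL. example_if_up is a hypothetical helper.
 *
 *	static int example_if_up(struct net_device *dev)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
 *		rtnl_unlock();
 *		return err;
 *	}
 */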
Linus Torvalds1da177e2005-04-16 15:20:36 -07005306
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005307/**
5308 * dev_set_mtu - Change maximum transfer unit
5309 * @dev: device
5310 * @new_mtu: new transfer unit
5311 *
5312 * Change the maximum transfer size of the network device.
5313 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005314int dev_set_mtu(struct net_device *dev, int new_mtu)
5315{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005316 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005317 int err;
5318
5319 if (new_mtu == dev->mtu)
5320 return 0;
5321
 5322	/* MTU must not be negative. */
5323 if (new_mtu < 0)
5324 return -EINVAL;
5325
5326 if (!netif_device_present(dev))
5327 return -ENODEV;
5328
5329 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005330 if (ops->ndo_change_mtu)
5331 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005332 else
5333 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005334
Jiri Pirkoe3d8fab2012-12-03 01:16:32 +00005335 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005336 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005337 return err;
5338}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005339EXPORT_SYMBOL(dev_set_mtu);
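/*
 * Editor's note: sketch only, not in the original file. A tunnel-style
 * driver that must shrink the MTU of an underlying device could do so
 * under RTNL; example_clamp_mtu and the 1400-byte value are hypothetical.
 *
 *	static int example_clamp_mtu(struct net_device *lower)
 *	{
 *		int err = 0;
 *
 *		rtnl_lock();
 *		if (lower->mtu > 1400)
 *			err = dev_set_mtu(lower, 1400);
 *		rtnl_unlock();
 *		return err;
 *	}
 */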
Linus Torvalds1da177e2005-04-16 15:20:36 -07005340
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005341/**
Vlad Dogarucbda10f2011-01-13 23:38:30 +00005342 * dev_set_group - Change group this device belongs to
5343 * @dev: device
5344 * @new_group: group this device should belong to
5345 */
5346void dev_set_group(struct net_device *dev, int new_group)
5347{
5348 dev->group = new_group;
5349}
5350EXPORT_SYMBOL(dev_set_group);
5351
5352/**
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005353 * dev_set_mac_address - Change Media Access Control Address
5354 * @dev: device
5355 * @sa: new address
5356 *
5357 * Change the hardware (MAC) address of the device
5358 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005359int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5360{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005361 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005362 int err;
5363
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005364 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005365 return -EOPNOTSUPP;
5366 if (sa->sa_family != dev->type)
5367 return -EINVAL;
5368 if (!netif_device_present(dev))
5369 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005370 err = ops->ndo_set_mac_address(dev, sa);
Jiri Pirkof6521512013-01-01 03:30:14 +00005371 if (err)
5372 return err;
Jiri Pirkofbdeca22013-01-01 03:30:16 +00005373 dev->addr_assign_type = NET_ADDR_SET;
Jiri Pirkof6521512013-01-01 03:30:14 +00005374 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04005375 add_device_randomness(dev->dev_addr, dev->addr_len);
Jiri Pirkof6521512013-01-01 03:30:14 +00005376 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005377}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005378EXPORT_SYMBOL(dev_set_mac_address);
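/*
 * Editor's note: a hedged example, not from the original source, of
 * setting a new hardware address from a caller that already validated it;
 * example_set_mac is hypothetical. The sockaddr must carry the device's
 * type in sa_family (ARPHRD_ETHER for Ethernet) and the address in sa_data.
 *
 *	static int example_set_mac(struct net_device *dev, const u8 *addr)
 *	{
 *		struct sockaddr sa;
 *		int err;
 *
 *		sa.sa_family = dev->type;
 *		memcpy(sa.sa_data, addr, dev->addr_len);
 *
 *		rtnl_lock();
 *		err = dev_set_mac_address(dev, &sa);
 *		rtnl_unlock();
 *		return err;
 *	}
 */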
Linus Torvalds1da177e2005-04-16 15:20:36 -07005379
Jiri Pirko4bf84c32012-12-27 23:49:37 +00005380/**
5381 * dev_change_carrier - Change device carrier
5382 * @dev: device
Randy Dunlap691b3b72013-03-04 12:32:43 +00005383 * @new_carrier: new value
Jiri Pirko4bf84c32012-12-27 23:49:37 +00005384 *
5385 * Change device carrier
5386 */
5387int dev_change_carrier(struct net_device *dev, bool new_carrier)
5388{
5389 const struct net_device_ops *ops = dev->netdev_ops;
5390
5391 if (!ops->ndo_change_carrier)
5392 return -EOPNOTSUPP;
5393 if (!netif_device_present(dev))
5394 return -ENODEV;
5395 return ops->ndo_change_carrier(dev, new_carrier);
5396}
5397EXPORT_SYMBOL(dev_change_carrier);
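/*
 * Editor's note: illustrative sketch, not part of the original file.
 * dev_change_carrier() only works for devices whose driver implements
 * ndo_change_carrier (software devices such as dummy, for example); a
 * management path might force carrier off like this, holding RTNL as the
 * other control-path helpers here expect:
 *
 *	static int example_force_carrier_off(struct net_device *dev)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = dev_change_carrier(dev, false);
 *		rtnl_unlock();
 *		return err;
 *	}
 */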
5398
Linus Torvalds1da177e2005-04-16 15:20:36 -07005399/**
Jiri Pirko66b52b02013-07-29 18:16:49 +02005400 * dev_get_phys_port_id - Get device physical port ID
5401 * @dev: device
5402 * @ppid: port ID
5403 *
5404 * Get device physical port ID
5405 */
5406int dev_get_phys_port_id(struct net_device *dev,
5407 struct netdev_phys_port_id *ppid)
5408{
5409 const struct net_device_ops *ops = dev->netdev_ops;
5410
5411 if (!ops->ndo_get_phys_port_id)
5412 return -EOPNOTSUPP;
5413 return ops->ndo_get_phys_port_id(dev, ppid);
5414}
5415EXPORT_SYMBOL(dev_get_phys_port_id);
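/*
 * Editor's note: a hedged sketch, not in the original source, of querying
 * the physical port ID, similar in spirit to what the rtnetlink code does
 * when filling IFLA_PHYS_PORT_ID. example_log_port_id is hypothetical;
 * %*phN is the printk extension for a variable-length hex dump.
 *
 *	static void example_log_port_id(struct net_device *dev)
 *	{
 *		struct netdev_phys_port_id ppid;
 *
 *		if (dev_get_phys_port_id(dev, &ppid))
 *			return;
 *		netdev_info(dev, "phys port id: %*phN\n",
 *			    ppid.id_len, ppid.id);
 *	}
 */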
5416
5417/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005418 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07005419 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07005420 *
5421 * Returns a suitable unique value for a new device interface
5422 * number. The caller must hold the rtnl semaphore or the
5423 * dev_base_lock to be sure it remains unique.
5424 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07005425static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005426{
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00005427 int ifindex = net->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005428 for (;;) {
5429 if (++ifindex <= 0)
5430 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005431 if (!__dev_get_by_index(net, ifindex))
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00005432 return net->ifindex = ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005433 }
5434}
5435
Linus Torvalds1da177e2005-04-16 15:20:36 -07005436/* Delayed registration/unregistration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08005437static LIST_HEAD(net_todo_list);
Eric W. Biederman50624c92013-09-23 21:19:49 -07005438static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005439
Stephen Hemminger6f05f622007-03-08 20:46:03 -08005440static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005441{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005442 list_add_tail(&dev->todo_list, &net_todo_list);
Eric W. Biederman50624c92013-09-23 21:19:49 -07005443 dev_net(dev)->dev_unreg_count++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005444}
5445
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005446static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005447{
Krishna Kumare93737b2009-12-08 22:26:02 +00005448 struct net_device *dev, *tmp;
Eric W. Biederman5cde2822013-10-05 19:26:05 -07005449 LIST_HEAD(close_head);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005450
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005451 BUG_ON(dev_boot_phase);
5452 ASSERT_RTNL();
5453
Krishna Kumare93737b2009-12-08 22:26:02 +00005454 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005455 /* Some devices call without registering
Krishna Kumare93737b2009-12-08 22:26:02 +00005456 * for initialization unwind. Remove those
5457 * devices and proceed with the remaining.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005458 */
5459 if (dev->reg_state == NETREG_UNINITIALIZED) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005460 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5461 dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005462
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005463 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00005464 list_del(&dev->unreg_list);
5465 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005466 }
Eric Dumazet449f4542011-05-19 12:24:16 +00005467 dev->dismantle = true;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005468 BUG_ON(dev->reg_state != NETREG_REGISTERED);
Octavian Purdila44345722010-12-13 12:44:07 +00005469 }
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005470
Octavian Purdila44345722010-12-13 12:44:07 +00005471 /* If device is running, close it first. */
Eric W. Biederman5cde2822013-10-05 19:26:05 -07005472 list_for_each_entry(dev, head, unreg_list)
5473 list_add_tail(&dev->close_list, &close_head);
5474 dev_close_many(&close_head);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005475
Octavian Purdila44345722010-12-13 12:44:07 +00005476 list_for_each_entry(dev, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005477 /* And unlink it from device chain. */
5478 unlist_netdevice(dev);
5479
5480 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005481 }
5482
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005483 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005484
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005485 list_for_each_entry(dev, head, unreg_list) {
5486 /* Shutdown queueing discipline. */
5487 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005488
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005489
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005490		/* Notify protocols that we are about to destroy
 5491		   this device. They should clean up all their state.
5492 */
5493 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5494
Patrick McHardya2835762010-02-26 06:34:51 +00005495 if (!dev->rtnl_link_ops ||
5496 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
Alexei Starovoitov7f294052013-10-23 16:02:42 -07005497 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
Patrick McHardya2835762010-02-26 06:34:51 +00005498
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005499 /*
5500 * Flush the unicast and multicast chains
5501 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005502 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005503 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005504
5505 if (dev->netdev_ops->ndo_uninit)
5506 dev->netdev_ops->ndo_uninit(dev);
5507
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005508 /* Notifier chain MUST detach us all upper devices. */
5509 WARN_ON(netdev_has_any_upper_dev(dev));
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005510
5511 /* Remove entries from kobject tree */
5512 netdev_unregister_kobject(dev);
Alexander Duyck024e9672013-01-10 08:57:46 +00005513#ifdef CONFIG_XPS
5514 /* Remove XPS queueing entries */
5515 netif_reset_xps_queues_gt(dev, 0);
5516#endif
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005517 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005518
Eric W. Biederman850a5452011-10-13 22:25:23 +00005519 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005520
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005521 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005522 dev_put(dev);
5523}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005524
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005525static void rollback_registered(struct net_device *dev)
5526{
5527 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005528
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005529 list_add(&dev->unreg_list, &single);
5530 rollback_registered_many(&single);
Eric Dumazetceaaec92011-02-17 22:59:19 +00005531 list_del(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005532}
5533
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005534static netdev_features_t netdev_fix_features(struct net_device *dev,
5535 netdev_features_t features)
Herbert Xub63365a2008-10-23 01:11:29 -07005536{
Michał Mirosław57422dc2011-01-22 12:14:12 +00005537 /* Fix illegal checksum combinations */
5538 if ((features & NETIF_F_HW_CSUM) &&
5539 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005540 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
Michał Mirosław57422dc2011-01-22 12:14:12 +00005541 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5542 }
5543
Herbert Xub63365a2008-10-23 01:11:29 -07005544 /* TSO requires that SG is present as well. */
Ben Hutchingsea2d3682011-04-12 14:38:37 +00005545 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005546 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
Ben Hutchingsea2d3682011-04-12 14:38:37 +00005547 features &= ~NETIF_F_ALL_TSO;
Herbert Xub63365a2008-10-23 01:11:29 -07005548 }
5549
Pravin B Shelarec5f0612013-03-07 09:28:01 +00005550 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5551 !(features & NETIF_F_IP_CSUM)) {
5552 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5553 features &= ~NETIF_F_TSO;
5554 features &= ~NETIF_F_TSO_ECN;
5555 }
5556
5557 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5558 !(features & NETIF_F_IPV6_CSUM)) {
5559 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5560 features &= ~NETIF_F_TSO6;
5561 }
5562
Ben Hutchings31d8b9e2011-04-12 14:47:15 +00005563 /* TSO ECN requires that TSO is present as well. */
5564 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5565 features &= ~NETIF_F_TSO_ECN;
5566
Michał Mirosław212b5732011-02-15 16:59:16 +00005567 /* Software GSO depends on SG. */
5568 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005569 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
Michał Mirosław212b5732011-02-15 16:59:16 +00005570 features &= ~NETIF_F_GSO;
5571 }
5572
Michał Mirosławacd11302011-01-24 15:45:15 -08005573 /* UFO needs SG and checksumming */
Herbert Xub63365a2008-10-23 01:11:29 -07005574 if (features & NETIF_F_UFO) {
Michał Mirosław79032642010-11-30 06:38:00 +00005575 /* maybe split UFO into V4 and V6? */
5576 if (!((features & NETIF_F_GEN_CSUM) ||
5577 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5578 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005579 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08005580 "Dropping NETIF_F_UFO since no checksum offload features.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07005581 features &= ~NETIF_F_UFO;
5582 }
5583
5584 if (!(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005585 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08005586 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07005587 features &= ~NETIF_F_UFO;
5588 }
5589 }
5590
5591 return features;
5592}
Herbert Xub63365a2008-10-23 01:11:29 -07005593
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005594int __netdev_update_features(struct net_device *dev)
Michał Mirosław5455c692011-02-15 16:59:17 +00005595{
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005596 netdev_features_t features;
Michał Mirosław5455c692011-02-15 16:59:17 +00005597 int err = 0;
5598
Michał Mirosław87267482011-04-12 09:56:38 +00005599 ASSERT_RTNL();
5600
Michał Mirosław5455c692011-02-15 16:59:17 +00005601 features = netdev_get_wanted_features(dev);
5602
5603 if (dev->netdev_ops->ndo_fix_features)
5604 features = dev->netdev_ops->ndo_fix_features(dev, features);
5605
5606 /* driver might be less strict about feature dependencies */
5607 features = netdev_fix_features(dev, features);
5608
5609 if (dev->features == features)
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005610 return 0;
Michał Mirosław5455c692011-02-15 16:59:17 +00005611
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005612 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5613 &dev->features, &features);
Michał Mirosław5455c692011-02-15 16:59:17 +00005614
5615 if (dev->netdev_ops->ndo_set_features)
5616 err = dev->netdev_ops->ndo_set_features(dev, features);
5617
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005618 if (unlikely(err < 0)) {
Michał Mirosław5455c692011-02-15 16:59:17 +00005619 netdev_err(dev,
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005620 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5621 err, &features, &dev->features);
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005622 return -1;
5623 }
5624
5625 if (!err)
5626 dev->features = features;
5627
5628 return 1;
5629}
5630
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005631/**
5632 * netdev_update_features - recalculate device features
5633 * @dev: the device to check
5634 *
5635 * Recalculate dev->features set and send notifications if it
5636 * has changed. Should be called after driver or hardware dependent
5637 * conditions might have changed that influence the features.
5638 */
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005639void netdev_update_features(struct net_device *dev)
5640{
5641 if (__netdev_update_features(dev))
5642 netdev_features_change(dev);
Michał Mirosław5455c692011-02-15 16:59:17 +00005643}
5644EXPORT_SYMBOL(netdev_update_features);
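/*
 * Editor's note: sketch only, not part of the original file. A driver
 * whose offload capability changes at runtime (after a firmware
 * reconfiguration, for instance) would adjust dev->hw_features and then
 * ask the core to recompute and notify; the trigger condition and helper
 * name below are hypothetical.
 *
 *	static void example_fw_reconfigured(struct net_device *dev,
 *					    bool tso_ok)
 *	{
 *		rtnl_lock();
 *		if (tso_ok)
 *			dev->hw_features |= NETIF_F_TSO;
 *		else
 *			dev->hw_features &= ~NETIF_F_TSO;
 *		netdev_update_features(dev);
 *		rtnl_unlock();
 *	}
 */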
5645
Linus Torvalds1da177e2005-04-16 15:20:36 -07005646/**
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005647 * netdev_change_features - recalculate device features
5648 * @dev: the device to check
5649 *
5650 * Recalculate dev->features set and send notifications even
5651 * if they have not changed. Should be called instead of
5652 * netdev_update_features() if also dev->vlan_features might
5653 * have changed to allow the changes to be propagated to stacked
5654 * VLAN devices.
5655 */
5656void netdev_change_features(struct net_device *dev)
5657{
5658 __netdev_update_features(dev);
5659 netdev_features_change(dev);
5660}
5661EXPORT_SYMBOL(netdev_change_features);
5662
5663/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005664 * netif_stacked_transfer_operstate - transfer operstate
5665 * @rootdev: the root or lower level device to transfer state from
5666 * @dev: the device to transfer operstate to
5667 *
5668 * Transfer operational state from root to device. This is normally
5669 * called when a stacking relationship exists between the root
 5670 *	device and the device (a leaf device).
5671 */
5672void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5673 struct net_device *dev)
5674{
5675 if (rootdev->operstate == IF_OPER_DORMANT)
5676 netif_dormant_on(dev);
5677 else
5678 netif_dormant_off(dev);
5679
5680 if (netif_carrier_ok(rootdev)) {
5681 if (!netif_carrier_ok(dev))
5682 netif_carrier_on(dev);
5683 } else {
5684 if (netif_carrier_ok(dev))
5685 netif_carrier_off(dev);
5686 }
5687}
5688EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5689
Tom Herbertbf264142010-11-26 08:36:09 +00005690#ifdef CONFIG_RPS
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005691static int netif_alloc_rx_queues(struct net_device *dev)
5692{
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005693 unsigned int i, count = dev->num_rx_queues;
Tom Herbertbd25fa72010-10-18 18:00:16 +00005694 struct netdev_rx_queue *rx;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005695
Tom Herbertbd25fa72010-10-18 18:00:16 +00005696 BUG_ON(count < 1);
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005697
Tom Herbertbd25fa72010-10-18 18:00:16 +00005698 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005699 if (!rx)
Tom Herbertbd25fa72010-10-18 18:00:16 +00005700 return -ENOMEM;
Joe Perches62b59422013-02-04 16:48:16 +00005701
Tom Herbertbd25fa72010-10-18 18:00:16 +00005702 dev->_rx = rx;
5703
Tom Herbertbd25fa72010-10-18 18:00:16 +00005704 for (i = 0; i < count; i++)
Tom Herbertfe822242010-11-09 10:47:38 +00005705 rx[i].dev = dev;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005706 return 0;
5707}
Tom Herbertbf264142010-11-26 08:36:09 +00005708#endif
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005709
Changli Gaoaa942102010-12-04 02:31:41 +00005710static void netdev_init_one_queue(struct net_device *dev,
5711 struct netdev_queue *queue, void *_unused)
5712{
5713 /* Initialize queue lock */
5714 spin_lock_init(&queue->_xmit_lock);
5715 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5716 queue->xmit_lock_owner = -1;
Changli Gaob236da62010-12-14 03:09:15 +00005717 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
Changli Gaoaa942102010-12-04 02:31:41 +00005718 queue->dev = dev;
Tom Herbert114cf582011-11-28 16:33:09 +00005719#ifdef CONFIG_BQL
5720 dql_init(&queue->dql, HZ);
5721#endif
Changli Gaoaa942102010-12-04 02:31:41 +00005722}
5723
Eric Dumazet60877a32013-06-20 01:15:51 -07005724static void netif_free_tx_queues(struct net_device *dev)
5725{
5726 if (is_vmalloc_addr(dev->_tx))
5727 vfree(dev->_tx);
5728 else
5729 kfree(dev->_tx);
5730}
5731
Tom Herberte6484932010-10-18 18:04:39 +00005732static int netif_alloc_netdev_queues(struct net_device *dev)
5733{
5734 unsigned int count = dev->num_tx_queues;
5735 struct netdev_queue *tx;
Eric Dumazet60877a32013-06-20 01:15:51 -07005736 size_t sz = count * sizeof(*tx);
Tom Herberte6484932010-10-18 18:04:39 +00005737
Eric Dumazet60877a32013-06-20 01:15:51 -07005738 BUG_ON(count < 1 || count > 0xffff);
Tom Herberte6484932010-10-18 18:04:39 +00005739
Eric Dumazet60877a32013-06-20 01:15:51 -07005740 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5741 if (!tx) {
5742 tx = vzalloc(sz);
5743 if (!tx)
5744 return -ENOMEM;
5745 }
Tom Herberte6484932010-10-18 18:04:39 +00005746 dev->_tx = tx;
Tom Herbert1d24eb42010-11-21 13:17:27 +00005747
Tom Herberte6484932010-10-18 18:04:39 +00005748 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5749 spin_lock_init(&dev->tx_global_lock);
Changli Gaoaa942102010-12-04 02:31:41 +00005750
5751 return 0;
Tom Herberte6484932010-10-18 18:04:39 +00005752}
5753
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005754/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005755 * register_netdevice - register a network device
5756 * @dev: device to register
5757 *
5758 * Take a completed network device structure and add it to the kernel
5759 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5760 * chain. 0 is returned on success. A negative errno code is returned
5761 * on a failure to set up the device, or if the name is a duplicate.
5762 *
5763 * Callers must hold the rtnl semaphore. You may want
5764 * register_netdev() instead of this.
5765 *
5766 * BUGS:
5767 * The locking appears insufficient to guarantee two parallel registers
5768 * will not get the same name.
5769 */
5770
5771int register_netdevice(struct net_device *dev)
5772{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005773 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005774 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005775
5776 BUG_ON(dev_boot_phase);
5777 ASSERT_RTNL();
5778
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005779 might_sleep();
5780
Linus Torvalds1da177e2005-04-16 15:20:36 -07005781 /* When net_device's are persistent, this will be fatal. */
5782 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005783 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005784
David S. Millerf1f28aa2008-07-15 00:08:33 -07005785 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07005786 netdev_set_addr_lockdep_class(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005787
Linus Torvalds1da177e2005-04-16 15:20:36 -07005788 dev->iflink = -1;
5789
Gao feng828de4f2012-09-13 20:58:27 +00005790 ret = dev_get_valid_name(net, dev, dev->name);
Peter Pan(潘卫平)0696c3a2011-05-12 15:46:56 +00005791 if (ret < 0)
5792 goto out;
5793
Linus Torvalds1da177e2005-04-16 15:20:36 -07005794 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005795 if (dev->netdev_ops->ndo_init) {
5796 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005797 if (ret) {
5798 if (ret > 0)
5799 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08005800 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005801 }
5802 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005803
Patrick McHardyf6469682013-04-19 02:04:27 +00005804 if (((dev->hw_features | dev->features) &
5805 NETIF_F_HW_VLAN_CTAG_FILTER) &&
Michał Mirosławd2ed2732013-01-29 15:14:16 +00005806 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5807 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5808 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5809 ret = -EINVAL;
5810 goto err_uninit;
5811 }
5812
Pavel Emelyanov9c7dafb2012-08-08 21:52:46 +00005813 ret = -EBUSY;
5814 if (!dev->ifindex)
5815 dev->ifindex = dev_new_index(net);
5816 else if (__dev_get_by_index(net, dev->ifindex))
5817 goto err_uninit;
5818
Linus Torvalds1da177e2005-04-16 15:20:36 -07005819 if (dev->iflink == -1)
5820 dev->iflink = dev->ifindex;
5821
Michał Mirosław5455c692011-02-15 16:59:17 +00005822 /* Transfer changeable features to wanted_features and enable
5823 * software offloads (GSO and GRO).
5824 */
5825 dev->hw_features |= NETIF_F_SOFT_FEATURES;
Michał Mirosław14d12322011-02-22 16:52:28 +00005826 dev->features |= NETIF_F_SOFT_FEATURES;
5827 dev->wanted_features = dev->features & dev->hw_features;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005828
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005829 /* Turn on no cache copy if HW is doing checksum */
Michał Mirosław34324dc2011-11-15 15:29:55 +00005830 if (!(dev->flags & IFF_LOOPBACK)) {
5831 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5832 if (dev->features & NETIF_F_ALL_CSUM) {
5833 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5834 dev->features |= NETIF_F_NOCACHE_COPY;
5835 }
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005836 }
5837
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005838 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
Brandon Philips16c3ea72010-09-15 09:24:24 +00005839 */
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005840 dev->vlan_features |= NETIF_F_HIGHDMA;
Brandon Philips16c3ea72010-09-15 09:24:24 +00005841
Pravin B Shelaree579672013-03-07 09:28:08 +00005842 /* Make NETIF_F_SG inheritable to tunnel devices.
5843 */
5844 dev->hw_enc_features |= NETIF_F_SG;
5845
Simon Horman0d89d202013-05-23 21:02:52 +00005846 /* Make NETIF_F_SG inheritable to MPLS.
5847 */
5848 dev->mpls_features |= NETIF_F_SG;
5849
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005850 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5851 ret = notifier_to_errno(ret);
5852 if (ret)
5853 goto err_uninit;
5854
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005855 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005856 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005857 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005858 dev->reg_state = NETREG_REGISTERED;
5859
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005860 __netdev_update_features(dev);
Michał Mirosław8e9b59b2011-02-22 16:52:28 +00005861
Linus Torvalds1da177e2005-04-16 15:20:36 -07005862 /*
5863 * Default initial state at registry is that the
5864 * device is present.
5865 */
5866
5867 set_bit(__LINK_STATE_PRESENT, &dev->state);
5868
Ben Hutchings8f4cccb2012-08-20 22:16:51 +01005869 linkwatch_init_dev(dev);
5870
Linus Torvalds1da177e2005-04-16 15:20:36 -07005871 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005872 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005873 list_netdevice(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04005874 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005875
Jiri Pirko948b3372013-01-08 01:38:25 +00005876 /* If the device has permanent device address, driver should
5877 * set dev_addr and also addr_assign_type should be set to
5878 * NET_ADDR_PERM (default value).
5879 */
5880 if (dev->addr_assign_type == NET_ADDR_PERM)
5881 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5882
Linus Torvalds1da177e2005-04-16 15:20:36 -07005883	/* Notify protocols that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005884 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005885 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005886 if (ret) {
5887 rollback_registered(dev);
5888 dev->reg_state = NETREG_UNREGISTERED;
5889 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005890 /*
5891 * Prevent userspace races by waiting until the network
5892 * device is fully setup before sending notifications.
5893 */
Patrick McHardya2835762010-02-26 06:34:51 +00005894 if (!dev->rtnl_link_ops ||
5895 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
Alexei Starovoitov7f294052013-10-23 16:02:42 -07005896 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005897
5898out:
5899 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005900
5901err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005902 if (dev->netdev_ops->ndo_uninit)
5903 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005904 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005905}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005906EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005907
5908/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005909 * init_dummy_netdev - init a dummy network device for NAPI
5910 * @dev: device to init
5911 *
5912 * This takes a network device structure and initialize the minimum
5913 * amount of fields so it can be used to schedule NAPI polls without
5914 * registering a full blown interface. This is to be used by drivers
5915 * that need to tie several hardware interfaces to a single NAPI
5916 * poll scheduler due to HW limitations.
5917 */
5918int init_dummy_netdev(struct net_device *dev)
5919{
5920 /* Clear everything. Note we don't initialize spinlocks
 5921	 * as they aren't supposed to be taken by any of the
5922 * NAPI code and this dummy netdev is supposed to be
5923 * only ever used for NAPI polls
5924 */
5925 memset(dev, 0, sizeof(struct net_device));
5926
5927 /* make sure we BUG if trying to hit standard
5928 * register/unregister code path
5929 */
5930 dev->reg_state = NETREG_DUMMY;
5931
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005932 /* NAPI wants this */
5933 INIT_LIST_HEAD(&dev->napi_list);
5934
5935 /* a dummy interface is started by default */
5936 set_bit(__LINK_STATE_PRESENT, &dev->state);
5937 set_bit(__LINK_STATE_START, &dev->state);
5938
Eric Dumazet29b44332010-10-11 10:22:12 +00005939	/* Note: We don't allocate pcpu_refcnt for dummy devices,
 5940	 * because users of this 'device' don't need to change
5941 * its refcount.
5942 */
5943
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005944 return 0;
5945}
5946EXPORT_SYMBOL_GPL(init_dummy_netdev);
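/*
 * Editor's note: a hedged example, not from the original source, of the
 * pattern init_dummy_netdev() exists for: a driver with one hardware
 * function but several queues can hang all its NAPI contexts off one
 * dummy netdev. The example_adapter structure and helper are hypothetical.
 *
 *	struct example_adapter {
 *		struct net_device napi_dev;
 *		struct napi_struct napi;
 *	};
 *
 *	static int example_adapter_init(struct example_adapter *ad,
 *					int (*poll)(struct napi_struct *, int))
 *	{
 *		init_dummy_netdev(&ad->napi_dev);
 *		netif_napi_add(&ad->napi_dev, &ad->napi, poll, 64);
 *		napi_enable(&ad->napi);
 *		return 0;
 *	}
 */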
5947
5948
5949/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005950 * register_netdev - register a network device
5951 * @dev: device to register
5952 *
5953 * Take a completed network device structure and add it to the kernel
5954 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5955 * chain. 0 is returned on success. A negative errno code is returned
5956 * on a failure to set up the device, or if the name is a duplicate.
5957 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005958 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005959 * and expands the device name if you passed a format string to
5960 * alloc_netdev.
5961 */
5962int register_netdev(struct net_device *dev)
5963{
5964 int err;
5965
5966 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005967 err = register_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005968 rtnl_unlock();
5969 return err;
5970}
5971EXPORT_SYMBOL(register_netdev);
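/*
 * Editor's note: a minimal probe-path sketch, not in the original file,
 * showing the usual pairing of allocation and registration. It assumes an
 * Ethernet device, alloc_etherdev() from <linux/etherdevice.h>, and a
 * hypothetical example_priv/example_setup_hw(); error handling is reduced
 * to the bare minimum.
 *
 *	static struct net_device *example_probe(void)
 *	{
 *		struct net_device *dev;
 *
 *		dev = alloc_etherdev(sizeof(struct example_priv));
 *		if (!dev)
 *			return NULL;
 *		example_setup_hw(dev);
 *		if (register_netdev(dev)) {
 *			free_netdev(dev);
 *			return NULL;
 *		}
 *		return dev;
 *	}
 */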
5972
Eric Dumazet29b44332010-10-11 10:22:12 +00005973int netdev_refcnt_read(const struct net_device *dev)
5974{
5975 int i, refcnt = 0;
5976
5977 for_each_possible_cpu(i)
5978 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5979 return refcnt;
5980}
5981EXPORT_SYMBOL(netdev_refcnt_read);
5982
Ben Hutchings2c530402012-07-10 10:55:09 +00005983/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005984 * netdev_wait_allrefs - wait until all references are gone.
Randy Dunlap3de7a372012-08-18 14:36:44 +00005985 * @dev: target net_device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005986 *
5987 * This is called when unregistering network devices.
5988 *
5989 * Any protocol or device that holds a reference should register
 5990 * for netdevice notification, and clean up and put back the
5991 * reference if they receive an UNREGISTER event.
5992 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005993 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005994 */
5995static void netdev_wait_allrefs(struct net_device *dev)
5996{
5997 unsigned long rebroadcast_time, warning_time;
Eric Dumazet29b44332010-10-11 10:22:12 +00005998 int refcnt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005999
Eric Dumazete014deb2009-11-17 05:59:21 +00006000 linkwatch_forget_dev(dev);
6001
Linus Torvalds1da177e2005-04-16 15:20:36 -07006002 rebroadcast_time = warning_time = jiffies;
Eric Dumazet29b44332010-10-11 10:22:12 +00006003 refcnt = netdev_refcnt_read(dev);
6004
6005 while (refcnt != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006006 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08006007 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006008
6009 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07006010 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006011
Eric Dumazet748e2d92012-08-22 21:50:59 +00006012 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00006013 rcu_barrier();
Eric Dumazet748e2d92012-08-22 21:50:59 +00006014 rtnl_lock();
6015
Eric Dumazet0115e8e2012-08-22 17:19:46 +00006016 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006017 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6018 &dev->state)) {
6019 /* We must not have linkwatch events
6020 * pending on unregister. If this
6021 * happens, we simply run the queue
6022 * unscheduled, resulting in a noop
6023 * for this device.
6024 */
6025 linkwatch_run_queue();
6026 }
6027
Stephen Hemminger6756ae42006-03-20 22:23:58 -08006028 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006029
6030 rebroadcast_time = jiffies;
6031 }
6032
6033 msleep(250);
6034
Eric Dumazet29b44332010-10-11 10:22:12 +00006035 refcnt = netdev_refcnt_read(dev);
6036
Linus Torvalds1da177e2005-04-16 15:20:36 -07006037 if (time_after(jiffies, warning_time + 10 * HZ)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00006038 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6039 dev->name, refcnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006040 warning_time = jiffies;
6041 }
6042 }
6043}
6044
6045/* The sequence is:
6046 *
6047 * rtnl_lock();
6048 * ...
6049 * register_netdevice(x1);
6050 * register_netdevice(x2);
6051 * ...
6052 * unregister_netdevice(y1);
6053 * unregister_netdevice(y2);
6054 * ...
6055 * rtnl_unlock();
6056 * free_netdev(y1);
6057 * free_netdev(y2);
6058 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07006059 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07006060 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07006061 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07006062 * without deadlocking with linkwatch via keventd.
6063 * 2) Since we run with the RTNL semaphore not held, we can sleep
6064 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07006065 *
6066 * We must not return until all unregister events added during
6067 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006068 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07006069void netdev_run_todo(void)
6070{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07006071 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006072
Linus Torvalds1da177e2005-04-16 15:20:36 -07006073 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07006074 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07006075
6076 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07006077
Eric Dumazet0115e8e2012-08-22 17:19:46 +00006078
6079 /* Wait for rcu callbacks to finish before next phase */
Eric W. Biederman850a5452011-10-13 22:25:23 +00006080 if (!list_empty(&list))
6081 rcu_barrier();
6082
Linus Torvalds1da177e2005-04-16 15:20:36 -07006083 while (!list_empty(&list)) {
6084 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00006085 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006086 list_del(&dev->todo_list);
6087
Eric Dumazet748e2d92012-08-22 21:50:59 +00006088 rtnl_lock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00006089 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric Dumazet748e2d92012-08-22 21:50:59 +00006090 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00006091
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07006092 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00006093 pr_err("network todo '%s' but state %d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07006094 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07006095 dump_stack();
6096 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006097 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07006098
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07006099 dev->reg_state = NETREG_UNREGISTERED;
6100
Changli Gao152102c2010-03-30 20:16:22 +00006101 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07006102
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07006103 netdev_wait_allrefs(dev);
6104
6105 /* paranoia */
Eric Dumazet29b44332010-10-11 10:22:12 +00006106 BUG_ON(netdev_refcnt_read(dev));
Eric Dumazet33d480c2011-08-11 19:30:52 +00006107 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6108 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07006109 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07006110
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07006111 if (dev->destructor)
6112 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07006113
Eric W. Biederman50624c92013-09-23 21:19:49 -07006114 /* Report a network device has been unregistered */
6115 rtnl_lock();
6116 dev_net(dev)->dev_unreg_count--;
6117 __rtnl_unlock();
6118 wake_up(&netdev_unregistering_wq);
6119
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07006120 /* Free network device */
6121 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006122 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006123}
6124
Ben Hutchings3cfde792010-07-09 09:11:52 +00006125/* Convert net_device_stats to rtnl_link_stats64. They have the same
6126 * fields in the same order, with only the type differing.
6127 */
Eric Dumazet77a1abf2012-03-05 04:50:09 +00006128void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6129 const struct net_device_stats *netdev_stats)
Ben Hutchings3cfde792010-07-09 09:11:52 +00006130{
6131#if BITS_PER_LONG == 64
Eric Dumazet77a1abf2012-03-05 04:50:09 +00006132 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6133 memcpy(stats64, netdev_stats, sizeof(*stats64));
Ben Hutchings3cfde792010-07-09 09:11:52 +00006134#else
6135 size_t i, n = sizeof(*stats64) / sizeof(u64);
6136 const unsigned long *src = (const unsigned long *)netdev_stats;
6137 u64 *dst = (u64 *)stats64;
6138
6139 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6140 sizeof(*stats64) / sizeof(u64));
6141 for (i = 0; i < n; i++)
6142 dst[i] = src[i];
6143#endif
6144}
Eric Dumazet77a1abf2012-03-05 04:50:09 +00006145EXPORT_SYMBOL(netdev_stats_to_stats64);
Ben Hutchings3cfde792010-07-09 09:11:52 +00006146
Eric Dumazetd83345a2009-11-16 03:36:51 +00006147/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08006148 * dev_get_stats - get network device statistics
6149 * @dev: device to get statistics from
Eric Dumazet28172732010-07-07 14:58:56 -07006150 * @storage: place to store stats
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08006151 *
Ben Hutchingsd7753512010-07-09 09:12:41 +00006152 * Get network statistics from device. Return @storage.
6153 * The device driver may provide its own method by setting
6154 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6155 * otherwise the internal statistics structure is used.
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08006156 */
Ben Hutchingsd7753512010-07-09 09:12:41 +00006157struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6158 struct rtnl_link_stats64 *storage)
Eric Dumazet7004bf22009-05-18 00:34:33 +00006159{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08006160 const struct net_device_ops *ops = dev->netdev_ops;
6161
Eric Dumazet28172732010-07-07 14:58:56 -07006162 if (ops->ndo_get_stats64) {
6163 memset(storage, 0, sizeof(*storage));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00006164 ops->ndo_get_stats64(dev, storage);
6165 } else if (ops->ndo_get_stats) {
Ben Hutchings3cfde792010-07-09 09:11:52 +00006166 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00006167 } else {
6168 netdev_stats_to_stats64(storage, &dev->stats);
Eric Dumazet28172732010-07-07 14:58:56 -07006169 }
Eric Dumazetcaf586e2010-09-30 21:06:55 +00006170 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
Eric Dumazet28172732010-07-07 14:58:56 -07006171 return storage;
Rusty Russellc45d2862007-03-28 14:29:08 -07006172}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08006173EXPORT_SYMBOL(dev_get_stats);
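/*
 * Editor's note: illustrative only, not part of the original source.
 * dev_get_stats() fills the caller-provided rtnl_link_stats64, so a
 * stack-allocated structure is enough; example_log_drops is hypothetical.
 *
 *	static void example_log_drops(struct net_device *dev)
 *	{
 *		struct rtnl_link_stats64 stats;
 *
 *		dev_get_stats(dev, &stats);
 *		netdev_info(dev, "rx_dropped=%llu tx_dropped=%llu\n",
 *			    stats.rx_dropped, stats.tx_dropped);
 *	}
 */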
Rusty Russellc45d2862007-03-28 14:29:08 -07006174
Eric Dumazet24824a02010-10-02 06:11:55 +00006175struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
David S. Millerdc2b4842008-07-08 17:18:23 -07006176{
Eric Dumazet24824a02010-10-02 06:11:55 +00006177 struct netdev_queue *queue = dev_ingress_queue(dev);
David S. Millerdc2b4842008-07-08 17:18:23 -07006178
Eric Dumazet24824a02010-10-02 06:11:55 +00006179#ifdef CONFIG_NET_CLS_ACT
6180 if (queue)
6181 return queue;
6182 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6183 if (!queue)
6184 return NULL;
6185 netdev_init_one_queue(dev, queue, NULL);
Eric Dumazet24824a02010-10-02 06:11:55 +00006186 queue->qdisc = &noop_qdisc;
6187 queue->qdisc_sleeping = &noop_qdisc;
6188 rcu_assign_pointer(dev->ingress_queue, queue);
6189#endif
6190 return queue;
David S. Millerbb949fb2008-07-08 16:55:56 -07006191}
6192
Eric Dumazet2c60db02012-09-16 09:17:26 +00006193static const struct ethtool_ops default_ethtool_ops;
6194
Stanislaw Gruszkad07d7502013-01-10 23:19:10 +00006195void netdev_set_default_ethtool_ops(struct net_device *dev,
6196 const struct ethtool_ops *ops)
6197{
6198 if (dev->ethtool_ops == &default_ethtool_ops)
6199 dev->ethtool_ops = ops;
6200}
6201EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
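/*
 * Editor's note: a hedged sketch, not in the original file, of the
 * helper's intended use: a bus or wrapper layer can install fallback
 * ethtool_ops without overriding ops a driver has already set.
 * example_default_ethtool_ops and example_attach are hypothetical.
 *
 *	static const struct ethtool_ops example_default_ethtool_ops = {
 *		.get_link	= ethtool_op_get_link,
 *	};
 *
 *	static void example_attach(struct net_device *dev)
 *	{
 *		netdev_set_default_ethtool_ops(dev,
 *					       &example_default_ethtool_ops);
 *	}
 */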
6202
Eric Dumazet74d332c2013-10-30 13:10:44 -07006203void netdev_freemem(struct net_device *dev)
6204{
6205 char *addr = (char *)dev - dev->padded;
6206
6207 if (is_vmalloc_addr(addr))
6208 vfree(addr);
6209 else
6210 kfree(addr);
6211}
6212
Linus Torvalds1da177e2005-04-16 15:20:36 -07006213/**
Tom Herbert36909ea2011-01-09 19:36:31 +00006214 * alloc_netdev_mqs - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07006215 * @sizeof_priv: size of private data to allocate space for
6216 * @name: device name format string
6217 * @setup: callback to initialize device
Tom Herbert36909ea2011-01-09 19:36:31 +00006218 * @txqs: the number of TX subqueues to allocate
6219 * @rxqs: the number of RX subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07006220 *
6221 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07006222 *	and performs basic initialization. Also allocates subqueue structs
Tom Herbert36909ea2011-01-09 19:36:31 +00006223 * for each queue on the device.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006224 */
Tom Herbert36909ea2011-01-09 19:36:31 +00006225struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6226 void (*setup)(struct net_device *),
6227 unsigned int txqs, unsigned int rxqs)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006228{
Linus Torvalds1da177e2005-04-16 15:20:36 -07006229 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07006230 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00006231 struct net_device *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006232
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07006233 BUG_ON(strlen(name) >= sizeof(dev->name));
6234
Tom Herbert36909ea2011-01-09 19:36:31 +00006235 if (txqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00006236 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
Tom Herbert55513fb2010-10-18 17:55:58 +00006237 return NULL;
6238 }
6239
Tom Herbert36909ea2011-01-09 19:36:31 +00006240#ifdef CONFIG_RPS
6241 if (rxqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00006242 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
Tom Herbert36909ea2011-01-09 19:36:31 +00006243 return NULL;
6244 }
6245#endif
6246
David S. Millerfd2ea0a2008-07-17 01:56:23 -07006247 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07006248 if (sizeof_priv) {
6249 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00006250 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07006251 alloc_size += sizeof_priv;
6252 }
6253 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00006254 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006255
Eric Dumazet74d332c2013-10-30 13:10:44 -07006256 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6257 if (!p)
6258 p = vzalloc(alloc_size);
Joe Perches62b59422013-02-04 16:48:16 +00006259 if (!p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006260 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006261
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00006262 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006263 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00006264
Eric Dumazet29b44332010-10-11 10:22:12 +00006265 dev->pcpu_refcnt = alloc_percpu(int);
6266 if (!dev->pcpu_refcnt)
Eric Dumazet74d332c2013-10-30 13:10:44 -07006267 goto free_dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00006268
Linus Torvalds1da177e2005-04-16 15:20:36 -07006269 if (dev_addr_init(dev))
Eric Dumazet29b44332010-10-11 10:22:12 +00006270 goto free_pcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006271
Jiri Pirko22bedad32010-04-01 21:22:57 +00006272 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00006273 dev_uc_init(dev);
Jiri Pirkoccffad252009-05-22 23:22:17 +00006274
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09006275 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006276
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07006277 dev->gso_max_size = GSO_MAX_SIZE;
Ben Hutchings30b678d2012-07-30 15:57:00 +00006278 dev->gso_max_segs = GSO_MAX_SEGS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006279
Herbert Xud565b0a2008-12-15 23:38:52 -08006280 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00006281 INIT_LIST_HEAD(&dev->unreg_list);
Eric W. Biederman5cde2822013-10-05 19:26:05 -07006282 INIT_LIST_HEAD(&dev->close_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00006283 INIT_LIST_HEAD(&dev->link_watch_list);
Veaceslav Falico2f268f12013-09-25 09:20:07 +02006284 INIT_LIST_HEAD(&dev->adj_list.upper);
6285 INIT_LIST_HEAD(&dev->adj_list.lower);
6286 INIT_LIST_HEAD(&dev->all_adj_list.upper);
6287 INIT_LIST_HEAD(&dev->all_adj_list.lower);
Eric Dumazet93f154b2009-05-18 22:19:19 -07006288 dev->priv_flags = IFF_XMIT_DST_RELEASE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006289 setup(dev);
David S. Miller8d3bdbd2011-02-08 15:02:50 -08006290
6291 dev->num_tx_queues = txqs;
6292 dev->real_num_tx_queues = txqs;
6293 if (netif_alloc_netdev_queues(dev))
6294 goto free_all;
6295
6296#ifdef CONFIG_RPS
6297 dev->num_rx_queues = rxqs;
6298 dev->real_num_rx_queues = rxqs;
6299 if (netif_alloc_rx_queues(dev))
6300 goto free_all;
6301#endif
6302
Linus Torvalds1da177e2005-04-16 15:20:36 -07006303 strcpy(dev->name, name);
Vlad Dogarucbda10f2011-01-13 23:38:30 +00006304 dev->group = INIT_NETDEV_GROUP;
Eric Dumazet2c60db02012-09-16 09:17:26 +00006305 if (!dev->ethtool_ops)
6306 dev->ethtool_ops = &default_ethtool_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006307 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00006308
David S. Miller8d3bdbd2011-02-08 15:02:50 -08006309free_all:
6310 free_netdev(dev);
6311 return NULL;
6312
Eric Dumazet29b44332010-10-11 10:22:12 +00006313free_pcpu:
6314 free_percpu(dev->pcpu_refcnt);
Eric Dumazet60877a32013-06-20 01:15:51 -07006315 netif_free_tx_queues(dev);
Tom Herbertfe822242010-11-09 10:47:38 +00006316#ifdef CONFIG_RPS
6317 kfree(dev->_rx);
6318#endif
6319
Eric Dumazet74d332c2013-10-30 13:10:44 -07006320free_dev:
6321 netdev_freemem(dev);
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00006322 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006323}
Tom Herbert36909ea2011-01-09 19:36:31 +00006324EXPORT_SYMBOL(alloc_netdev_mqs);
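
/*
 * Illustrative sketch (not from the original file): allocating a device with
 * four TX and four RX queues.  "example_setup", "struct example_priv" and the
 * "ex%d" name template are hypothetical; a real Ethernet driver would
 * typically pass ether_setup() and its own queue counts.
 */
struct example_priv {
	int link_up;
};

static void example_setup(struct net_device *dev)
{
	dev->flags |= IFF_NOARP;
}

static struct net_device * __maybe_unused example_alloc(void)
{
	struct net_device *dev;
	struct example_priv *priv;

	dev = alloc_netdev_mqs(sizeof(struct example_priv), "ex%d",
			       example_setup, 4, 4);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);
	priv->link_up = 0;	/* private area is already zeroed by the allocator */
	return dev;
}
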
Linus Torvalds1da177e2005-04-16 15:20:36 -07006325
6326/**
6327 * free_netdev - free network device
6328 * @dev: device
6329 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09006330 * This function does the last stage of destroying an allocated device
6331 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006332 * If this is the last reference then it will be freed.
6333 */
6334void free_netdev(struct net_device *dev)
6335{
Herbert Xud565b0a2008-12-15 23:38:52 -08006336 struct napi_struct *p, *n;
6337
Denis V. Lunevf3005d72008-04-16 02:02:18 -07006338 release_net(dev_net(dev));
6339
Eric Dumazet60877a32013-06-20 01:15:51 -07006340 netif_free_tx_queues(dev);
Tom Herbertfe822242010-11-09 10:47:38 +00006341#ifdef CONFIG_RPS
6342 kfree(dev->_rx);
6343#endif
David S. Millere8a04642008-07-17 00:34:19 -07006344
Eric Dumazet33d480c2011-08-11 19:30:52 +00006345 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
Eric Dumazet24824a02010-10-02 06:11:55 +00006346
Jiri Pirkof001fde2009-05-05 02:48:28 +00006347 /* Flush device addresses */
6348 dev_addr_flush(dev);
6349
Herbert Xud565b0a2008-12-15 23:38:52 -08006350 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6351 netif_napi_del(p);
6352
Eric Dumazet29b44332010-10-11 10:22:12 +00006353 free_percpu(dev->pcpu_refcnt);
6354 dev->pcpu_refcnt = NULL;
6355
Stephen Hemminger3041a062006-05-26 13:25:24 -07006356 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07006357 if (dev->reg_state == NETREG_UNINITIALIZED) {
Eric Dumazet74d332c2013-10-30 13:10:44 -07006358 netdev_freemem(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006359 return;
6360 }
6361
6362 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6363 dev->reg_state = NETREG_RELEASED;
6364
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07006365 /* will free via device release */
6366 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006367}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006368EXPORT_SYMBOL(free_netdev);
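
/*
 * Illustrative sketch (not from the original file): the usual probe-time
 * error unwinding.  If register_netdev() fails, the allocation obtained from
 * alloc_netdev_mqs() (or one of its wrappers) is released with free_netdev().
 * "example_register_or_free" is a hypothetical helper name.
 */
static int __maybe_unused example_register_or_free(struct net_device *dev)
{
	int err;

	err = register_netdev(dev);
	if (err) {
		/* the device never registered successfully; just free it */
		free_netdev(dev);
		return err;
	}
	return 0;
}
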
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09006369
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006370/**
6371 * synchronize_net - Synchronize with packet receive processing
6372 *
6373 * Wait for packets currently being received to be done.
6374 * Does not block later packets from starting.
6375 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09006376void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006377{
6378 might_sleep();
Eric Dumazetbe3fc412011-05-23 23:07:32 +00006379 if (rtnl_is_locked())
6380 synchronize_rcu_expedited();
6381 else
6382 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006383}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006384EXPORT_SYMBOL(synchronize_net);
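
/*
 * Illustrative sketch (not from the original file): a common use of
 * synchronize_net() is to unpublish an RCU-protected pointer and wait for
 * in-flight receive processing to finish before freeing the old object.
 * "example_handler" and "example_handler_ptr" are hypothetical.
 */
struct example_handler {
	int id;
};

static struct example_handler __rcu *example_handler_ptr;

static void __maybe_unused example_handler_remove(void)
{
	struct example_handler *old;

	/* the caller is assumed to provide mutual exclusion against writers */
	old = rcu_dereference_protected(example_handler_ptr, 1);
	RCU_INIT_POINTER(example_handler_ptr, NULL);

	synchronize_net();	/* no packet path can still see "old" */
	kfree(old);
}
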
Linus Torvalds1da177e2005-04-16 15:20:36 -07006385
6386/**
Eric Dumazet44a08732009-10-27 07:03:04 +00006387 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07006388 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00006389 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08006390 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07006391 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08006392 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00006393 * If @head is not NULL, the device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006394 *
6395 * Callers must hold the rtnl semaphore. You may want
6396 * unregister_netdev() instead of this.
6397 */
6398
Eric Dumazet44a08732009-10-27 07:03:04 +00006399void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006400{
Herbert Xua6620712007-12-12 19:21:56 -08006401 ASSERT_RTNL();
6402
Eric Dumazet44a08732009-10-27 07:03:04 +00006403 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00006404 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00006405 } else {
6406 rollback_registered(dev);
6407 /* Finish processing unregister after unlock */
6408 net_set_todo(dev);
6409 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006410}
Eric Dumazet44a08732009-10-27 07:03:04 +00006411EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006412
6413/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006414 * unregister_netdevice_many - unregister many devices
6415 * @head: list of devices
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006416 */
6417void unregister_netdevice_many(struct list_head *head)
6418{
6419 struct net_device *dev;
6420
6421 if (!list_empty(head)) {
6422 rollback_registered_many(head);
6423 list_for_each_entry(dev, head, unreg_list)
6424 net_set_todo(dev);
6425 }
6426}
Eric Dumazet63c80992009-10-27 07:06:49 +00006427EXPORT_SYMBOL(unregister_netdevice_many);
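
/*
 * Illustrative sketch (not from the original file): batching several
 * unregistrations under a single RTNL hold, mirroring what
 * default_device_exit_batch() further down in this file does.
 * "example_devs" is a hypothetical caller-owned array of devices.
 */
static void __maybe_unused example_unregister_batch(struct net_device **example_devs,
						    int count)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < count; i++)
		unregister_netdevice_queue(example_devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	list_del(&kill_list);	/* detach the on-stack list head before it goes out of scope */
	rtnl_unlock();
}
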
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006428
6429/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07006430 * unregister_netdev - remove device from the kernel
6431 * @dev: device
6432 *
6433 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08006434 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006435 *
6436 * This is just a wrapper for unregister_netdevice that takes
6437 * the rtnl semaphore. In general you want to use this and not
6438 * unregister_netdevice.
6439 */
6440void unregister_netdev(struct net_device *dev)
6441{
6442 rtnl_lock();
6443 unregister_netdevice(dev);
6444 rtnl_unlock();
6445}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006446EXPORT_SYMBOL(unregister_netdev);
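
/*
 * Illustrative sketch (not from the original file): the typical module
 * unload sequence for a driver that owns a single device.  "example_dev"
 * is a hypothetical module-global that was set up at probe time.
 */
static struct net_device *example_dev;

static void __maybe_unused example_module_cleanup(void)
{
	unregister_netdev(example_dev);	/* takes and releases the rtnl lock itself */
	free_netdev(example_dev);
	example_dev = NULL;
}
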
6447
Eric W. Biedermance286d32007-09-12 13:53:49 +02006448/**
6449 * dev_change_net_namespace - move device to different nethost namespace
6450 * @dev: device
6451 * @net: network namespace
6452 * @pat: If not NULL name pattern to try if the current device name
6453 * is already taken in the destination network namespace.
6454 *
6455 * This function shuts down a device interface and moves it
6456 * to a new network namespace. On success 0 is returned, on
6457 * a failure a negative errno code is returned.
6458 *
6459 * Callers must hold the rtnl semaphore.
6460 */
6461
6462int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6463{
Eric W. Biedermance286d32007-09-12 13:53:49 +02006464 int err;
6465
6466 ASSERT_RTNL();
6467
6468 /* Don't allow namespace local devices to be moved. */
6469 err = -EINVAL;
6470 if (dev->features & NETIF_F_NETNS_LOCAL)
6471 goto out;
6472
6473 /* Ensure the device has been registered */
Eric W. Biedermance286d32007-09-12 13:53:49 +02006474 if (dev->reg_state != NETREG_REGISTERED)
6475 goto out;
6476
6477 /* Get out if there is nothing to do */
6478 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09006479 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02006480 goto out;
6481
6482 /* Pick the destination device name, and ensure
6483 * we can use it in the destination network namespace.
6484 */
6485 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00006486 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006487 /* We get here if we can't use the current device name */
6488 if (!pat)
6489 goto out;
Gao feng828de4f2012-09-13 20:58:27 +00006490 if (dev_get_valid_name(net, dev, pat) < 0)
Eric W. Biedermance286d32007-09-12 13:53:49 +02006491 goto out;
6492 }
6493
6494 /*
6495 * And now a mini version of register_netdevice and unregister_netdevice.
6496 */
6497
6498 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07006499 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006500
6501 /* And unlink it from device chain */
6502 err = -ENODEV;
6503 unlist_netdevice(dev);
6504
6505 synchronize_net();
6506
6507 /* Shutdown queueing discipline. */
6508 dev_shutdown(dev);
6509
6510 /* Notify protocols that we are about to destroy
6511 this device. They should clean up all of their state.
David Lamparter3b27e102010-09-17 03:22:19 +00006512
6513 Note that dev->reg_state stays at NETREG_REGISTERED.
6514 This is wanted because this way 8021q and macvlan know
6515 the device is just moving and can keep their slaves up.
Eric W. Biedermance286d32007-09-12 13:53:49 +02006516 */
6517 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Gao feng6549dd42012-08-23 15:36:55 +00006518 rcu_barrier();
6519 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Alexei Starovoitov7f294052013-10-23 16:02:42 -07006520 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006521
6522 /*
6523 * Flush the unicast and multicast chains
6524 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00006525 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00006526 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006527
Serge Hallyn4e66ae22012-12-03 16:17:12 +00006528 /* Send a netdev-removed uevent to the old namespace */
6529 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6530
Eric W. Biedermance286d32007-09-12 13:53:49 +02006531 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09006532 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006533
Eric W. Biedermance286d32007-09-12 13:53:49 +02006534 /* If there is an ifindex conflict assign a new one */
6535 if (__dev_get_by_index(net, dev->ifindex)) {
6536 int iflink = (dev->iflink == dev->ifindex);
6537 dev->ifindex = dev_new_index(net);
6538 if (iflink)
6539 dev->iflink = dev->ifindex;
6540 }
6541
Serge Hallyn4e66ae22012-12-03 16:17:12 +00006542 /* Send a netdev-add uevent to the new namespace */
6543 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6544
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006545 /* Fixup kobjects */
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07006546 err = device_rename(&dev->dev, dev->name);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006547 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006548
6549 /* Add the device back in the hashes */
6550 list_netdevice(dev);
6551
6552 /* Notify protocols, that a new device appeared. */
6553 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6554
Eric W. Biedermand90a9092009-12-12 22:11:15 +00006555 /*
6556 * Prevent userspace races by waiting until the network
6557 * device is fully set up before sending notifications.
6558 */
Alexei Starovoitov7f294052013-10-23 16:02:42 -07006559 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
Eric W. Biedermand90a9092009-12-12 22:11:15 +00006560
Eric W. Biedermance286d32007-09-12 13:53:49 +02006561 synchronize_net();
6562 err = 0;
6563out:
6564 return err;
6565}
Johannes Berg463d0182009-07-14 00:33:35 +02006566EXPORT_SYMBOL_GPL(dev_change_net_namespace);
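
/*
 * Illustrative sketch (not from the original file): moving a device into the
 * namespace of a given process, roughly what rtnetlink does when handling
 * IFLA_NET_NS_PID.  "example_move_to_pid" is a hypothetical helper.
 */
static int __maybe_unused example_move_to_pid(struct net_device *dev, pid_t pid)
{
	struct net *net;
	int err;

	net = get_net_ns_by_pid(pid);
	if (IS_ERR(net))
		return PTR_ERR(net);

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "eth%d");
	rtnl_unlock();

	put_net(net);
	return err;
}
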
Eric W. Biedermance286d32007-09-12 13:53:49 +02006567
Linus Torvalds1da177e2005-04-16 15:20:36 -07006568static int dev_cpu_callback(struct notifier_block *nfb,
6569 unsigned long action,
6570 void *ocpu)
6571{
6572 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006573 struct sk_buff *skb;
6574 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6575 struct softnet_data *sd, *oldsd;
6576
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006577 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006578 return NOTIFY_OK;
6579
6580 local_irq_disable();
6581 cpu = smp_processor_id();
6582 sd = &per_cpu(softnet_data, cpu);
6583 oldsd = &per_cpu(softnet_data, oldcpu);
6584
6585 /* Find end of our completion_queue. */
6586 list_skb = &sd->completion_queue;
6587 while (*list_skb)
6588 list_skb = &(*list_skb)->next;
6589 /* Append completion queue from offline CPU. */
6590 *list_skb = oldsd->completion_queue;
6591 oldsd->completion_queue = NULL;
6592
Linus Torvalds1da177e2005-04-16 15:20:36 -07006593 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00006594 if (oldsd->output_queue) {
6595 *sd->output_queue_tailp = oldsd->output_queue;
6596 sd->output_queue_tailp = oldsd->output_queue_tailp;
6597 oldsd->output_queue = NULL;
6598 oldsd->output_queue_tailp = &oldsd->output_queue;
6599 }
Heiko Carstens264524d2011-06-06 20:50:03 +00006600 /* Append NAPI poll list from offline CPU. */
6601 if (!list_empty(&oldsd->poll_list)) {
6602 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6603 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6604 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006605
6606 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6607 local_irq_enable();
6608
6609 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00006610 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6611 netif_rx(skb);
6612 input_queue_head_incr(oldsd);
6613 }
Tom Herbertfec5e652010-04-16 16:01:27 -07006614 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006615 netif_rx(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00006616 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07006617 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006618
6619 return NOTIFY_OK;
6620}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006621
6622
Herbert Xu7f353bf2007-08-10 15:47:58 -07006623/**
Herbert Xub63365a2008-10-23 01:11:29 -07006624 * netdev_increment_features - increment feature set by one
6625 * @all: current feature set
6626 * @one: new feature set
6627 * @mask: mask feature set
Herbert Xu7f353bf2007-08-10 15:47:58 -07006628 *
6629 * Computes a new feature set after adding a device with feature set
Herbert Xub63365a2008-10-23 01:11:29 -07006630 * @one to the master device with current feature set @all. Will not
6631 * enable anything that is off in @mask. Returns the new feature set.
Herbert Xu7f353bf2007-08-10 15:47:58 -07006632 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00006633netdev_features_t netdev_increment_features(netdev_features_t all,
6634 netdev_features_t one, netdev_features_t mask)
Herbert Xu7f353bf2007-08-10 15:47:58 -07006635{
Michał Mirosław1742f182011-04-22 06:31:16 +00006636 if (mask & NETIF_F_GEN_CSUM)
6637 mask |= NETIF_F_ALL_CSUM;
6638 mask |= NETIF_F_VLAN_CHALLENGED;
6639
6640 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6641 all &= one | ~NETIF_F_ALL_FOR_ALL;
6642
Michał Mirosław1742f182011-04-22 06:31:16 +00006643 /* If one device supports hw checksumming, set for all. */
6644 if (all & NETIF_F_GEN_CSUM)
6645 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
Herbert Xu7f353bf2007-08-10 15:47:58 -07006646
6647 return all;
6648}
Herbert Xub63365a2008-10-23 01:11:29 -07006649EXPORT_SYMBOL(netdev_increment_features);
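
/*
 * Illustrative sketch (not from the original file): how an aggregating driver
 * (bond/team style) might fold the feature sets of its slaves together.
 * "example_slaves" and the candidate mask are hypothetical driver state
 * chosen only for this example.
 */
static netdev_features_t __maybe_unused
example_compute_features(struct net_device **example_slaves, int num_slaves)
{
	const netdev_features_t candidate = NETIF_F_SG | NETIF_F_HIGHDMA |
					    NETIF_F_HW_CSUM;
	netdev_features_t features = candidate;
	int i;

	for (i = 0; i < num_slaves; i++)
		features = netdev_increment_features(features,
						     example_slaves[i]->features,
						     candidate);
	return features;
}
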
Herbert Xu7f353bf2007-08-10 15:47:58 -07006650
Baruch Siach430f03c2013-06-02 20:43:55 +00006651static struct hlist_head * __net_init netdev_create_hash(void)
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006652{
6653 int i;
6654 struct hlist_head *hash;
6655
6656 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6657 if (hash != NULL)
6658 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6659 INIT_HLIST_HEAD(&hash[i]);
6660
6661 return hash;
6662}
6663
Eric W. Biederman881d9662007-09-17 11:56:21 -07006664/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07006665static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006666{
Rustad, Mark D734b6542012-07-18 09:06:07 +00006667 if (net != &init_net)
6668 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07006669
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006670 net->dev_name_head = netdev_create_hash();
6671 if (net->dev_name_head == NULL)
6672 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006673
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006674 net->dev_index_head = netdev_create_hash();
6675 if (net->dev_index_head == NULL)
6676 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006677
6678 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006679
6680err_idx:
6681 kfree(net->dev_name_head);
6682err_name:
6683 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006684}
6685
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006686/**
6687 * netdev_drivername - network driver for the device
6688 * @dev: network device
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006689 *
6690 * Determine network driver for device.
6691 */
David S. Miller3019de12011-06-06 16:41:33 -07006692const char *netdev_drivername(const struct net_device *dev)
Arjan van de Ven6579e572008-07-21 13:31:48 -07006693{
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07006694 const struct device_driver *driver;
6695 const struct device *parent;
David S. Miller3019de12011-06-06 16:41:33 -07006696 const char *empty = "";
Arjan van de Ven6579e572008-07-21 13:31:48 -07006697
6698 parent = dev->dev.parent;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006699 if (!parent)
David S. Miller3019de12011-06-06 16:41:33 -07006700 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006701
6702 driver = parent->driver;
6703 if (driver && driver->name)
David S. Miller3019de12011-06-06 16:41:33 -07006704 return driver->name;
6705 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006706}
6707
Joe Perchesb004ff42012-09-12 20:12:19 -07006708static int __netdev_printk(const char *level, const struct net_device *dev,
Joe Perches256df2f2010-06-27 01:02:35 +00006709 struct va_format *vaf)
6710{
6711 int r;
6712
Joe Perchesb004ff42012-09-12 20:12:19 -07006713 if (dev && dev->dev.parent) {
Joe Perches666f3552012-09-12 20:14:11 -07006714 r = dev_printk_emit(level[1] - '0',
6715 dev->dev.parent,
6716 "%s %s %s: %pV",
6717 dev_driver_string(dev->dev.parent),
6718 dev_name(dev->dev.parent),
6719 netdev_name(dev), vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006720 } else if (dev) {
Joe Perches256df2f2010-06-27 01:02:35 +00006721 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006722 } else {
Joe Perches256df2f2010-06-27 01:02:35 +00006723 r = printk("%s(NULL net_device): %pV", level, vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006724 }
Joe Perches256df2f2010-06-27 01:02:35 +00006725
6726 return r;
6727}
6728
6729int netdev_printk(const char *level, const struct net_device *dev,
6730 const char *format, ...)
6731{
6732 struct va_format vaf;
6733 va_list args;
6734 int r;
6735
6736 va_start(args, format);
6737
6738 vaf.fmt = format;
6739 vaf.va = &args;
6740
6741 r = __netdev_printk(level, dev, &vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006742
Joe Perches256df2f2010-06-27 01:02:35 +00006743 va_end(args);
6744
6745 return r;
6746}
6747EXPORT_SYMBOL(netdev_printk);
6748
6749#define define_netdev_printk_level(func, level) \
6750int func(const struct net_device *dev, const char *fmt, ...) \
6751{ \
6752 int r; \
6753 struct va_format vaf; \
6754 va_list args; \
6755 \
6756 va_start(args, fmt); \
6757 \
6758 vaf.fmt = fmt; \
6759 vaf.va = &args; \
6760 \
6761 r = __netdev_printk(level, dev, &vaf); \
Joe Perchesb004ff42012-09-12 20:12:19 -07006762 \
Joe Perches256df2f2010-06-27 01:02:35 +00006763 va_end(args); \
6764 \
6765 return r; \
6766} \
6767EXPORT_SYMBOL(func);
6768
6769define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6770define_netdev_printk_level(netdev_alert, KERN_ALERT);
6771define_netdev_printk_level(netdev_crit, KERN_CRIT);
6772define_netdev_printk_level(netdev_err, KERN_ERR);
6773define_netdev_printk_level(netdev_warn, KERN_WARNING);
6774define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6775define_netdev_printk_level(netdev_info, KERN_INFO);
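
/*
 * Illustrative sketch (not from the original file): driver-side use of the
 * per-level helpers defined above.  The message text and status value are
 * made up for the example.
 */
static void __maybe_unused example_report_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down, status %#x\n", 0xdead);
}
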
6776
Pavel Emelyanov46650792007-10-08 20:38:39 -07006777static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006778{
6779 kfree(net->dev_name_head);
6780 kfree(net->dev_index_head);
6781}
6782
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006783static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07006784 .init = netdev_init,
6785 .exit = netdev_exit,
6786};
6787
Pavel Emelyanov46650792007-10-08 20:38:39 -07006788static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02006789{
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006790 struct net_device *dev, *aux;
Eric W. Biedermance286d32007-09-12 13:53:49 +02006791 /*
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006792 * Push all migratable network devices back to the
Eric W. Biedermance286d32007-09-12 13:53:49 +02006793 * initial network namespace
6794 */
6795 rtnl_lock();
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006796 for_each_netdev_safe(net, dev, aux) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006797 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006798 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02006799
6800 /* Ignore unmovable devices (e.g. loopback) */
6801 if (dev->features & NETIF_F_NETNS_LOCAL)
6802 continue;
6803
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006804 /* Leave virtual devices for the generic cleanup */
6805 if (dev->rtnl_link_ops)
6806 continue;
Eric W. Biedermand0c082c2008-11-05 15:59:38 -08006807
Lucas De Marchi25985ed2011-03-30 22:57:33 -03006808 /* Push remaining network devices to init_net */
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006809 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6810 err = dev_change_net_namespace(dev, &init_net, fb_name);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006811 if (err) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00006812 pr_emerg("%s: failed to move %s to init_net: %d\n",
6813 __func__, dev->name, err);
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006814 BUG();
Eric W. Biedermance286d32007-09-12 13:53:49 +02006815 }
6816 }
6817 rtnl_unlock();
6818}
6819
Eric W. Biederman50624c92013-09-23 21:19:49 -07006820static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
6821{
6822 /* Return with the rtnl_lock held when there are no network
6823 * devices unregistering in any network namespace in net_list.
6824 */
6825 struct net *net;
6826 bool unregistering;
6827 DEFINE_WAIT(wait);
6828
6829 for (;;) {
6830 prepare_to_wait(&netdev_unregistering_wq, &wait,
6831 TASK_UNINTERRUPTIBLE);
6832 unregistering = false;
6833 rtnl_lock();
6834 list_for_each_entry(net, net_list, exit_list) {
6835 if (net->dev_unreg_count > 0) {
6836 unregistering = true;
6837 break;
6838 }
6839 }
6840 if (!unregistering)
6841 break;
6842 __rtnl_unlock();
6843 schedule();
6844 }
6845 finish_wait(&netdev_unregistering_wq, &wait);
6846}
6847
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006848static void __net_exit default_device_exit_batch(struct list_head *net_list)
6849{
6850 /* At exit, all network devices must be removed from a network
Uwe Kleine-Königb5950762010-11-01 15:38:34 -04006851 * namespace. Do this in the reverse order of registration.
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006852 * Do this across as many network namespaces as possible to
6853 * improve batching efficiency.
6854 */
6855 struct net_device *dev;
6856 struct net *net;
6857 LIST_HEAD(dev_kill_list);
6858
Eric W. Biederman50624c92013-09-23 21:19:49 -07006859 /* To prevent network device cleanup code from dereferencing
6860 * loopback devices or network devices that have been freed
6861 * wait here for all pending unregistrations to complete,
6862 * before unregistering the loopback device and allowing the
6863 * network namespace to be freed.
6864 *
6865 * The netdev todo list containing all network device
6866 * unregistrations that happen in default_device_exit_batch
6867 * will run in the rtnl_unlock() at the end of
6868 * default_device_exit_batch.
6869 */
6870 rtnl_lock_unregistering(net_list);
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006871 list_for_each_entry(net, net_list, exit_list) {
6872 for_each_netdev_reverse(net, dev) {
6873 if (dev->rtnl_link_ops)
6874 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6875 else
6876 unregister_netdevice_queue(dev, &dev_kill_list);
6877 }
6878 }
6879 unregister_netdevice_many(&dev_kill_list);
Eric Dumazetceaaec92011-02-17 22:59:19 +00006880 list_del(&dev_kill_list);
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006881 rtnl_unlock();
6882}
6883
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006884static struct pernet_operations __net_initdata default_device_ops = {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006885 .exit = default_device_exit,
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006886 .exit_batch = default_device_exit_batch,
Eric W. Biedermance286d32007-09-12 13:53:49 +02006887};
6888
Linus Torvalds1da177e2005-04-16 15:20:36 -07006889/*
6890 * Initialize the DEV module. At boot time this walks the device list and
6891 * unhooks any devices that fail to initialise (normally hardware not
6892 * present) and leaves us with a valid list of present and active devices.
6893 *
6894 */
6895
6896/*
6897 * This is called single threaded during boot, so no need
6898 * to take the rtnl semaphore.
6899 */
6900static int __init net_dev_init(void)
6901{
6902 int i, rc = -ENOMEM;
6903
6904 BUG_ON(!dev_boot_phase);
6905
Linus Torvalds1da177e2005-04-16 15:20:36 -07006906 if (dev_proc_init())
6907 goto out;
6908
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006909 if (netdev_kobject_init())
Linus Torvalds1da177e2005-04-16 15:20:36 -07006910 goto out;
6911
6912 INIT_LIST_HEAD(&ptype_all);
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08006913 for (i = 0; i < PTYPE_HASH_SIZE; i++)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006914 INIT_LIST_HEAD(&ptype_base[i]);
6915
Vlad Yasevich62532da2012-11-15 08:49:10 +00006916 INIT_LIST_HEAD(&offload_base);
6917
Eric W. Biederman881d9662007-09-17 11:56:21 -07006918 if (register_pernet_subsys(&netdev_net_ops))
6919 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006920
6921 /*
6922 * Initialise the packet receive queues.
6923 */
6924
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07006925 for_each_possible_cpu(i) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006926 struct softnet_data *sd = &per_cpu(softnet_data, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006927
Changli Gaodee42872010-05-02 05:42:16 +00006928 memset(sd, 0, sizeof(*sd));
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006929 skb_queue_head_init(&sd->input_pkt_queue);
Changli Gao6e7676c2010-04-27 15:07:33 -07006930 skb_queue_head_init(&sd->process_queue);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006931 sd->completion_queue = NULL;
6932 INIT_LIST_HEAD(&sd->poll_list);
Changli Gaoa9cbd582010-04-26 23:06:24 +00006933 sd->output_queue = NULL;
6934 sd->output_queue_tailp = &sd->output_queue;
Eric Dumazetdf334542010-03-24 19:13:54 +00006935#ifdef CONFIG_RPS
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006936 sd->csd.func = rps_trigger_softirq;
6937 sd->csd.info = sd;
6938 sd->csd.flags = 0;
6939 sd->cpu = i;
Tom Herbert1e94d722010-03-18 17:45:44 -07006940#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00006941
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006942 sd->backlog.poll = process_backlog;
6943 sd->backlog.weight = weight_p;
6944 sd->backlog.gro_list = NULL;
6945 sd->backlog.gro_count = 0;
Willem de Bruijn99bbc702013-05-20 04:02:32 +00006946
6947#ifdef CONFIG_NET_FLOW_LIMIT
6948 sd->flow_limit = NULL;
6949#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006950 }
6951
Linus Torvalds1da177e2005-04-16 15:20:36 -07006952 dev_boot_phase = 0;
6953
Eric W. Biederman505d4f72008-11-07 22:54:20 -08006954 /* The loopback device is special: if any other network device
6955 * is present in a network namespace, the loopback device must
6956 * be present too. Since we now dynamically allocate and free
6957 * the loopback device, ensure this invariant is maintained by
6958 * keeping the loopback device as the first device on the
6959 * list of network devices, so that it is the first device
6960 * that appears and the last network device that
6961 * disappears.
6962 */
6963 if (register_pernet_device(&loopback_net_ops))
6964 goto out;
6965
6966 if (register_pernet_device(&default_device_ops))
6967 goto out;
6968
Carlos R. Mafra962cf362008-05-15 11:15:37 -03006969 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6970 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006971
6972 hotcpu_notifier(dev_cpu_callback, 0);
6973 dst_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006974 rc = 0;
6975out:
6976 return rc;
6977}
6978
6979subsys_initcall(net_dev_init);