/*
 * 	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 * 		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain :	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
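
/*
 * Illustrative sketch (not part of this file's logic): the two access
 * patterns the comment above describes.  A pure reader can either take
 * dev_base_lock for reading or rely on RCU; a writer must hold the rtnl
 * semaphore and take dev_base_lock for writing around the list update.
 * The helper names below are hypothetical.
 *
 *	// RCU reader, refcount untouched while inside the read side:
 *	rcu_read_lock();
 *	for_each_netdev_rcu(net, dev)
 *		inspect_dev_readonly(dev);	// hypothetical helper
 *	rcu_read_unlock();
 *
 *	// Writer (compare list_netdevice() below):
 *	ASSERT_RTNL();
 *	write_lock_bh(&dev_base_lock);
 *	...update the list...
 *	write_unlock_bh(&dev_base_lock);
 */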

seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect an RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in the hash buckets, and checks of protocol handlers
 *	MUST start from the promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles packets is
 *	first on the list, it is not able to sense that the packet is
 *	cloned and should be copied-on-write, so it will change it and
 *	subsequent readers will get a broken packet.
 *	--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that all
 *	CPUs that are in the middle of receiving packets will see the new
 *	packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
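
/*
 * Illustrative sketch (not part of this file): how a module typically
 * registers a tap with dev_add_pack() and tears it down again.  The
 * handler and variable names are hypothetical; only the .type and .func
 * members of struct packet_type are used here.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		// consume our copy of every frame
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type = htons(ETH_P_ALL),	// all protocols -> ptype_all chain
 *		.func = my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);		// in module init
 *	...
 *	dev_remove_pack(&my_tap);	// in module exit; may sleep
 */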

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that all
 *	CPUs that are in the middle of receiving packets will see the new
 *	offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;

	spin_lock(&offload_lock);
	list_add_rcu(&po->list, head);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
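
/*
 * Illustrative sketch (not part of this file): a protocol module would
 * register GRO callbacks for its ethertype roughly like this.  The
 * callback names are hypothetical and their signatures come from
 * struct offload_callbacks; only the fields shown are assumed.
 *
 *	static struct packet_offload my_proto_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.callbacks = {
 *			.gro_receive  = my_proto_gro_receive,
 *			.gro_complete = my_proto_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&my_proto_offload);	// e.g. at init time
 *	...
 *	dev_remove_offload(&my_proto_offload);	// sleeps, see below
 */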

/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(__dev_remove_offload);

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings are found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings are found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
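
/*
 * Illustrative sketch (not part of this file): the "netdev=" boot
 * parameter parsed above takes up to four integers followed by a name,
 * i.e. netdev=<irq>,<io>,<mem_start>,<mem_end>,<name>.  For example, on
 * the kernel command line:
 *
 *	netdev=5,0x340,0,0,eth0
 *
 * stores irq 5 and base_addr 0x340 under the name "eth0"; a driver later
 * picks the values up via netdev_boot_setup_check().  The values shown
 * here are only an example.
 */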

/*******************************************************************************

			Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
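
/*
 * Illustrative sketch (not part of this file): the two lookup flavours
 * above differ only in who manages the lifetime.  "eth0" and the helper
 * names below are hypothetical.
 *
 *	// Short read-side access, no refcount taken:
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		inspect_dev(dev);	// must not block or escape the RCU section
 *	rcu_read_unlock();
 *
 *	// Reference held across a sleepable section:
 *	dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		do_slow_work(dev);	// hypothetical
 *		dev_put(dev);		// drop the reference taken for us
 *	}
 */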

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns a pointer to the device,
 *	or %NULL if it is not found. The device has not had its reference
 *	counter increased so the caller must be careful about locking. The
 *	caller must hold either the RTNL semaphore or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns a pointer to the device,
 *	or %NULL if it is not found. The device has not had its reference
 *	counter increased so the caller must be careful about locking. The
 *	caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns a pointer to the device,
 *	or NULL if it is not found. The device returned has had a reference
 *	added and the pointer is safe until the user calls dev_put to
 *	indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns a pointer to the
 *	device, or NULL if it is not found.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
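
/*
 * Illustrative sketch (not part of this file): looking up a device by
 * its MAC address under RCU.  The address bytes are made up.
 *
 *	static const char mac[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
 *	if (dev)
 *		dev_hold(dev);	// take a reference before leaving the RCU section
 *	rcu_read_unlock();
 */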

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns a pointer to
 *	the first matching device, or NULL if none is found. Must be called
 *	inside rcu_read_lock(); the result's refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
					unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);
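
/*
 * Illustrative sketch (not part of this file): what the checks above
 * accept and reject.
 *
 *	dev_valid_name("eth0")			-> true
 *	dev_valid_name("vlan.100")		-> true  (dots are fine; only "." and ".." are not)
 *	dev_valid_name("")			-> false (empty)
 *	dev_valid_name("..")			-> false (would break sysfs paths)
 *	dev_valid_name("my dev")		-> false (whitespace)
 *	dev_valid_name("a/b")			-> false ('/' would nest sysfs directories)
 *	dev_valid_name("abcdefghijklmnop")	-> false (16 chars, strlen() >= IFNAMSIZ)
 */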
Linus Torvalds1da177e2005-04-16 15:20:36 -0700903
904/**
Eric W. Biedermanb267b172007-09-12 13:48:45 +0200905 * __dev_alloc_name - allocate a name for a device
906 * @net: network namespace to allocate the device name in
Linus Torvalds1da177e2005-04-16 15:20:36 -0700907 * @name: name format string
Eric W. Biedermanb267b172007-09-12 13:48:45 +0200908 * @buf: scratch buffer and result name string
Linus Torvalds1da177e2005-04-16 15:20:36 -0700909 *
910 * Passed a format string - eg "lt%d" it will try and find a suitable
Stephen Hemminger3041a062006-05-26 13:25:24 -0700911 * id. It scans list of devices to build up a free map, then chooses
912 * the first empty slot. The caller must hold the dev_base or rtnl lock
913 * while allocating the name and adding the device in order to avoid
914 * duplicates.
915 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
916 * Returns the number of the unit assigned or a negative errno code.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700917 */
918
Eric W. Biedermanb267b172007-09-12 13:48:45 +0200919static int __dev_alloc_name(struct net *net, const char *name, char *buf)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700920{
921 int i = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700922 const char *p;
923 const int max_netdevices = 8*PAGE_SIZE;
Stephen Hemmingercfcabdc2007-10-09 01:59:42 -0700924 unsigned long *inuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700925 struct net_device *d;
926
927 p = strnchr(name, IFNAMSIZ-1, '%');
928 if (p) {
929 /*
930 * Verify the string as this thing may have come from
931 * the user. There must be either one "%d" and no other "%"
932 * characters.
933 */
934 if (p[1] != 'd' || strchr(p + 2, '%'))
935 return -EINVAL;
936
937 /* Use one page as a bit array of possible slots */
Stephen Hemmingercfcabdc2007-10-09 01:59:42 -0700938 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700939 if (!inuse)
940 return -ENOMEM;
941
Eric W. Biederman881d9662007-09-17 11:56:21 -0700942 for_each_netdev(net, d) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700943 if (!sscanf(d->name, name, &i))
944 continue;
945 if (i < 0 || i >= max_netdevices)
946 continue;
947
948 /* avoid cases where sscanf is not exact inverse of printf */
Eric W. Biedermanb267b172007-09-12 13:48:45 +0200949 snprintf(buf, IFNAMSIZ, name, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700950 if (!strncmp(buf, d->name, IFNAMSIZ))
951 set_bit(i, inuse);
952 }
953
954 i = find_first_zero_bit(inuse, max_netdevices);
955 free_page((unsigned long) inuse);
956 }
957
Octavian Purdilad9031022009-11-18 02:36:59 +0000958 if (buf != name)
959 snprintf(buf, IFNAMSIZ, name, i);
Eric W. Biedermanb267b172007-09-12 13:48:45 +0200960 if (!__dev_get_by_name(net, buf))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700961 return i;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700962
963 /* It is possible to run out of possible slots
964 * when the name is long and there isn't enough space left
965 * for the digits, or if all bits are used.
966 */
967 return -ENFILE;
968}
969
/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string, e.g. "lt%d", it will try to find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
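
/*
 * Illustrative sketch (not part of this file): a driver that names its
 * devices "foo%d" (a hypothetical prefix) lets the core pick the first
 * free unit number before registration.
 *
 *	rtnl_lock();
 *	err = dev_alloc_name(dev, "foo%d");	// dev->name becomes e.g. "foo0"
 *	if (err < 0)
 *		goto out_unlock;		// hypothetical error path
 *	err = register_netdevice(dev);
 *	...
 *
 * With "foo0" and "foo2" already present, the bitmap scan above assigns
 * unit 1 and the call returns 1.
 */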

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change the name of a device; a format string such as "eth%d" can
 *	be passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
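
/*
 * Illustrative sketch (not part of this file): renaming is only legal
 * while the device is down and the rtnl lock is held.  "uplink0" is a
 * made-up name.
 *
 *	rtnl_lock();
 *	if (!(dev->flags & IFF_UP))
 *		err = dev_change_name(dev, "uplink0");	// or "uplink%d" for wildcarding
 *	rtnl_unlock();
 */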

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from @alias
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 *	Generate traffic such that interested network peers are aware of
 *	@dev, such as by generating a gratuitous ARP. This may be used when
 *	a device wants to inform the rest of the network about some sort of
 *	reconfiguration such as a failover event or virtual machine
 *	migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

Patrick McHardybd380812010-02-26 06:34:53 +00001187static int __dev_open(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001188{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001189 const struct net_device_ops *ops = dev->netdev_ops;
Johannes Berg3b8bcfd2009-05-30 01:39:53 +02001190 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001191
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001192 ASSERT_RTNL();
1193
Linus Torvalds1da177e2005-04-16 15:20:36 -07001194 if (!netif_device_present(dev))
1195 return -ENODEV;
1196
Neil Hormanca99ca12013-02-05 08:05:43 +00001197 /* Block netpoll from trying to do any rx path servicing.
1198 * If we don't do this there is a chance ndo_poll_controller
1199 * or ndo_poll may be running while we open the device
1200 */
1201 ret = netpoll_rx_disable(dev);
1202 if (ret)
1203 return ret;
1204
Johannes Berg3b8bcfd2009-05-30 01:39:53 +02001205 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1206 ret = notifier_to_errno(ret);
1207 if (ret)
1208 return ret;
1209
Linus Torvalds1da177e2005-04-16 15:20:36 -07001210 set_bit(__LINK_STATE_START, &dev->state);
Jeff Garzikbada3392007-10-23 20:19:37 -07001211
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001212 if (ops->ndo_validate_addr)
1213 ret = ops->ndo_validate_addr(dev);
Jeff Garzikbada3392007-10-23 20:19:37 -07001214
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001215 if (!ret && ops->ndo_open)
1216 ret = ops->ndo_open(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001217
Neil Hormanca99ca12013-02-05 08:05:43 +00001218 netpoll_rx_enable(dev);
1219
Jeff Garzikbada3392007-10-23 20:19:37 -07001220 if (ret)
1221 clear_bit(__LINK_STATE_START, &dev->state);
1222 else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001223 dev->flags |= IFF_UP;
David S. Millerb4bd07c2009-02-06 22:06:43 -08001224 net_dmaengine_get();
Patrick McHardy4417da62007-06-27 01:28:10 -07001225 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001226 dev_activate(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04001227 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001228 }
Jeff Garzikbada3392007-10-23 20:19:37 -07001229
Linus Torvalds1da177e2005-04-16 15:20:36 -07001230 return ret;
1231}
Patrick McHardybd380812010-02-26 06:34:53 +00001232
1233/**
1234 * dev_open - prepare an interface for use.
1235 * @dev: device to open
1236 *
1237 * Takes a device from down to up state. The device's private open
1238 * function is invoked and then the multicast lists are loaded. Finally
1239 * the device is moved into the up state and a %NETDEV_UP message is
1240 * sent to the netdev notifier chain.
1241 *
1242 * Calling this function on an active interface is a nop. On a failure
1243 * a negative errno code is returned.
1244 */
1245int dev_open(struct net_device *dev)
1246{
1247 int ret;
1248
Patrick McHardybd380812010-02-26 06:34:53 +00001249 if (dev->flags & IFF_UP)
1250 return 0;
1251
Patrick McHardybd380812010-02-26 06:34:53 +00001252 ret = __dev_open(dev);
1253 if (ret < 0)
1254 return ret;
1255
Patrick McHardybd380812010-02-26 06:34:53 +00001256 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1257 call_netdevice_notifiers(NETDEV_UP, dev);
1258
1259 return ret;
1260}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001261EXPORT_SYMBOL(dev_open);
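
/*
 * Editorial sketch (not part of the original file): one way in-kernel code
 * might bring an interface up by name. dev_open() must be called with RTNL
 * held; the example_bring_up() name is an assumption. The block is disabled
 * on purpose.
 */
#if 0
static int example_bring_up(struct net *net, const char *name)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, name);	/* RTNL protects the lookup */
	if (dev)
		err = dev_open(dev);		/* nop if already IFF_UP */
	rtnl_unlock();

	return err;
}
#endif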
Linus Torvalds1da177e2005-04-16 15:20:36 -07001262
Octavian Purdila44345722010-12-13 12:44:07 +00001263static int __dev_close_many(struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001264{
Octavian Purdila44345722010-12-13 12:44:07 +00001265 struct net_device *dev;
Patrick McHardybd380812010-02-26 06:34:53 +00001266
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001267 ASSERT_RTNL();
David S. Miller9d5010d2007-09-12 14:33:25 +02001268 might_sleep();
1269
Octavian Purdila44345722010-12-13 12:44:07 +00001270 list_for_each_entry(dev, head, unreg_list) {
Octavian Purdila44345722010-12-13 12:44:07 +00001271 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001272
Octavian Purdila44345722010-12-13 12:44:07 +00001273 clear_bit(__LINK_STATE_START, &dev->state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001274
Octavian Purdila44345722010-12-13 12:44:07 +00001275		/* Synchronize to scheduled poll. We cannot touch the poll list; it
 1276		 * may even be running on a different CPU. So just clear netif_running().
1277 *
 1278		 * dev->stop() will invoke napi_disable() on all of its
1279 * napi_struct instances on this device.
1280 */
1281 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1282 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001283
Octavian Purdila44345722010-12-13 12:44:07 +00001284 dev_deactivate_many(head);
1285
1286 list_for_each_entry(dev, head, unreg_list) {
1287 const struct net_device_ops *ops = dev->netdev_ops;
1288
1289 /*
 1290		 *	Call the device-specific close. This cannot fail and is
 1291		 *	only done if the device is UP.
1292 *
1293 * We allow it to be called even after a DETACH hot-plug
1294 * event.
1295 */
1296 if (ops->ndo_stop)
1297 ops->ndo_stop(dev);
1298
Octavian Purdila44345722010-12-13 12:44:07 +00001299 dev->flags &= ~IFF_UP;
Octavian Purdila44345722010-12-13 12:44:07 +00001300 net_dmaengine_put();
1301 }
1302
1303 return 0;
1304}
1305
1306static int __dev_close(struct net_device *dev)
1307{
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001308 int retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001309 LIST_HEAD(single);
1310
Neil Hormanca99ca12013-02-05 08:05:43 +00001311 /* Temporarily disable netpoll until the interface is down */
1312 retval = netpoll_rx_disable(dev);
1313 if (retval)
1314 return retval;
1315
Octavian Purdila44345722010-12-13 12:44:07 +00001316 list_add(&dev->unreg_list, &single);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001317 retval = __dev_close_many(&single);
1318 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001319
1320 netpoll_rx_enable(dev);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001321 return retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001322}
1323
Eric Dumazet3fbd8752011-01-19 21:23:22 +00001324static int dev_close_many(struct list_head *head)
Octavian Purdila44345722010-12-13 12:44:07 +00001325{
1326 struct net_device *dev, *tmp;
1327 LIST_HEAD(tmp_list);
1328
1329 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1330 if (!(dev->flags & IFF_UP))
1331 list_move(&dev->unreg_list, &tmp_list);
1332
1333 __dev_close_many(head);
Matti Linnanvuorid8b2a4d2008-02-12 23:10:11 -08001334
Octavian Purdila44345722010-12-13 12:44:07 +00001335 list_for_each_entry(dev, head, unreg_list) {
1336 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1337 call_netdevice_notifiers(NETDEV_DOWN, dev);
1338 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001339
Octavian Purdila44345722010-12-13 12:44:07 +00001340 /* rollback_registered_many needs the complete original list */
1341 list_splice(&tmp_list, head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001342 return 0;
1343}
Patrick McHardybd380812010-02-26 06:34:53 +00001344
1345/**
1346 * dev_close - shutdown an interface.
1347 * @dev: device to shutdown
1348 *
1349 * This function moves an active device into down state. A
1350 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1351 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1352 * chain.
1353 */
1354int dev_close(struct net_device *dev)
1355{
Neil Hormanca99ca12013-02-05 08:05:43 +00001356 int ret = 0;
Eric Dumazete14a5992011-05-10 12:26:06 -07001357 if (dev->flags & IFF_UP) {
1358 LIST_HEAD(single);
Patrick McHardybd380812010-02-26 06:34:53 +00001359
Neil Hormanca99ca12013-02-05 08:05:43 +00001360 /* Block netpoll rx while the interface is going down */
1361 ret = netpoll_rx_disable(dev);
1362 if (ret)
1363 return ret;
1364
Eric Dumazete14a5992011-05-10 12:26:06 -07001365 list_add(&dev->unreg_list, &single);
1366 dev_close_many(&single);
1367 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001368
1369 netpoll_rx_enable(dev);
Eric Dumazete14a5992011-05-10 12:26:06 -07001370 }
Neil Hormanca99ca12013-02-05 08:05:43 +00001371 return ret;
Patrick McHardybd380812010-02-26 06:34:53 +00001372}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001373EXPORT_SYMBOL(dev_close);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001374
1375
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001376/**
1377 * dev_disable_lro - disable Large Receive Offload on a device
1378 * @dev: device
1379 *
1380 * Disable Large Receive Offload (LRO) on a net device. Must be
1381 * called under RTNL. This is needed if received packets may be
1382 * forwarded to another interface.
1383 */
1384void dev_disable_lro(struct net_device *dev)
1385{
Neil Hormanf11970e2011-05-24 08:31:09 +00001386 /*
 1387	 * If we're trying to disable LRO on a vlan device,
 1388	 * use the underlying physical device instead
1389 */
1390 if (is_vlan_dev(dev))
1391 dev = vlan_dev_real_dev(dev);
1392
Michał Mirosławbc5787c62011-11-15 15:29:55 +00001393 dev->wanted_features &= ~NETIF_F_LRO;
1394 netdev_update_features(dev);
Michał Mirosław27660512011-03-18 16:56:34 +00001395
Michał Mirosław22d59692011-04-21 12:42:15 +00001396 if (unlikely(dev->features & NETIF_F_LRO))
1397 netdev_WARN(dev, "failed to disable LRO!\n");
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001398}
1399EXPORT_SYMBOL(dev_disable_lro);
1400
1401
Eric W. Biederman881d9662007-09-17 11:56:21 -07001402static int dev_boot_phase = 1;
1403
Linus Torvalds1da177e2005-04-16 15:20:36 -07001404/**
1405 * register_netdevice_notifier - register a network notifier block
1406 * @nb: notifier
1407 *
1408 * Register a notifier to be called when network device events occur.
1409 * The notifier passed is linked into the kernel structures and must
1410 * not be reused until it has been unregistered. A negative errno code
1411 * is returned on a failure.
1412 *
 1413 * When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001414 * to the new notifier to give it a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001415 * view of the network device list.
1416 */
1417
1418int register_netdevice_notifier(struct notifier_block *nb)
1419{
1420 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001421 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001422 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001423 int err;
1424
1425 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001426 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001427 if (err)
1428 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001429 if (dev_boot_phase)
1430 goto unlock;
1431 for_each_net(net) {
1432 for_each_netdev(net, dev) {
1433 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1434 err = notifier_to_errno(err);
1435 if (err)
1436 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001437
Eric W. Biederman881d9662007-09-17 11:56:21 -07001438 if (!(dev->flags & IFF_UP))
1439 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001440
Eric W. Biederman881d9662007-09-17 11:56:21 -07001441 nb->notifier_call(nb, NETDEV_UP, dev);
1442 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001443 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001444
1445unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001446 rtnl_unlock();
1447 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001448
1449rollback:
1450 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001451 for_each_net(net) {
1452 for_each_netdev(net, dev) {
1453 if (dev == last)
RongQing.Li8f891482011-11-30 23:43:07 -05001454 goto outroll;
Herbert Xufcc5a032007-07-30 17:03:38 -07001455
Eric W. Biederman881d9662007-09-17 11:56:21 -07001456 if (dev->flags & IFF_UP) {
1457 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1458 nb->notifier_call(nb, NETDEV_DOWN, dev);
1459 }
1460 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001461 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001462 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001463
RongQing.Li8f891482011-11-30 23:43:07 -05001464outroll:
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001465 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001466 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001467}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001468EXPORT_SYMBOL(register_netdevice_notifier);
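
/*
 * Editorial sketch (not part of the original file): a minimal netdevice
 * notifier as a subsystem might register it. At this point in the tree the
 * notifier's third argument is the struct net_device pointer itself. All
 * example_* names are assumptions; the block is disabled on purpose.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

static int __init example_notifier_init(void)
{
	/* NETDEV_REGISTER/NETDEV_UP are replayed for existing devices */
	return register_netdevice_notifier(&example_netdev_nb);
}
#endif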
Linus Torvalds1da177e2005-04-16 15:20:36 -07001469
1470/**
1471 * unregister_netdevice_notifier - unregister a network notifier block
1472 * @nb: notifier
1473 *
1474 * Unregister a notifier previously registered by
 1475 * register_netdevice_notifier(). The notifier is unlinked from the
1476 * kernel structures and may then be reused. A negative errno code
1477 * is returned on a failure.
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001478 *
 1479 * After unregistering, unregister and down device events are synthesized
 1480 * for all devices on the device list and sent to the removed notifier,
 1481 * removing the need for special-case cleanup code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001482 */
1483
1484int unregister_netdevice_notifier(struct notifier_block *nb)
1485{
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001486 struct net_device *dev;
1487 struct net *net;
Herbert Xu9f514952006-03-25 01:24:25 -08001488 int err;
1489
1490 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001491 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001492 if (err)
1493 goto unlock;
1494
1495 for_each_net(net) {
1496 for_each_netdev(net, dev) {
1497 if (dev->flags & IFF_UP) {
1498 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1499 nb->notifier_call(nb, NETDEV_DOWN, dev);
1500 }
1501 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001502 }
1503 }
1504unlock:
Herbert Xu9f514952006-03-25 01:24:25 -08001505 rtnl_unlock();
1506 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001508EXPORT_SYMBOL(unregister_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001509
1510/**
1511 * call_netdevice_notifiers - call all network notifier blocks
1512 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001513 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001514 *
1515 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001516 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001517 */
1518
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001519int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001520{
Jiri Pirkoab930472010-04-20 01:45:37 -07001521 ASSERT_RTNL();
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001522 return raw_notifier_call_chain(&netdev_chain, val, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001523}
stephen hemmingeredf947f2011-03-24 13:24:01 +00001524EXPORT_SYMBOL(call_netdevice_notifiers);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525
Ingo Molnarc5905af2012-02-24 08:31:31 +01001526static struct static_key netstamp_needed __read_mostly;
Eric Dumazetb90e5792011-11-28 11:16:50 +00001527#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01001528/* We are not allowed to call static_key_slow_dec() from irq context.
Eric Dumazetb90e5792011-11-28 11:16:50 +00001529 * If net_disable_timestamp() is called from irq context, defer the
Ingo Molnarc5905af2012-02-24 08:31:31 +01001530 * static_key_slow_dec() calls.
Eric Dumazetb90e5792011-11-28 11:16:50 +00001531 */
1532static atomic_t netstamp_needed_deferred;
1533#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001534
1535void net_enable_timestamp(void)
1536{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001537#ifdef HAVE_JUMP_LABEL
1538 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1539
1540 if (deferred) {
1541 while (--deferred)
Ingo Molnarc5905af2012-02-24 08:31:31 +01001542 static_key_slow_dec(&netstamp_needed);
Eric Dumazetb90e5792011-11-28 11:16:50 +00001543 return;
1544 }
1545#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001546 static_key_slow_inc(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001547}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001548EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549
1550void net_disable_timestamp(void)
1551{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001552#ifdef HAVE_JUMP_LABEL
1553 if (in_interrupt()) {
1554 atomic_inc(&netstamp_needed_deferred);
1555 return;
1556 }
1557#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001558 static_key_slow_dec(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001559}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001560EXPORT_SYMBOL(net_disable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001561
Eric Dumazet3b098e22010-05-15 23:57:10 -07001562static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001563{
Eric Dumazet588f0332011-11-15 04:12:55 +00001564 skb->tstamp.tv64 = 0;
Ingo Molnarc5905af2012-02-24 08:31:31 +01001565 if (static_key_false(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001566 __net_timestamp(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001567}
1568
Eric Dumazet588f0332011-11-15 04:12:55 +00001569#define net_timestamp_check(COND, SKB) \
Ingo Molnarc5905af2012-02-24 08:31:31 +01001570 if (static_key_false(&netstamp_needed)) { \
Eric Dumazet588f0332011-11-15 04:12:55 +00001571 if ((COND) && !(SKB)->tstamp.tv64) \
1572 __net_timestamp(SKB); \
1573 } \
Eric Dumazet3b098e22010-05-15 23:57:10 -07001574
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001575static inline bool is_skb_forwardable(struct net_device *dev,
1576 struct sk_buff *skb)
1577{
1578 unsigned int len;
1579
1580 if (!(dev->flags & IFF_UP))
1581 return false;
1582
1583 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1584 if (skb->len <= len)
1585 return true;
1586
1587 /* if TSO is enabled, we don't care about the length as the packet
 1588	 * could be forwarded without being segmented first
1589 */
1590 if (skb_is_gso(skb))
1591 return true;
1592
1593 return false;
1594}
1595
Arnd Bergmann44540962009-11-26 06:07:08 +00001596/**
1597 * dev_forward_skb - loopback an skb to another netif
1598 *
1599 * @dev: destination network device
1600 * @skb: buffer to forward
1601 *
1602 * return values:
1603 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001604 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001605 *
1606 * dev_forward_skb can be used for injecting an skb from the
1607 * start_xmit function of one device into the receive queue
1608 * of another device.
1609 *
1610 * The receiving device may be in another namespace, so
1611 * we have to clear all information in the skb that could
1612 * impact namespace isolation.
1613 */
1614int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1615{
Michael S. Tsirkin48c83012011-08-31 08:03:29 +00001616 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1617 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1618 atomic_long_inc(&dev->rx_dropped);
1619 kfree_skb(skb);
1620 return NET_RX_DROP;
1621 }
1622 }
1623
Arnd Bergmann44540962009-11-26 06:07:08 +00001624 skb_orphan(skb);
1625
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001626 if (unlikely(!is_skb_forwardable(dev, skb))) {
Eric Dumazetcaf586e2010-09-30 21:06:55 +00001627 atomic_long_inc(&dev->rx_dropped);
Eric Dumazet6ec82562010-05-06 00:53:53 -07001628 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001629 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001630 }
Benjamin LaHaise3b9785c2012-03-27 15:55:44 +00001631 skb->skb_iif = 0;
David S. Miller59b99972012-05-10 23:03:34 -04001632 skb->dev = dev;
1633 skb_dst_drop(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001634 skb->tstamp.tv64 = 0;
1635 skb->pkt_type = PACKET_HOST;
1636 skb->protocol = eth_type_trans(skb, dev);
David S. Miller59b99972012-05-10 23:03:34 -04001637 skb->mark = 0;
1638 secpath_reset(skb);
1639 nf_reset(skb);
Patrick McHardy124dff02013-04-05 20:42:05 +02001640 nf_reset_trace(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001641 return netif_rx(skb);
1642}
1643EXPORT_SYMBOL_GPL(dev_forward_skb);
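
/*
 * Editorial sketch (not part of the original file): how a paired virtual
 * device, in the spirit of veth, might hand a transmitted skb to its peer
 * with dev_forward_skb(). The example_pair_priv layout is an assumption;
 * the block is disabled on purpose.
 */
#if 0
struct example_pair_priv {
	struct net_device *peer;
};

static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct example_pair_priv *priv = netdev_priv(dev);

	/* dev_forward_skb() scrubs the skb and feeds it to netif_rx() */
	if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}
#endif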
1644
Changli Gao71d9dec2010-12-15 19:57:25 +00001645static inline int deliver_skb(struct sk_buff *skb,
1646 struct packet_type *pt_prev,
1647 struct net_device *orig_dev)
1648{
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00001649 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1650 return -ENOMEM;
Changli Gao71d9dec2010-12-15 19:57:25 +00001651 atomic_inc(&skb->users);
1652 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1653}
1654
Eric Leblondc0de08d2012-08-16 22:02:58 +00001655static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1656{
Eric Leblonda3d744e2012-11-06 02:10:10 +00001657 if (!ptype->af_packet_priv || !skb->sk)
Eric Leblondc0de08d2012-08-16 22:02:58 +00001658 return false;
1659
1660 if (ptype->id_match)
1661 return ptype->id_match(ptype, skb->sk);
1662 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1663 return true;
1664
1665 return false;
1666}
1667
Linus Torvalds1da177e2005-04-16 15:20:36 -07001668/*
1669 * Support routine. Sends outgoing frames to any network
1670 * taps currently in use.
1671 */
1672
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001673static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001674{
1675 struct packet_type *ptype;
Changli Gao71d9dec2010-12-15 19:57:25 +00001676 struct sk_buff *skb2 = NULL;
1677 struct packet_type *pt_prev = NULL;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001678
Linus Torvalds1da177e2005-04-16 15:20:36 -07001679 rcu_read_lock();
1680 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1681 /* Never send packets back to the socket
1682 * they originated from - MvS (miquels@drinkel.ow.org)
1683 */
1684 if ((ptype->dev == dev || !ptype->dev) &&
Eric Leblondc0de08d2012-08-16 22:02:58 +00001685 (!skb_loop_sk(ptype, skb))) {
Changli Gao71d9dec2010-12-15 19:57:25 +00001686 if (pt_prev) {
1687 deliver_skb(skb2, pt_prev, skb->dev);
1688 pt_prev = ptype;
1689 continue;
1690 }
1691
1692 skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693 if (!skb2)
1694 break;
1695
Eric Dumazet70978182010-12-20 21:22:51 +00001696 net_timestamp_set(skb2);
1697
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698 /* skb->nh should be correctly
 1699			   set by the sender, so that the second statement is
1700 just protection against buggy protocols.
1701 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001702 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001703
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001704 if (skb_network_header(skb2) < skb2->data ||
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001705 skb2->network_header > skb2->tail) {
Joe Perchese87cc472012-05-13 21:56:26 +00001706 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1707 ntohs(skb2->protocol),
1708 dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001709 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001710 }
1711
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001712 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001713 skb2->pkt_type = PACKET_OUTGOING;
Changli Gao71d9dec2010-12-15 19:57:25 +00001714 pt_prev = ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001715 }
1716 }
Changli Gao71d9dec2010-12-15 19:57:25 +00001717 if (pt_prev)
1718 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001719 rcu_read_unlock();
1720}
1721
Ben Hutchings2c530402012-07-10 10:55:09 +00001722/**
1723 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
John Fastabend4f57c082011-01-17 08:06:04 +00001724 * @dev: Network device
1725 * @txq: number of queues available
1726 *
 1727 * If real_num_tx_queues is changed the tc mappings may no longer be
 1728 * valid. To resolve this, verify the tc mapping remains valid and, if
 1729 * not, reset the mapping. With no priorities mapping to this
 1730 * offset/count pair, it will no longer be used. In the worst case, if
 1731 * TC0 is invalid, nothing can be done, so priority mappings are disabled.
 1732 * It is expected that drivers will fix this mapping if they can before
 1733 * calling netif_set_real_num_tx_queues.
1734 */
Eric Dumazetbb134d22011-01-20 19:18:08 +00001735static void netif_setup_tc(struct net_device *dev, unsigned int txq)
John Fastabend4f57c082011-01-17 08:06:04 +00001736{
1737 int i;
1738 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1739
1740 /* If TC0 is invalidated disable TC mapping */
1741 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001742 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
John Fastabend4f57c082011-01-17 08:06:04 +00001743 dev->num_tc = 0;
1744 return;
1745 }
1746
1747 /* Invalidated prio to tc mappings set to TC0 */
1748 for (i = 1; i < TC_BITMASK + 1; i++) {
1749 int q = netdev_get_prio_tc_map(dev, i);
1750
1751 tc = &dev->tc_to_txq[q];
1752 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001753 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1754 i, q);
John Fastabend4f57c082011-01-17 08:06:04 +00001755 netdev_set_prio_tc_map(dev, i, 0);
1756 }
1757 }
1758}
1759
Alexander Duyck537c00d2013-01-10 08:57:02 +00001760#ifdef CONFIG_XPS
1761static DEFINE_MUTEX(xps_map_mutex);
1762#define xmap_dereference(P) \
1763 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1764
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001765static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1766 int cpu, u16 index)
1767{
1768 struct xps_map *map = NULL;
1769 int pos;
1770
1771 if (dev_maps)
1772 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1773
1774 for (pos = 0; map && pos < map->len; pos++) {
1775 if (map->queues[pos] == index) {
1776 if (map->len > 1) {
1777 map->queues[pos] = map->queues[--map->len];
1778 } else {
1779 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1780 kfree_rcu(map, rcu);
1781 map = NULL;
1782 }
1783 break;
1784 }
1785 }
1786
1787 return map;
1788}
1789
Alexander Duyck024e9672013-01-10 08:57:46 +00001790static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
Alexander Duyck537c00d2013-01-10 08:57:02 +00001791{
1792 struct xps_dev_maps *dev_maps;
Alexander Duyck024e9672013-01-10 08:57:46 +00001793 int cpu, i;
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001794 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001795
1796 mutex_lock(&xps_map_mutex);
1797 dev_maps = xmap_dereference(dev->xps_maps);
1798
1799 if (!dev_maps)
1800 goto out_no_maps;
1801
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001802 for_each_possible_cpu(cpu) {
Alexander Duyck024e9672013-01-10 08:57:46 +00001803 for (i = index; i < dev->num_tx_queues; i++) {
1804 if (!remove_xps_queue(dev_maps, cpu, i))
1805 break;
1806 }
1807 if (i == dev->num_tx_queues)
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001808 active = true;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001809 }
1810
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001811 if (!active) {
Alexander Duyck537c00d2013-01-10 08:57:02 +00001812 RCU_INIT_POINTER(dev->xps_maps, NULL);
1813 kfree_rcu(dev_maps, rcu);
1814 }
1815
Alexander Duyck024e9672013-01-10 08:57:46 +00001816 for (i = index; i < dev->num_tx_queues; i++)
1817 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1818 NUMA_NO_NODE);
1819
Alexander Duyck537c00d2013-01-10 08:57:02 +00001820out_no_maps:
1821 mutex_unlock(&xps_map_mutex);
1822}
1823
Alexander Duyck01c5f862013-01-10 08:57:35 +00001824static struct xps_map *expand_xps_map(struct xps_map *map,
1825 int cpu, u16 index)
1826{
1827 struct xps_map *new_map;
1828 int alloc_len = XPS_MIN_MAP_ALLOC;
1829 int i, pos;
1830
1831 for (pos = 0; map && pos < map->len; pos++) {
1832 if (map->queues[pos] != index)
1833 continue;
1834 return map;
1835 }
1836
1837 /* Need to add queue to this CPU's existing map */
1838 if (map) {
1839 if (pos < map->alloc_len)
1840 return map;
1841
1842 alloc_len = map->alloc_len * 2;
1843 }
1844
1845 /* Need to allocate new map to store queue on this CPU's map */
1846 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1847 cpu_to_node(cpu));
1848 if (!new_map)
1849 return NULL;
1850
1851 for (i = 0; i < pos; i++)
1852 new_map->queues[i] = map->queues[i];
1853 new_map->alloc_len = alloc_len;
1854 new_map->len = pos;
1855
1856 return new_map;
1857}
1858
Alexander Duyck537c00d2013-01-10 08:57:02 +00001859int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1860{
Alexander Duyck01c5f862013-01-10 08:57:35 +00001861 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001862 struct xps_map *map, *new_map;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001863 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001864 int cpu, numa_node_id = -2;
1865 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001866
1867 mutex_lock(&xps_map_mutex);
1868
1869 dev_maps = xmap_dereference(dev->xps_maps);
1870
Alexander Duyck01c5f862013-01-10 08:57:35 +00001871 /* allocate memory for queue storage */
1872 for_each_online_cpu(cpu) {
1873 if (!cpumask_test_cpu(cpu, mask))
1874 continue;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001875
Alexander Duyck01c5f862013-01-10 08:57:35 +00001876 if (!new_dev_maps)
1877 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00001878 if (!new_dev_maps) {
1879 mutex_unlock(&xps_map_mutex);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001880 return -ENOMEM;
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00001881 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00001882
1883 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1884 NULL;
1885
1886 map = expand_xps_map(map, cpu, index);
1887 if (!map)
1888 goto error;
1889
1890 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1891 }
1892
1893 if (!new_dev_maps)
1894 goto out_no_new_maps;
1895
1896 for_each_possible_cpu(cpu) {
1897 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1898 /* add queue to CPU maps */
1899 int pos = 0;
1900
1901 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1902 while ((pos < map->len) && (map->queues[pos] != index))
1903 pos++;
1904
1905 if (pos == map->len)
1906 map->queues[map->len++] = index;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001907#ifdef CONFIG_NUMA
Alexander Duyck537c00d2013-01-10 08:57:02 +00001908 if (numa_node_id == -2)
1909 numa_node_id = cpu_to_node(cpu);
1910 else if (numa_node_id != cpu_to_node(cpu))
1911 numa_node_id = -1;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001912#endif
Alexander Duyck01c5f862013-01-10 08:57:35 +00001913 } else if (dev_maps) {
1914 /* fill in the new device map from the old device map */
1915 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1916 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
Alexander Duyck537c00d2013-01-10 08:57:02 +00001917 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00001918
Alexander Duyck537c00d2013-01-10 08:57:02 +00001919 }
1920
Alexander Duyck01c5f862013-01-10 08:57:35 +00001921 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1922
Alexander Duyck537c00d2013-01-10 08:57:02 +00001923 /* Cleanup old maps */
Alexander Duyck01c5f862013-01-10 08:57:35 +00001924 if (dev_maps) {
1925 for_each_possible_cpu(cpu) {
1926 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1927 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1928 if (map && map != new_map)
1929 kfree_rcu(map, rcu);
1930 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001931
Alexander Duyck537c00d2013-01-10 08:57:02 +00001932 kfree_rcu(dev_maps, rcu);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001933 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001934
Alexander Duyck01c5f862013-01-10 08:57:35 +00001935 dev_maps = new_dev_maps;
1936 active = true;
1937
1938out_no_new_maps:
1939 /* update Tx queue numa node */
Alexander Duyck537c00d2013-01-10 08:57:02 +00001940 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1941 (numa_node_id >= 0) ? numa_node_id :
1942 NUMA_NO_NODE);
1943
Alexander Duyck01c5f862013-01-10 08:57:35 +00001944 if (!dev_maps)
1945 goto out_no_maps;
1946
1947 /* removes queue from unused CPUs */
1948 for_each_possible_cpu(cpu) {
1949 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1950 continue;
1951
1952 if (remove_xps_queue(dev_maps, cpu, index))
1953 active = true;
1954 }
1955
1956 /* free map if not active */
1957 if (!active) {
1958 RCU_INIT_POINTER(dev->xps_maps, NULL);
1959 kfree_rcu(dev_maps, rcu);
1960 }
1961
1962out_no_maps:
Alexander Duyck537c00d2013-01-10 08:57:02 +00001963 mutex_unlock(&xps_map_mutex);
1964
1965 return 0;
1966error:
Alexander Duyck01c5f862013-01-10 08:57:35 +00001967 /* remove any maps that we added */
1968 for_each_possible_cpu(cpu) {
1969 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1970 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1971 NULL;
1972 if (new_map && new_map != map)
1973 kfree(new_map);
1974 }
1975
Alexander Duyck537c00d2013-01-10 08:57:02 +00001976 mutex_unlock(&xps_map_mutex);
1977
Alexander Duyck537c00d2013-01-10 08:57:02 +00001978 kfree(new_dev_maps);
1979 return -ENOMEM;
1980}
1981EXPORT_SYMBOL(netif_set_xps_queue);
1982
1983#endif
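
/*
 * Editorial sketch (not part of the original file): a driver spreading its
 * TX queues across online CPUs with netif_set_xps_queue(), one CPU per
 * queue. The round-robin policy and the example_* name are assumptions;
 * the block is disabled on purpose.
 */
#if 0
static void example_setup_xps(struct net_device *dev)
{
	cpumask_var_t mask;
	int cpu = cpumask_first(cpu_online_mask);
	unsigned int q;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return;

	for (q = 0; q < dev->real_num_tx_queues; q++) {
		cpumask_clear(mask);
		cpumask_set_cpu(cpu, mask);
		netif_set_xps_queue(dev, mask, q);	/* errors ignored for brevity */

		cpu = cpumask_next(cpu, cpu_online_mask);
		if (cpu >= nr_cpu_ids)
			cpu = cpumask_first(cpu_online_mask);
	}

	free_cpumask_var(mask);
}
#endif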
John Fastabendf0796d52010-07-01 13:21:57 +00001984/*
1985 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 1986 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1987 */
Tom Herberte6484932010-10-18 18:04:39 +00001988int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
John Fastabendf0796d52010-07-01 13:21:57 +00001989{
Tom Herbert1d24eb42010-11-21 13:17:27 +00001990 int rc;
1991
Tom Herberte6484932010-10-18 18:04:39 +00001992 if (txq < 1 || txq > dev->num_tx_queues)
1993 return -EINVAL;
John Fastabendf0796d52010-07-01 13:21:57 +00001994
Ben Hutchings5c565802011-02-15 19:39:21 +00001995 if (dev->reg_state == NETREG_REGISTERED ||
1996 dev->reg_state == NETREG_UNREGISTERING) {
Tom Herberte6484932010-10-18 18:04:39 +00001997 ASSERT_RTNL();
1998
Tom Herbert1d24eb42010-11-21 13:17:27 +00001999 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2000 txq);
Tom Herbertbf264142010-11-26 08:36:09 +00002001 if (rc)
2002 return rc;
2003
John Fastabend4f57c082011-01-17 08:06:04 +00002004 if (dev->num_tc)
2005 netif_setup_tc(dev, txq);
2006
Alexander Duyck024e9672013-01-10 08:57:46 +00002007 if (txq < dev->real_num_tx_queues) {
Tom Herberte6484932010-10-18 18:04:39 +00002008 qdisc_reset_all_tx_gt(dev, txq);
Alexander Duyck024e9672013-01-10 08:57:46 +00002009#ifdef CONFIG_XPS
2010 netif_reset_xps_queues_gt(dev, txq);
2011#endif
2012 }
John Fastabendf0796d52010-07-01 13:21:57 +00002013 }
Tom Herberte6484932010-10-18 18:04:39 +00002014
2015 dev->real_num_tx_queues = txq;
2016 return 0;
John Fastabendf0796d52010-07-01 13:21:57 +00002017}
2018EXPORT_SYMBOL(netif_set_real_num_tx_queues);
Denis Vlasenko56079432006-03-29 15:57:29 -08002019
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002020#ifdef CONFIG_RPS
2021/**
2022 * netif_set_real_num_rx_queues - set actual number of RX queues used
2023 * @dev: Network device
2024 * @rxq: Actual number of RX queues
2025 *
2026 * This must be called either with the rtnl_lock held or before
2027 * registration of the net device. Returns 0 on success, or a
Ben Hutchings4e7f7952010-10-08 10:33:39 -07002028 * negative error code. If called before registration, it always
2029 * succeeds.
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002030 */
2031int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2032{
2033 int rc;
2034
Tom Herbertbd25fa72010-10-18 18:00:16 +00002035 if (rxq < 1 || rxq > dev->num_rx_queues)
2036 return -EINVAL;
2037
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002038 if (dev->reg_state == NETREG_REGISTERED) {
2039 ASSERT_RTNL();
2040
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002041 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2042 rxq);
2043 if (rc)
2044 return rc;
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002045 }
2046
2047 dev->real_num_rx_queues = rxq;
2048 return 0;
2049}
2050EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2051#endif
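
/*
 * Editorial sketch (not part of the original file): a driver resizing its
 * active queue set after a channel reconfiguration. Both helpers must be
 * called under RTNL once the device is registered; the example_* name is
 * an assumption and the block is disabled on purpose.
 */
#if 0
static int example_set_channels(struct net_device *dev, unsigned int count)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, count);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, count);
}
#endif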
2052
Ben Hutchings2c530402012-07-10 10:55:09 +00002053/**
2054 * netif_get_num_default_rss_queues - default number of RSS queues
Yuval Mintz16917b82012-07-01 03:18:50 +00002055 *
2056 * This routine should set an upper limit on the number of RSS queues
2057 * used by default by multiqueue devices.
2058 */
Ben Hutchingsa55b1382012-07-10 10:54:38 +00002059int netif_get_num_default_rss_queues(void)
Yuval Mintz16917b82012-07-01 03:18:50 +00002060{
2061 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2062}
2063EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2064
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002065static inline void __netif_reschedule(struct Qdisc *q)
2066{
2067 struct softnet_data *sd;
2068 unsigned long flags;
2069
2070 local_irq_save(flags);
2071 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00002072 q->next_sched = NULL;
2073 *sd->output_queue_tailp = q;
2074 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002075 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2076 local_irq_restore(flags);
2077}
2078
David S. Miller37437bb2008-07-16 02:15:04 -07002079void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08002080{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002081 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2082 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08002083}
2084EXPORT_SYMBOL(__netif_schedule);
2085
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002086void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08002087{
David S. Miller3578b0c2010-08-03 00:24:04 -07002088 if (atomic_dec_and_test(&skb->users)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002089 struct softnet_data *sd;
2090 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08002091
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002092 local_irq_save(flags);
2093 sd = &__get_cpu_var(softnet_data);
2094 skb->next = sd->completion_queue;
2095 sd->completion_queue = skb;
2096 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2097 local_irq_restore(flags);
2098 }
Denis Vlasenko56079432006-03-29 15:57:29 -08002099}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002100EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08002101
2102void dev_kfree_skb_any(struct sk_buff *skb)
2103{
2104 if (in_irq() || irqs_disabled())
2105 dev_kfree_skb_irq(skb);
2106 else
2107 dev_kfree_skb(skb);
2108}
2109EXPORT_SYMBOL(dev_kfree_skb_any);
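
/*
 * Editorial sketch (not part of the original file): freeing completed TX
 * skbs from an interrupt handler, where plain dev_kfree_skb() may not be
 * used. struct example_ring and example_ring_next_done() are assumed
 * driver-side helpers; the block is disabled on purpose.
 */
#if 0
struct example_ring;
struct sk_buff *example_ring_next_done(struct example_ring *ring);

static void example_clean_tx_irq(struct example_ring *ring)
{
	struct sk_buff *skb;

	while ((skb = example_ring_next_done(ring)) != NULL)
		dev_kfree_skb_any(skb);	/* safe in both irq and process context */
}
#endif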
2110
2111
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002112/**
2113 * netif_device_detach - mark device as removed
2114 * @dev: network device
2115 *
2116 * Mark device as removed from system and therefore no longer available.
2117 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002118void netif_device_detach(struct net_device *dev)
2119{
2120 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2121 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002122 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002123 }
2124}
2125EXPORT_SYMBOL(netif_device_detach);
2126
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002127/**
2128 * netif_device_attach - mark device as attached
2129 * @dev: network device
2130 *
2131 * Mark device as attached from system and restart if needed.
2132 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002133void netif_device_attach(struct net_device *dev)
2134{
2135 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2136 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002137 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002138 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002139 }
2140}
2141EXPORT_SYMBOL(netif_device_attach);
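
/*
 * Editorial sketch (not part of the original file): the usual pairing of
 * netif_device_detach()/netif_device_attach() in a driver's suspend and
 * resume callbacks. The example_* names and the use of dev_get_drvdata()
 * are assumptions; the block is disabled on purpose.
 */
#if 0
static int example_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);	/* stops all TX queues if running */
	return 0;
}

static int example_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_attach(dev);	/* wakes queues, rearms the watchdog */
	return 0;
}
#endif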
2142
Ben Hutchings36c92472012-01-17 07:57:56 +00002143static void skb_warn_bad_offload(const struct sk_buff *skb)
2144{
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002145 static const netdev_features_t null_features = 0;
Ben Hutchings36c92472012-01-17 07:57:56 +00002146 struct net_device *dev = skb->dev;
2147 const char *driver = "";
2148
Ben Greearc846ad92013-04-19 10:45:52 +00002149 if (!net_ratelimit())
2150 return;
2151
Ben Hutchings36c92472012-01-17 07:57:56 +00002152 if (dev && dev->dev.parent)
2153 driver = dev_driver_string(dev->dev.parent);
2154
2155 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2156 "gso_type=%d ip_summed=%d\n",
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002157 driver, dev ? &dev->features : &null_features,
2158 skb->sk ? &skb->sk->sk_route_caps : &null_features,
Ben Hutchings36c92472012-01-17 07:57:56 +00002159 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2160 skb_shinfo(skb)->gso_type, skb->ip_summed);
2161}
2162
Linus Torvalds1da177e2005-04-16 15:20:36 -07002163/*
 2164 * Invalidate the hardware checksum when a packet is to be mangled, and
 2165 * complete the checksum manually on the outgoing path.
2166 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07002167int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002168{
Al Virod3bc23e2006-11-14 21:24:49 -08002169 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07002170 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171
Patrick McHardy84fa7932006-08-29 16:44:56 -07002172 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07002173 goto out_set_summed;
2174
2175 if (unlikely(skb_shinfo(skb)->gso_size)) {
Ben Hutchings36c92472012-01-17 07:57:56 +00002176 skb_warn_bad_offload(skb);
2177 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002178 }
2179
Eric Dumazetcef401d2013-01-25 20:34:37 +00002180 /* Before computing a checksum, we should make sure no frag could
 2181	 * be modified by an external entity: the checksum could be wrong.
2182 */
2183 if (skb_has_shared_frag(skb)) {
2184 ret = __skb_linearize(skb);
2185 if (ret)
2186 goto out;
2187 }
2188
Michał Mirosław55508d62010-12-14 15:24:08 +00002189 offset = skb_checksum_start_offset(skb);
Herbert Xua0308472007-10-15 01:47:15 -07002190 BUG_ON(offset >= skb_headlen(skb));
2191 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2192
2193 offset += skb->csum_offset;
2194 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2195
2196 if (skb_cloned(skb) &&
2197 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002198 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2199 if (ret)
2200 goto out;
2201 }
2202
Herbert Xua0308472007-10-15 01:47:15 -07002203 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07002204out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002206out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207 return ret;
2208}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002209EXPORT_SYMBOL(skb_checksum_help);
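
/*
 * Editorial sketch (not part of the original file): a driver falling back
 * to a software checksum when its hardware cannot checksum the protocol,
 * mirroring what the core does above. example_hw_can_csum() is an assumed
 * helper; the block is disabled on purpose.
 */
#if 0
static int example_tx_prep_csum(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !example_hw_can_csum(skb))
		return skb_checksum_help(skb);	/* finish the csum on the CPU */

	return 0;
}
#endif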
Linus Torvalds1da177e2005-04-16 15:20:36 -07002210
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002211__be16 skb_network_protocol(struct sk_buff *skb)
2212{
2213 __be16 type = skb->protocol;
David S. Miller61816592013-03-20 12:46:26 -04002214 int vlan_depth = ETH_HLEN;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002215
Pravin B Shelar19acc322013-05-07 20:41:07 +00002216 /* Tunnel gso handlers can set protocol to ethernet. */
2217 if (type == htons(ETH_P_TEB)) {
2218 struct ethhdr *eth;
2219
2220 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2221 return 0;
2222
2223 eth = (struct ethhdr *)skb_mac_header(skb);
2224 type = eth->h_proto;
2225 }
2226
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002227 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002228 struct vlan_hdr *vh;
2229
2230 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2231 return 0;
2232
2233 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2234 type = vh->h_vlan_encapsulated_proto;
2235 vlan_depth += VLAN_HLEN;
2236 }
2237
2238 return type;
2239}
2240
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002241/**
2242 * skb_mac_gso_segment - mac layer segmentation handler.
2243 * @skb: buffer to segment
2244 * @features: features for the output path (see dev->features)
2245 */
2246struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2247 netdev_features_t features)
2248{
2249 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2250 struct packet_offload *ptype;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002251 __be16 type = skb_network_protocol(skb);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002252
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002253 if (unlikely(!type))
2254 return ERR_PTR(-EINVAL);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002255
2256 __skb_pull(skb, skb->mac_len);
2257
2258 rcu_read_lock();
2259 list_for_each_entry_rcu(ptype, &offload_base, list) {
2260 if (ptype->type == type && ptype->callbacks.gso_segment) {
2261 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2262 int err;
2263
2264 err = ptype->callbacks.gso_send_check(skb);
2265 segs = ERR_PTR(err);
2266 if (err || skb_gso_ok(skb, features))
2267 break;
2268 __skb_push(skb, (skb->data -
2269 skb_network_header(skb)));
2270 }
2271 segs = ptype->callbacks.gso_segment(skb, features);
2272 break;
2273 }
2274 }
2275 rcu_read_unlock();
2276
2277 __skb_push(skb, skb->data - skb_mac_header(skb));
2278
2279 return segs;
2280}
2281EXPORT_SYMBOL(skb_mac_gso_segment);
2282
2283
Cong Wang12b00042013-02-05 16:36:38 +00002284/* openvswitch calls this on rx path, so we need a different check.
2285 */
2286static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2287{
2288 if (tx_path)
2289 return skb->ip_summed != CHECKSUM_PARTIAL;
2290 else
2291 return skb->ip_summed == CHECKSUM_NONE;
2292}
2293
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002294/**
Cong Wang12b00042013-02-05 16:36:38 +00002295 * __skb_gso_segment - Perform segmentation on skb.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002296 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07002297 * @features: features for the output path (see dev->features)
Cong Wang12b00042013-02-05 16:36:38 +00002298 * @tx_path: whether it is called in TX path
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002299 *
2300 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07002301 *
2302 * It may return NULL if the skb requires no segmentation. This is
2303 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002304 */
Cong Wang12b00042013-02-05 16:36:38 +00002305struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2306 netdev_features_t features, bool tx_path)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002307{
Cong Wang12b00042013-02-05 16:36:38 +00002308 if (unlikely(skb_needs_check(skb, tx_path))) {
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002309 int err;
2310
Ben Hutchings36c92472012-01-17 07:57:56 +00002311 skb_warn_bad_offload(skb);
Herbert Xu67fd1a72009-01-19 16:26:44 -08002312
Herbert Xua430a432006-07-08 13:34:56 -07002313 if (skb_header_cloned(skb) &&
2314 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2315 return ERR_PTR(err);
2316 }
2317
Pravin B Shelar68c33162013-02-14 14:02:41 +00002318 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002319 skb_reset_mac_header(skb);
2320 skb_reset_mac_len(skb);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002321
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002322 return skb_mac_gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002323}
Cong Wang12b00042013-02-05 16:36:38 +00002324EXPORT_SYMBOL(__skb_gso_segment);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002325
Herbert Xufb286bb2005-11-10 13:01:24 -08002326/* Take action when hardware reception checksum errors are detected. */
2327#ifdef CONFIG_BUG
2328void netdev_rx_csum_fault(struct net_device *dev)
2329{
2330 if (net_ratelimit()) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00002331 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08002332 dump_stack();
2333 }
2334}
2335EXPORT_SYMBOL(netdev_rx_csum_fault);
2336#endif
2337
Linus Torvalds1da177e2005-04-16 15:20:36 -07002338/* Actually, we should eliminate this check as soon as we know that:
 2339 * 1. An IOMMU is present and allows mapping all the memory.
2340 * 2. No high memory really exists on this machine.
2341 */
2342
Eric Dumazet9092c652010-04-02 13:34:49 -07002343static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002344{
Herbert Xu3d3a8532006-06-27 13:33:10 -07002345#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07002346 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002347 if (!(dev->features & NETIF_F_HIGHDMA)) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002348 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2349 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2350 if (PageHighMem(skb_frag_page(frag)))
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002351 return 1;
Ian Campbellea2ab692011-08-22 23:44:58 +00002352 }
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002353 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002354
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002355 if (PCI_DMA_BUS_IS_PHYS) {
2356 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002357
Eric Dumazet9092c652010-04-02 13:34:49 -07002358 if (!pdev)
2359 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002360 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002361 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2362 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002363 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2364 return 1;
2365 }
2366 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07002367#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002368 return 0;
2369}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002370
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002371struct dev_gso_cb {
2372 void (*destructor)(struct sk_buff *skb);
2373};
2374
2375#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2376
2377static void dev_gso_skb_destructor(struct sk_buff *skb)
2378{
2379 struct dev_gso_cb *cb;
2380
2381 do {
2382 struct sk_buff *nskb = skb->next;
2383
2384 skb->next = nskb->next;
2385 nskb->next = NULL;
2386 kfree_skb(nskb);
2387 } while (skb->next);
2388
2389 cb = DEV_GSO_CB(skb);
2390 if (cb->destructor)
2391 cb->destructor(skb);
2392}
2393
2394/**
2395 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2396 * @skb: buffer to segment
Jesse Gross91ecb632011-01-09 06:23:33 +00002397 * @features: device features as applicable to this skb
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002398 *
2399 * This function segments the given skb and stores the list of segments
2400 * in skb->next.
2401 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002402static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002403{
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002404 struct sk_buff *segs;
2405
Herbert Xu576a30e2006-06-27 13:22:38 -07002406 segs = skb_gso_segment(skb, features);
2407
2408 /* Verifying header integrity only. */
2409 if (!segs)
2410 return 0;
2411
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07002412 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002413 return PTR_ERR(segs);
2414
2415 skb->next = segs;
2416 DEV_GSO_CB(skb)->destructor = skb->destructor;
2417 skb->destructor = dev_gso_skb_destructor;
2418
2419 return 0;
2420}
2421
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002422static netdev_features_t harmonize_features(struct sk_buff *skb,
2423 __be16 protocol, netdev_features_t features)
Jesse Grossf01a5232011-01-09 06:23:31 +00002424{
Ed Cashinc0d680e2012-09-19 15:49:00 +00002425 if (skb->ip_summed != CHECKSUM_NONE &&
2426 !can_checksum_protocol(features, protocol)) {
Jesse Grossf01a5232011-01-09 06:23:31 +00002427 features &= ~NETIF_F_ALL_CSUM;
Jesse Grossf01a5232011-01-09 06:23:31 +00002428 } else if (illegal_highdma(skb->dev, skb)) {
2429 features &= ~NETIF_F_SG;
2430 }
2431
2432 return features;
2433}
2434
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002435netdev_features_t netif_skb_features(struct sk_buff *skb)
Jesse Gross58e998c2010-10-29 12:14:55 +00002436{
2437 __be16 protocol = skb->protocol;
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002438 netdev_features_t features = skb->dev->features;
Jesse Gross58e998c2010-10-29 12:14:55 +00002439
Ben Hutchings30b678d2012-07-30 15:57:00 +00002440 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2441 features &= ~NETIF_F_GSO_MASK;
2442
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002443 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
Jesse Gross58e998c2010-10-29 12:14:55 +00002444 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2445 protocol = veh->h_vlan_encapsulated_proto;
Jesse Grossf01a5232011-01-09 06:23:31 +00002446 } else if (!vlan_tx_tag_present(skb)) {
2447 return harmonize_features(skb, protocol, features);
2448 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002449
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002450 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2451 NETIF_F_HW_VLAN_STAG_TX);
Jesse Grossf01a5232011-01-09 06:23:31 +00002452
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002453 if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
Jesse Grossf01a5232011-01-09 06:23:31 +00002454 return harmonize_features(skb, protocol, features);
2455 } else {
2456 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002457 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2458 NETIF_F_HW_VLAN_STAG_TX;
Jesse Grossf01a5232011-01-09 06:23:31 +00002459 return harmonize_features(skb, protocol, features);
2460 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002461}
Jesse Grossf01a5232011-01-09 06:23:31 +00002462EXPORT_SYMBOL(netif_skb_features);
Jesse Gross58e998c2010-10-29 12:14:55 +00002463
John Fastabend6afff0c2010-06-16 14:18:12 +00002464/*
2465 * Returns true if either:
2466 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
Rami Rosend1a53df2012-08-27 23:39:24 +00002467 * 2. skb is fragmented and the device does not support SG.
John Fastabend6afff0c2010-06-16 14:18:12 +00002468 */
2469static inline int skb_needs_linearize(struct sk_buff *skb,
Patrick McHardy6708c9e2013-05-01 22:36:49 +00002470 netdev_features_t features)
John Fastabend6afff0c2010-06-16 14:18:12 +00002471{
Jesse Gross02932ce2011-01-09 06:23:34 +00002472 return skb_is_nonlinear(skb) &&
2473 ((skb_has_frag_list(skb) &&
2474 !(features & NETIF_F_FRAGLIST)) ||
Jesse Grosse1e78db2010-10-29 12:14:53 +00002475 (skb_shinfo(skb)->nr_frags &&
Jesse Gross02932ce2011-01-09 06:23:34 +00002476 !(features & NETIF_F_SG)));
John Fastabend6afff0c2010-06-16 14:18:12 +00002477}
2478
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002479int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2480 struct netdev_queue *txq)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002481{
Stephen Hemminger00829822008-11-20 20:14:53 -08002482 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00002483 int rc = NETDEV_TX_OK;
Koki Sanagiec764bf2011-05-30 21:48:34 +00002484 unsigned int skb_len;
Stephen Hemminger00829822008-11-20 20:14:53 -08002485
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002486 if (likely(!skb->next)) {
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002487 netdev_features_t features;
Jesse Grossfc741212011-01-09 06:23:32 +00002488
Eric Dumazet93f154b2009-05-18 22:19:19 -07002489 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03002490		 * If the device doesn't need skb->dst, release it right now while
Eric Dumazet93f154b2009-05-18 22:19:19 -07002491		 * it's hot in this CPU's cache
2492 */
Eric Dumazetadf30902009-06-02 05:19:30 +00002493 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2494 skb_dst_drop(skb);
2495
Jesse Grossfc741212011-01-09 06:23:32 +00002496 features = netif_skb_features(skb);
2497
Jesse Gross7b9c6092010-10-20 13:56:04 +00002498 if (vlan_tx_tag_present(skb) &&
Patrick McHardy86a9bad2013-04-19 02:04:30 +00002499 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2500 skb = __vlan_put_tag(skb, skb->vlan_proto,
2501 vlan_tx_tag_get(skb));
Jesse Gross7b9c6092010-10-20 13:56:04 +00002502 if (unlikely(!skb))
2503 goto out;
2504
2505 skb->vlan_tci = 0;
2506 }
2507
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002508		/* If this is an encapsulation offload request, verify that we
 2509		 * are testing hardware encapsulation features instead of the
 2510		 * netdev's standard features
2511 */
2512 if (skb->encapsulation)
2513 features &= dev->hw_enc_features;
2514
Jesse Grossfc741212011-01-09 06:23:32 +00002515 if (netif_needs_gso(skb, features)) {
Jesse Gross91ecb632011-01-09 06:23:33 +00002516 if (unlikely(dev_gso_segment(skb, features)))
David S. Miller9ccb8972010-04-22 01:02:07 -07002517 goto out_kfree_skb;
2518 if (skb->next)
2519 goto gso;
John Fastabend6afff0c2010-06-16 14:18:12 +00002520 } else {
Jesse Gross02932ce2011-01-09 06:23:34 +00002521 if (skb_needs_linearize(skb, features) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002522 __skb_linearize(skb))
2523 goto out_kfree_skb;
2524
2525 /* If packet is not checksummed and device does not
2526 * support checksumming for this protocol, complete
2527 * checksumming here.
2528 */
2529 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002530 if (skb->encapsulation)
2531 skb_set_inner_transport_header(skb,
2532 skb_checksum_start_offset(skb));
2533 else
2534 skb_set_transport_header(skb,
2535 skb_checksum_start_offset(skb));
Jesse Gross03634662011-01-09 06:23:35 +00002536 if (!(features & NETIF_F_ALL_CSUM) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002537 skb_checksum_help(skb))
2538 goto out_kfree_skb;
2539 }
David S. Miller9ccb8972010-04-22 01:02:07 -07002540 }
2541
Eric Dumazetb40863c2012-09-18 20:44:49 +00002542 if (!list_empty(&ptype_all))
2543 dev_queue_xmit_nit(skb, dev);
2544
Koki Sanagiec764bf2011-05-30 21:48:34 +00002545 skb_len = skb->len;
Patrick Ohlyac45f602009-02-12 05:03:37 +00002546 rc = ops->ndo_start_xmit(skb, dev);
Koki Sanagiec764bf2011-05-30 21:48:34 +00002547 trace_net_dev_xmit(skb, rc, dev, skb_len);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002548 if (rc == NETDEV_TX_OK)
Eric Dumazet08baf562009-05-25 22:58:01 -07002549 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00002550 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002551 }
2552
Herbert Xu576a30e2006-06-27 13:22:38 -07002553gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002554 do {
2555 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002556
2557 skb->next = nskb->next;
2558 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00002559
Eric Dumazetb40863c2012-09-18 20:44:49 +00002560 if (!list_empty(&ptype_all))
2561 dev_queue_xmit_nit(nskb, dev);
2562
Koki Sanagiec764bf2011-05-30 21:48:34 +00002563 skb_len = nskb->len;
Stephen Hemminger00829822008-11-20 20:14:53 -08002564 rc = ops->ndo_start_xmit(nskb, dev);
Koki Sanagiec764bf2011-05-30 21:48:34 +00002565 trace_net_dev_xmit(nskb, rc, dev, skb_len);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002566 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002567 if (rc & ~NETDEV_TX_MASK)
2568 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07002569 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002570 skb->next = nskb;
2571 return rc;
2572 }
Eric Dumazet08baf562009-05-25 22:58:01 -07002573 txq_trans_update(txq);
Tom Herbert734664982011-11-28 16:32:44 +00002574 if (unlikely(netif_xmit_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07002575 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002576 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002577
Patrick McHardy572a9d72009-11-10 06:14:14 +00002578out_kfree_gso_skb:
Sridhar Samudrala0c772152013-04-29 13:02:42 +00002579 if (likely(skb->next == NULL)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002580 skb->destructor = DEV_GSO_CB(skb)->destructor;
Sridhar Samudrala0c772152013-04-29 13:02:42 +00002581 consume_skb(skb);
2582 return rc;
2583 }
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002584out_kfree_skb:
2585 kfree_skb(skb);
Jesse Gross7b9c6092010-10-20 13:56:04 +00002586out:
Patrick McHardy572a9d72009-11-10 06:14:14 +00002587 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002588}
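
/* Illustrative sketch, not part of the kernel source: the rough shape of
 * the ndo_start_xmit hook that dev_hard_start_xmit() invokes through
 * ops->ndo_start_xmit above.  "struct foo_priv" and foo_hw_queue_frame()
 * are invented placeholders; the sketch assumes the helper takes
 * ownership of the skb only when it succeeds.
 */
static netdev_tx_t foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct foo_priv *priv = netdev_priv(dev);
	unsigned int len = skb->len;

	if (foo_hw_queue_frame(priv, skb) < 0) {
		/* Ring full: keep the skb and let the core retry later. */
		netif_stop_queue(dev);
		return NETDEV_TX_BUSY;
	}

	dev->stats.tx_packets++;
	dev->stats.tx_bytes += len;
	return NETDEV_TX_OK;
}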
2589
Eric Dumazet1def9232013-01-10 12:36:42 +00002590static void qdisc_pkt_len_init(struct sk_buff *skb)
2591{
2592 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2593
2594 qdisc_skb_cb(skb)->pkt_len = skb->len;
2595
2596	/* To get a more precise estimate of the bytes sent on the wire,
2597	 * we add the header size of every segment to pkt_len.
2598 */
2599 if (shinfo->gso_size) {
Eric Dumazet757b8b12013-01-15 21:14:21 -08002600 unsigned int hdr_len;
Jason Wang15e5a032013-03-25 20:19:59 +00002601 u16 gso_segs = shinfo->gso_segs;
Eric Dumazet1def9232013-01-10 12:36:42 +00002602
Eric Dumazet757b8b12013-01-15 21:14:21 -08002603 /* mac layer + network layer */
2604 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2605
2606 /* + transport layer */
Eric Dumazet1def9232013-01-10 12:36:42 +00002607 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2608 hdr_len += tcp_hdrlen(skb);
2609 else
2610 hdr_len += sizeof(struct udphdr);
Jason Wang15e5a032013-03-25 20:19:59 +00002611
2612 if (shinfo->gso_type & SKB_GSO_DODGY)
2613 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2614 shinfo->gso_size);
2615
2616 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
Eric Dumazet1def9232013-01-10 12:36:42 +00002617 }
2618}
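
/* Worked example (illustrative, not from the original file): for a TSO
 * skb carrying gso_segs == 3 segments whose MAC + IP + TCP headers add
 * up to hdr_len == 66 bytes (Ethernet + IPv4 + TCP with timestamps),
 * the estimate above becomes
 *
 *	pkt_len = skb->len + (3 - 1) * 66 = skb->len + 132
 *
 * accounting for the headers the hardware will generate for the second
 * and third segments when it splits the super-packet.
 */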
2619
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002620static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2621 struct net_device *dev,
2622 struct netdev_queue *txq)
2623{
2624 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002625 bool contended;
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002626 int rc;
2627
Eric Dumazet1def9232013-01-10 12:36:42 +00002628 qdisc_pkt_len_init(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002629 qdisc_calculate_pkt_len(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002630 /*
2631 * Heuristic to force contended enqueues to serialize on a
2632	 * separate lock before trying to get the qdisc main lock.
2633	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2634 * and dequeue packets faster.
2635 */
Eric Dumazeta2da5702011-01-20 03:48:19 +00002636 contended = qdisc_is_running(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002637 if (unlikely(contended))
2638 spin_lock(&q->busylock);
2639
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002640 spin_lock(root_lock);
2641 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2642 kfree_skb(skb);
2643 rc = NET_XMIT_DROP;
2644 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07002645 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002646 /*
2647 * This is a work-conserving queue; there are no old skbs
2648 * waiting to be sent out; and the qdisc is not running -
2649 * xmit the skb directly.
2650 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002651 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2652 skb_dst_force(skb);
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002653
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002654 qdisc_bstats_update(q, skb);
2655
Eric Dumazet79640a42010-06-02 05:09:29 -07002656 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2657 if (unlikely(contended)) {
2658 spin_unlock(&q->busylock);
2659 contended = false;
2660 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002661 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002662 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07002663 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002664
2665 rc = NET_XMIT_SUCCESS;
2666 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002667 skb_dst_force(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002668 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
Eric Dumazet79640a42010-06-02 05:09:29 -07002669 if (qdisc_run_begin(q)) {
2670 if (unlikely(contended)) {
2671 spin_unlock(&q->busylock);
2672 contended = false;
2673 }
2674 __qdisc_run(q);
2675 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002676 }
2677 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07002678 if (unlikely(contended))
2679 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002680 return rc;
2681}
2682
Neil Horman5bc14212011-11-22 05:10:51 +00002683#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2684static void skb_update_prio(struct sk_buff *skb)
2685{
Igor Maravic6977a792011-11-25 07:44:54 +00002686 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
Neil Horman5bc14212011-11-22 05:10:51 +00002687
Eric Dumazet91c68ce2012-07-08 21:45:10 +00002688 if (!skb->priority && skb->sk && map) {
2689 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2690
2691 if (prioidx < map->priomap_len)
2692 skb->priority = map->priomap[prioidx];
2693 }
Neil Horman5bc14212011-11-22 05:10:51 +00002694}
2695#else
2696#define skb_update_prio(skb)
2697#endif
2698
Eric Dumazet745e20f2010-09-29 13:23:09 -07002699static DEFINE_PER_CPU(int, xmit_recursion);
David S. Miller11a766c2010-10-25 12:51:55 -07002700#define RECURSION_LIMIT 10
Eric Dumazet745e20f2010-09-29 13:23:09 -07002701
Dave Jonesd29f7492008-07-22 14:09:06 -07002702/**
Michel Machado95603e22012-06-12 10:16:35 +00002703 * dev_loopback_xmit - loop back @skb
2704 * @skb: buffer to transmit
2705 */
2706int dev_loopback_xmit(struct sk_buff *skb)
2707{
2708 skb_reset_mac_header(skb);
2709 __skb_pull(skb, skb_network_offset(skb));
2710 skb->pkt_type = PACKET_LOOPBACK;
2711 skb->ip_summed = CHECKSUM_UNNECESSARY;
2712 WARN_ON(!skb_dst(skb));
2713 skb_dst_force(skb);
2714 netif_rx_ni(skb);
2715 return 0;
2716}
2717EXPORT_SYMBOL(dev_loopback_xmit);
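
/* Illustrative sketch, not part of the kernel source: how a caller such
 * as a multicast output path might hand a copy of an outgoing packet
 * back to the local stack.  foo_loop_back_copy() is an invented name;
 * the skb is assumed to already carry a valid dst, since
 * dev_loopback_xmit() warns otherwise.
 */
static void foo_loop_back_copy(struct sk_buff *skb)
{
	struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

	if (nskb)
		dev_loopback_xmit(nskb);	/* delivered via netif_rx_ni() */
}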
2718
2719/**
Dave Jonesd29f7492008-07-22 14:09:06 -07002720 * dev_queue_xmit - transmit a buffer
2721 * @skb: buffer to transmit
2722 *
2723 * Queue a buffer for transmission to a network device. The caller must
2724 * have set the device and priority and built the buffer before calling
2725 * this function. The function can be called from an interrupt.
2726 *
2727 * A negative errno code is returned on a failure. A success does not
2728 * guarantee the frame will be transmitted as it may be dropped due
2729 * to congestion or traffic shaping.
2730 *
2731 * -----------------------------------------------------------------------------------
2732 * I notice this method can also return errors from the queue disciplines,
2733 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2734 * be positive.
2735 *
2736 * Regardless of the return value, the skb is consumed, so it is currently
2737 * difficult to retry a send to this method. (You can bump the ref count
2738 * before sending to hold a reference for retry if you are careful.)
2739 *
2740 * When calling this method, interrupts MUST be enabled. This is because
2741 * the BH enable code must have IRQs enabled so that it will not deadlock.
2742 * --BLG
2743 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002744int dev_queue_xmit(struct sk_buff *skb)
2745{
2746 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002747 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002748 struct Qdisc *q;
2749 int rc = -ENOMEM;
2750
Eric Dumazet6d1ccff2013-02-05 20:22:20 +00002751 skb_reset_mac_header(skb);
2752
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002753	/* Disable soft irqs for various locks below. This also
2754	 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002755 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002756 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002757
Neil Horman5bc14212011-11-22 05:10:51 +00002758 skb_update_prio(skb);
2759
Amerigo Wang8c4c49d2012-09-17 20:16:31 +00002760 txq = netdev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002761 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002762
Linus Torvalds1da177e2005-04-16 15:20:36 -07002763#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002764 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002765#endif
Koki Sanagicf66ba52010-08-23 18:45:02 +09002766 trace_net_dev_queue(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002767 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002768 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002769 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002770 }
2771
2772	/* The device has no queue. Common case for software devices:
2773	   loopback, all sorts of tunnels...
2774
Herbert Xu932ff272006-06-09 12:20:56 -07002775	   Really, it is unlikely that netif_tx_lock protection is necessary
2776	   here. (e.g. loopback and IP tunnels are clean, ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002777	   counters.)
2778	   However, it is possible that they rely on the protection
2779	   we provide here.
2780
2781	   Check this and take the lock. It is not prone to deadlocks.
2782	   Or just use the noqueue qdisc; it is even simpler 8)
2783 */
2784 if (dev->flags & IFF_UP) {
2785 int cpu = smp_processor_id(); /* ok because BHs are off */
2786
David S. Millerc773e842008-07-08 23:13:53 -07002787 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002788
Eric Dumazet745e20f2010-09-29 13:23:09 -07002789 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2790 goto recursion_alert;
2791
David S. Millerc773e842008-07-08 23:13:53 -07002792 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002793
Tom Herbert734664982011-11-28 16:32:44 +00002794 if (!netif_xmit_stopped(txq)) {
Eric Dumazet745e20f2010-09-29 13:23:09 -07002795 __this_cpu_inc(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002796 rc = dev_hard_start_xmit(skb, dev, txq);
Eric Dumazet745e20f2010-09-29 13:23:09 -07002797 __this_cpu_dec(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002798 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002799 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002800 goto out;
2801 }
2802 }
David S. Millerc773e842008-07-08 23:13:53 -07002803 HARD_TX_UNLOCK(dev, txq);
Joe Perchese87cc472012-05-13 21:56:26 +00002804 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2805 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002806 } else {
2807			/* Recursion has been detected! It is possible,
Eric Dumazet745e20f2010-09-29 13:23:09 -07002808			 * unfortunately.
2809 */
2810recursion_alert:
Joe Perchese87cc472012-05-13 21:56:26 +00002811 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2812 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002813 }
2814 }
2815
2816 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002817 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002818
Linus Torvalds1da177e2005-04-16 15:20:36 -07002819 kfree_skb(skb);
2820 return rc;
2821out:
Herbert Xud4828d82006-06-22 02:28:18 -07002822 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002823 return rc;
2824}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002825EXPORT_SYMBOL(dev_queue_xmit);
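
/* Illustrative sketch, not part of the kernel source: the calling
 * convention described in the comment block above.  foo_build_frame()
 * is an invented helper assumed to return a fully built skb with
 * skb->dev already set.
 */
static int foo_send(struct net_device *dev)
{
	struct sk_buff *skb = foo_build_frame(dev);
	int rc;

	if (!skb)
		return -ENOMEM;

	skb->priority = TC_PRIO_CONTROL;
	rc = dev_queue_xmit(skb);	/* consumes the skb in all cases */
	if (rc < 0)			/* negative errno, e.g. -ENETDOWN */
		return rc;
	/* rc may also be a positive NET_XMIT_* code such as NET_XMIT_DROP */
	return 0;
}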
Linus Torvalds1da177e2005-04-16 15:20:36 -07002826
2827
2828/*=======================================================================
2829 Receiver routines
2830 =======================================================================*/
2831
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002832int netdev_max_backlog __read_mostly = 1000;
Eric Dumazetc9e6bc62012-09-27 19:29:05 +00002833EXPORT_SYMBOL(netdev_max_backlog);
2834
Eric Dumazet3b098e22010-05-15 23:57:10 -07002835int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002836int netdev_budget __read_mostly = 300;
2837int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002838
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002839/* Called with irq disabled */
2840static inline void ____napi_schedule(struct softnet_data *sd,
2841 struct napi_struct *napi)
2842{
2843 list_add_tail(&napi->poll_list, &sd->poll_list);
2844 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2845}
2846
Eric Dumazetdf334542010-03-24 19:13:54 +00002847#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002848
2849/* One global table that all flow-based protocols share. */
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002850struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002851EXPORT_SYMBOL(rps_sock_flow_table);
2852
Ingo Molnarc5905af2012-02-24 08:31:31 +01002853struct static_key rps_needed __read_mostly;
Eric Dumazetadc93002011-11-17 03:13:26 +00002854
Ben Hutchingsc4454772011-01-19 11:03:53 +00002855static struct rps_dev_flow *
2856set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2857 struct rps_dev_flow *rflow, u16 next_cpu)
2858{
Ben Hutchings09994d12011-10-03 04:42:46 +00002859 if (next_cpu != RPS_NO_CPU) {
Ben Hutchingsc4454772011-01-19 11:03:53 +00002860#ifdef CONFIG_RFS_ACCEL
2861 struct netdev_rx_queue *rxqueue;
2862 struct rps_dev_flow_table *flow_table;
2863 struct rps_dev_flow *old_rflow;
2864 u32 flow_id;
2865 u16 rxq_index;
2866 int rc;
2867
2868 /* Should we steer this flow to a different hardware queue? */
Ben Hutchings69a19ee2011-02-15 20:32:04 +00002869 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2870 !(dev->features & NETIF_F_NTUPLE))
Ben Hutchingsc4454772011-01-19 11:03:53 +00002871 goto out;
2872 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2873 if (rxq_index == skb_get_rx_queue(skb))
2874 goto out;
2875
2876 rxqueue = dev->_rx + rxq_index;
2877 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2878 if (!flow_table)
2879 goto out;
2880 flow_id = skb->rxhash & flow_table->mask;
2881 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2882 rxq_index, flow_id);
2883 if (rc < 0)
2884 goto out;
2885 old_rflow = rflow;
2886 rflow = &flow_table->flows[flow_id];
Ben Hutchingsc4454772011-01-19 11:03:53 +00002887 rflow->filter = rc;
2888 if (old_rflow->filter == rflow->filter)
2889 old_rflow->filter = RPS_NO_FILTER;
2890 out:
2891#endif
2892 rflow->last_qtail =
Ben Hutchings09994d12011-10-03 04:42:46 +00002893 per_cpu(softnet_data, next_cpu).input_queue_head;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002894 }
2895
Ben Hutchings09994d12011-10-03 04:42:46 +00002896 rflow->cpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002897 return rflow;
2898}
2899
Tom Herbert0a9627f2010-03-16 08:03:29 +00002900/*
2901 * get_rps_cpu is called from netif_receive_skb and returns the target
2902 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002903 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002904 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002905static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2906 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002907{
Tom Herbert0a9627f2010-03-16 08:03:29 +00002908 struct netdev_rx_queue *rxqueue;
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002909 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002910 struct rps_dev_flow_table *flow_table;
2911 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002912 int cpu = -1;
Tom Herbertfec5e652010-04-16 16:01:27 -07002913 u16 tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002914
Tom Herbert0a9627f2010-03-16 08:03:29 +00002915 if (skb_rx_queue_recorded(skb)) {
2916 u16 index = skb_get_rx_queue(skb);
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002917 if (unlikely(index >= dev->real_num_rx_queues)) {
2918 WARN_ONCE(dev->real_num_rx_queues > 1,
2919 "%s received packet on queue %u, but number "
2920 "of RX queues is %u\n",
2921 dev->name, index, dev->real_num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002922 goto done;
2923 }
2924 rxqueue = dev->_rx + index;
2925 } else
2926 rxqueue = dev->_rx;
2927
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002928 map = rcu_dereference(rxqueue->rps_map);
2929 if (map) {
Tom Herbert85875232011-01-31 16:23:42 -08002930 if (map->len == 1 &&
Eric Dumazet33d480c2011-08-11 19:30:52 +00002931 !rcu_access_pointer(rxqueue->rps_flow_table)) {
Changli Gao6febfca2010-09-03 23:12:37 +00002932 tcpu = map->cpus[0];
2933 if (cpu_online(tcpu))
2934 cpu = tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002935 goto done;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002936 }
Eric Dumazet33d480c2011-08-11 19:30:52 +00002937 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002938 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002939 }
2940
Changli Gao2d47b452010-08-17 19:00:56 +00002941 skb_reset_network_header(skb);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002942 if (!skb_get_rxhash(skb))
Tom Herbert0a9627f2010-03-16 08:03:29 +00002943 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002944
Tom Herbertfec5e652010-04-16 16:01:27 -07002945 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2946 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2947 if (flow_table && sock_flow_table) {
2948 u16 next_cpu;
2949 struct rps_dev_flow *rflow;
2950
2951 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2952 tcpu = rflow->cpu;
2953
2954 next_cpu = sock_flow_table->ents[skb->rxhash &
2955 sock_flow_table->mask];
2956
2957 /*
2958		 * If the desired CPU (where the last recvmsg was done) is
2959		 * different from the current CPU (the one in the rx-queue flow
2960		 * table entry), switch if one of the following holds:
2961 * - Current CPU is unset (equal to RPS_NO_CPU).
2962 * - Current CPU is offline.
2963 * - The current CPU's queue tail has advanced beyond the
2964 * last packet that was enqueued using this table entry.
2965 * This guarantees that all previous packets for the flow
2966		 *   have been dequeued, thus preserving in-order delivery.
2967 */
2968 if (unlikely(tcpu != next_cpu) &&
2969 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2970 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
Tom Herbertbaefa312012-11-16 09:04:15 +00002971 rflow->last_qtail)) >= 0)) {
2972 tcpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002973 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
Tom Herbertbaefa312012-11-16 09:04:15 +00002974 }
Ben Hutchingsc4454772011-01-19 11:03:53 +00002975
Tom Herbertfec5e652010-04-16 16:01:27 -07002976 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2977 *rflowp = rflow;
2978 cpu = tcpu;
2979 goto done;
2980 }
2981 }
2982
Tom Herbert0a9627f2010-03-16 08:03:29 +00002983 if (map) {
Tom Herbertfec5e652010-04-16 16:01:27 -07002984 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00002985
2986 if (cpu_online(tcpu)) {
2987 cpu = tcpu;
2988 goto done;
2989 }
2990 }
2991
2992done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00002993 return cpu;
2994}
2995
Ben Hutchingsc4454772011-01-19 11:03:53 +00002996#ifdef CONFIG_RFS_ACCEL
2997
2998/**
2999 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3000 * @dev: Device on which the filter was set
3001 * @rxq_index: RX queue index
3002 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3003 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3004 *
3005 * Drivers that implement ndo_rx_flow_steer() should periodically call
3006 * this function for each installed filter and remove the filters for
3007 * which it returns %true.
3008 */
3009bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3010 u32 flow_id, u16 filter_id)
3011{
3012 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3013 struct rps_dev_flow_table *flow_table;
3014 struct rps_dev_flow *rflow;
3015 bool expire = true;
3016 int cpu;
3017
3018 rcu_read_lock();
3019 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3020 if (flow_table && flow_id <= flow_table->mask) {
3021 rflow = &flow_table->flows[flow_id];
3022 cpu = ACCESS_ONCE(rflow->cpu);
3023 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3024 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3025 rflow->last_qtail) <
3026 (int)(10 * flow_table->mask)))
3027 expire = false;
3028 }
3029 rcu_read_unlock();
3030 return expire;
3031}
3032EXPORT_SYMBOL(rps_may_expire_flow);
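
/* Illustrative sketch, not part of the kernel source: the periodic scan
 * that the comment above asks ndo_rx_flow_steer() implementers to run.
 * "struct foo_adapter", "struct foo_filter" and foo_remove_hw_filter()
 * are invented driver-side bookkeeping; only rps_may_expire_flow() is
 * the real API.
 */
static void foo_expire_rfs_filters(struct foo_adapter *adapter)
{
	int i;

	for (i = 0; i < adapter->n_filters; i++) {
		struct foo_filter *f = &adapter->filters[i];

		if (!f->in_use)
			continue;
		if (rps_may_expire_flow(adapter->netdev, f->rxq_index,
					f->flow_id, f->filter_id)) {
			foo_remove_hw_filter(adapter, f);
			f->in_use = false;
		}
	}
}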
3033
3034#endif /* CONFIG_RFS_ACCEL */
3035
Tom Herbert0a9627f2010-03-16 08:03:29 +00003036/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003037static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003038{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003039 struct softnet_data *sd = data;
3040
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003041 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00003042 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003043}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003044
Tom Herbertfec5e652010-04-16 16:01:27 -07003045#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00003046
3047/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003048 * Check if this softnet_data structure belongs to another CPU.
3049 * If yes, queue it to our IPI list and return 1.
3050 * If no, return 0.
3051 */
3052static int rps_ipi_queued(struct softnet_data *sd)
3053{
3054#ifdef CONFIG_RPS
3055 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3056
3057 if (sd != mysd) {
3058 sd->rps_ipi_next = mysd->rps_ipi_list;
3059 mysd->rps_ipi_list = sd;
3060
3061 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3062 return 1;
3063 }
3064#endif /* CONFIG_RPS */
3065 return 0;
3066}
3067
3068/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00003069 * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
3070 * queue (which may be a remote CPU's queue).
3071 */
Tom Herbertfec5e652010-04-16 16:01:27 -07003072static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3073 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003074{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003075 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003076 unsigned long flags;
3077
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003078 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003079
3080 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003081
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003082 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003083 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3084 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00003085enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003086 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003087 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003088 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00003089 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003090 return NET_RX_SUCCESS;
3091 }
3092
Eric Dumazetebda37c22010-05-06 23:51:21 +00003093		/* Schedule NAPI for the backlog device.
3094		 * We can use a non-atomic operation since we own the queue lock.
3095 */
3096 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003097 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003098 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003099 }
3100 goto enqueue;
3101 }
3102
Changli Gaodee42872010-05-02 05:42:16 +00003103 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003104 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003105
Tom Herbert0a9627f2010-03-16 08:03:29 +00003106 local_irq_restore(flags);
3107
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003108 atomic_long_inc(&skb->dev->rx_dropped);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003109 kfree_skb(skb);
3110 return NET_RX_DROP;
3111}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003112
Linus Torvalds1da177e2005-04-16 15:20:36 -07003113/**
3114 * netif_rx - post buffer to the network code
3115 * @skb: buffer to post
3116 *
3117 * This function receives a packet from a device driver and queues it for
3118 * the upper (protocol) levels to process. It always succeeds. The buffer
3119 * may be dropped during processing for congestion control or by the
3120 * protocol layers.
3121 *
3122 * return values:
3123 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003124 * NET_RX_DROP (packet was dropped)
3125 *
3126 */
3127
3128int netif_rx(struct sk_buff *skb)
3129{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003130 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003131
3132 /* if netpoll wants it, pretend we never saw it */
3133 if (netpoll_rx(skb))
3134 return NET_RX_DROP;
3135
Eric Dumazet588f0332011-11-15 04:12:55 +00003136 net_timestamp_check(netdev_tstamp_prequeue, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003137
Koki Sanagicf66ba52010-08-23 18:45:02 +09003138 trace_netif_rx(skb);
Eric Dumazetdf334542010-03-24 19:13:54 +00003139#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003140 if (static_key_false(&rps_needed)) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003141 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003142 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003143
Changli Gaocece1942010-08-07 20:35:43 -07003144 preempt_disable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003145 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003146
3147 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003148 if (cpu < 0)
3149 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07003150
3151 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3152
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003153 rcu_read_unlock();
Changli Gaocece1942010-08-07 20:35:43 -07003154 preempt_enable();
Eric Dumazetadc93002011-11-17 03:13:26 +00003155 } else
3156#endif
Tom Herbertfec5e652010-04-16 16:01:27 -07003157 {
3158 unsigned int qtail;
3159 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3160 put_cpu();
3161 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003162 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003163}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003164EXPORT_SYMBOL(netif_rx);
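
/* Illustrative sketch, not part of the kernel source: how a classic,
 * non-NAPI driver hands a received frame to netif_rx() from its
 * interrupt handler.  foo_rx_frame_len() and foo_copy_rx_frame() stand
 * in for the device-specific way of pulling the frame off the hardware.
 */
static void foo_rx_interrupt(struct net_device *dev)
{
	unsigned int len = foo_rx_frame_len(dev);
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	foo_copy_rx_frame(dev, skb_put(skb, len));
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);			/* queued to a per-CPU backlog */

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}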
Linus Torvalds1da177e2005-04-16 15:20:36 -07003165
3166int netif_rx_ni(struct sk_buff *skb)
3167{
3168 int err;
3169
3170 preempt_disable();
3171 err = netif_rx(skb);
3172 if (local_softirq_pending())
3173 do_softirq();
3174 preempt_enable();
3175
3176 return err;
3177}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003178EXPORT_SYMBOL(netif_rx_ni);
3179
Linus Torvalds1da177e2005-04-16 15:20:36 -07003180static void net_tx_action(struct softirq_action *h)
3181{
3182 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3183
3184 if (sd->completion_queue) {
3185 struct sk_buff *clist;
3186
3187 local_irq_disable();
3188 clist = sd->completion_queue;
3189 sd->completion_queue = NULL;
3190 local_irq_enable();
3191
3192 while (clist) {
3193 struct sk_buff *skb = clist;
3194 clist = clist->next;
3195
Ilpo Järvinen547b7922008-07-25 21:43:18 -07003196 WARN_ON(atomic_read(&skb->users));
Koki Sanagi07dc22e2010-08-23 18:46:12 +09003197 trace_kfree_skb(skb, net_tx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003198 __kfree_skb(skb);
3199 }
3200 }
3201
3202 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07003203 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003204
3205 local_irq_disable();
3206 head = sd->output_queue;
3207 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00003208 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003209 local_irq_enable();
3210
3211 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07003212 struct Qdisc *q = head;
3213 spinlock_t *root_lock;
3214
Linus Torvalds1da177e2005-04-16 15:20:36 -07003215 head = head->next_sched;
3216
David S. Miller5fb66222008-08-02 20:02:43 -07003217 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07003218 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07003219 smp_mb__before_clear_bit();
3220 clear_bit(__QDISC_STATE_SCHED,
3221 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07003222 qdisc_run(q);
3223 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003224 } else {
David S. Miller195648b2008-08-19 04:00:36 -07003225 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003226 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07003227 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003228 } else {
3229 smp_mb__before_clear_bit();
3230 clear_bit(__QDISC_STATE_SCHED,
3231 &q->state);
3232 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003233 }
3234 }
3235 }
3236}
3237
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003238#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3239 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
Michał Mirosławda678292009-06-05 05:35:28 +00003240/* This hook is defined here for ATM LANE */
3241int (*br_fdb_test_addr_hook)(struct net_device *dev,
3242 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07003243EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00003244#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003245
Linus Torvalds1da177e2005-04-16 15:20:36 -07003246#ifdef CONFIG_NET_CLS_ACT
3247/* TODO: Maybe we should just force sch_ingress to be compiled in
3248 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for a few useless
3249 * instructions (a compare and two extra stores) when the ingress
3250 * qdisc is not attached but CONFIG_NET_CLS_ACT is enabled.
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003251 * NOTE: This doesn't stop any functionality; if you don't have
3252 * the ingress scheduler, you just can't add policies on ingress.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003253 *
3254 */
Eric Dumazet24824a02010-10-02 06:11:55 +00003255static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003256{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003257 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003258 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07003259 int result = TC_ACT_OK;
3260 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003261
Stephen Hemmingerde384832010-08-01 00:33:23 -07003262 if (unlikely(MAX_RED_LOOP < ttl++)) {
Joe Perchese87cc472012-05-13 21:56:26 +00003263 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3264 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07003265 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003266 }
3267
Herbert Xuf697c3e2007-10-14 00:38:47 -07003268 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3269 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3270
David S. Miller83874002008-07-17 00:53:03 -07003271 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07003272 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07003273 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07003274 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3275 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07003276 spin_unlock(qdisc_lock(q));
3277 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07003278
Linus Torvalds1da177e2005-04-16 15:20:36 -07003279 return result;
3280}
Herbert Xuf697c3e2007-10-14 00:38:47 -07003281
3282static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3283 struct packet_type **pt_prev,
3284 int *ret, struct net_device *orig_dev)
3285{
Eric Dumazet24824a02010-10-02 06:11:55 +00003286 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3287
3288 if (!rxq || rxq->qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07003289 goto out;
3290
3291 if (*pt_prev) {
3292 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3293 *pt_prev = NULL;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003294 }
3295
Eric Dumazet24824a02010-10-02 06:11:55 +00003296 switch (ing_filter(skb, rxq)) {
Herbert Xuf697c3e2007-10-14 00:38:47 -07003297 case TC_ACT_SHOT:
3298 case TC_ACT_STOLEN:
3299 kfree_skb(skb);
3300 return NULL;
3301 }
3302
3303out:
3304 skb->tc_verd = 0;
3305 return skb;
3306}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003307#endif
3308
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003309/**
3310 * netdev_rx_handler_register - register receive handler
3311 * @dev: device to register a handler for
3312 * @rx_handler: receive handler to register
Jiri Pirko93e2c322010-06-10 03:34:59 +00003313 * @rx_handler_data: data pointer that is used by rx handler
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003314 *
3315 *	Register a receive handler for a device. This handler will then be
3316 * called from __netif_receive_skb. A negative errno code is returned
3317 * on a failure.
3318 *
3319 * The caller must hold the rtnl_mutex.
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003320 *
3321 * For a general description of rx_handler, see enum rx_handler_result.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003322 */
3323int netdev_rx_handler_register(struct net_device *dev,
Jiri Pirko93e2c322010-06-10 03:34:59 +00003324 rx_handler_func_t *rx_handler,
3325 void *rx_handler_data)
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003326{
3327 ASSERT_RTNL();
3328
3329 if (dev->rx_handler)
3330 return -EBUSY;
3331
Eric Dumazet00cfec32013-03-29 03:01:22 +00003332 /* Note: rx_handler_data must be set before rx_handler */
Jiri Pirko93e2c322010-06-10 03:34:59 +00003333 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003334 rcu_assign_pointer(dev->rx_handler, rx_handler);
3335
3336 return 0;
3337}
3338EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
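
/* Illustrative sketch, not part of the kernel source: how a
 * bridge/macvlan-style upper device might attach to a lower device
 * with this API.  "struct foo_port", foo_wants_packet() and the
 * upper_dev field are invented; the rx_handler result codes and the
 * register/unregister calls are the real API.
 */
static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct foo_port *port = rcu_dereference(skb->dev->rx_handler_data);

	if (!foo_wants_packet(port, skb))
		return RX_HANDLER_PASS;		/* normal delivery continues */

	skb->dev = port->upper_dev;
	*pskb = skb;
	return RX_HANDLER_ANOTHER;	/* re-run RX processing on the upper dev */
}

static int foo_attach_port(struct net_device *lower, struct foo_port *port)
{
	int err;

	ASSERT_RTNL();
	err = netdev_rx_handler_register(lower, foo_handle_frame, port);
	if (err)
		return err;
	/* the teardown path calls netdev_rx_handler_unregister(lower) */
	return 0;
}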
3339
3340/**
3341 * netdev_rx_handler_unregister - unregister receive handler
3342 * @dev: device to unregister a handler from
3343 *
Kusanagi Kouichi166ec362013-03-18 02:59:52 +00003344 * Unregister a receive handler from a device.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003345 *
3346 * The caller must hold the rtnl_mutex.
3347 */
3348void netdev_rx_handler_unregister(struct net_device *dev)
3349{
3350
3351 ASSERT_RTNL();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003352 RCU_INIT_POINTER(dev->rx_handler, NULL);
Eric Dumazet00cfec32013-03-29 03:01:22 +00003353	/* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3354	 * section is guaranteed to see a non-NULL rx_handler_data
3355	 * as well.
3356 */
3357 synchronize_net();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003358 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003359}
3360EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3361
Mel Gormanb4b9e352012-07-31 16:44:26 -07003362/*
3363 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3364 * the special handling of PFMEMALLOC skbs.
3365 */
3366static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3367{
3368 switch (skb->protocol) {
3369 case __constant_htons(ETH_P_ARP):
3370 case __constant_htons(ETH_P_IP):
3371 case __constant_htons(ETH_P_IPV6):
3372 case __constant_htons(ETH_P_8021Q):
Patrick McHardy8ad227f2013-04-19 02:04:31 +00003373 case __constant_htons(ETH_P_8021AD):
Mel Gormanb4b9e352012-07-31 16:44:26 -07003374 return true;
3375 default:
3376 return false;
3377 }
3378}
3379
David S. Miller9754e292013-02-14 15:57:38 -05003380static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003381{
3382 struct packet_type *ptype, *pt_prev;
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003383 rx_handler_func_t *rx_handler;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003384 struct net_device *orig_dev;
David S. Miller63d8ea72011-02-28 10:48:59 -08003385 struct net_device *null_or_dev;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003386 bool deliver_exact = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003387 int ret = NET_RX_DROP;
Al Viro252e3342006-11-14 20:48:11 -08003388 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003389
Eric Dumazet588f0332011-11-15 04:12:55 +00003390 net_timestamp_check(!netdev_tstamp_prequeue, skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07003391
Koki Sanagicf66ba52010-08-23 18:45:02 +09003392 trace_netif_receive_skb(skb);
Patrick McHardy9b22ea52008-11-04 14:49:57 -08003393
Linus Torvalds1da177e2005-04-16 15:20:36 -07003394 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003395 if (netpoll_receive_skb(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003396 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003397
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07003398 orig_dev = skb->dev;
Jiri Pirko1765a572011-02-12 06:48:36 +00003399
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003400 skb_reset_network_header(skb);
Eric Dumazetfda55ec2013-01-07 09:28:21 +00003401 if (!skb_transport_header_was_set(skb))
3402 skb_reset_transport_header(skb);
Jiri Pirko0b5c9db2011-06-10 06:56:58 +00003403 skb_reset_mac_len(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003404
3405 pt_prev = NULL;
3406
3407 rcu_read_lock();
3408
David S. Miller63d8ea72011-02-28 10:48:59 -08003409another_round:
David S. Millerb6858172012-07-23 16:27:54 -07003410 skb->skb_iif = skb->dev->ifindex;
David S. Miller63d8ea72011-02-28 10:48:59 -08003411
3412 __this_cpu_inc(softnet_data.processed);
3413
Patrick McHardy8ad227f2013-04-19 02:04:31 +00003414 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3415 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003416 skb = vlan_untag(skb);
3417 if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003418 goto unlock;
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003419 }
3420
Linus Torvalds1da177e2005-04-16 15:20:36 -07003421#ifdef CONFIG_NET_CLS_ACT
3422 if (skb->tc_verd & TC_NCLS) {
3423 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3424 goto ncls;
3425 }
3426#endif
3427
David S. Miller9754e292013-02-14 15:57:38 -05003428 if (pfmemalloc)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003429 goto skip_taps;
3430
Linus Torvalds1da177e2005-04-16 15:20:36 -07003431 list_for_each_entry_rcu(ptype, &ptype_all, list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003432 if (!ptype->dev || ptype->dev == skb->dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003433 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003434 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003435 pt_prev = ptype;
3436 }
3437 }
3438
Mel Gormanb4b9e352012-07-31 16:44:26 -07003439skip_taps:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003440#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07003441 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3442 if (!skb)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003443 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003444ncls:
3445#endif
3446
David S. Miller9754e292013-02-14 15:57:38 -05003447 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003448 goto drop;
3449
John Fastabend24257172011-10-10 09:16:41 +00003450 if (vlan_tx_tag_present(skb)) {
3451 if (pt_prev) {
3452 ret = deliver_skb(skb, pt_prev, orig_dev);
3453 pt_prev = NULL;
3454 }
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003455 if (vlan_do_receive(&skb))
John Fastabend24257172011-10-10 09:16:41 +00003456 goto another_round;
3457 else if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003458 goto unlock;
John Fastabend24257172011-10-10 09:16:41 +00003459 }
3460
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003461 rx_handler = rcu_dereference(skb->dev->rx_handler);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003462 if (rx_handler) {
3463 if (pt_prev) {
3464 ret = deliver_skb(skb, pt_prev, orig_dev);
3465 pt_prev = NULL;
3466 }
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003467 switch (rx_handler(&skb)) {
3468 case RX_HANDLER_CONSUMED:
Cristian Bercaru3bc1b1a2013-03-08 07:03:38 +00003469 ret = NET_RX_SUCCESS;
Mel Gormanb4b9e352012-07-31 16:44:26 -07003470 goto unlock;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003471 case RX_HANDLER_ANOTHER:
David S. Miller63d8ea72011-02-28 10:48:59 -08003472 goto another_round;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003473 case RX_HANDLER_EXACT:
3474 deliver_exact = true;
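			/* fall through */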
3475 case RX_HANDLER_PASS:
3476 break;
3477 default:
3478 BUG();
3479 }
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003480 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003481
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003482 if (vlan_tx_nonzero_tag_present(skb))
3483 skb->pkt_type = PACKET_OTHERHOST;
3484
David S. Miller63d8ea72011-02-28 10:48:59 -08003485 /* deliver only exact match when indicated */
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003486 null_or_dev = deliver_exact ? skb->dev : NULL;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003487
Linus Torvalds1da177e2005-04-16 15:20:36 -07003488 type = skb->protocol;
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003489 list_for_each_entry_rcu(ptype,
3490 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003491 if (ptype->type == type &&
Jiri Pirkoe3f48d32011-02-28 20:26:31 +00003492 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3493 ptype->dev == orig_dev)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003494 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003495 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003496 pt_prev = ptype;
3497 }
3498 }
3499
3500 if (pt_prev) {
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003501 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
Michael S. Tsirkin0e698bf2012-09-15 22:44:16 +00003502 goto drop;
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003503 else
3504 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003505 } else {
Mel Gormanb4b9e352012-07-31 16:44:26 -07003506drop:
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003507 atomic_long_inc(&skb->dev->rx_dropped);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003508 kfree_skb(skb);
3509		/* Jamal, now you will not be able to escape explaining
3510		 * to me how you were going to use this. :-)
3511 */
3512 ret = NET_RX_DROP;
3513 }
3514
Mel Gormanb4b9e352012-07-31 16:44:26 -07003515unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003516 rcu_read_unlock();
Mel Gormanb4b9e352012-07-31 16:44:26 -07003517out:
David S. Miller9754e292013-02-14 15:57:38 -05003518 return ret;
3519}
3520
3521static int __netif_receive_skb(struct sk_buff *skb)
3522{
3523 int ret;
3524
3525 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3526 unsigned long pflags = current->flags;
3527
3528 /*
3529 * PFMEMALLOC skbs are special, they should
3530 * - be delivered to SOCK_MEMALLOC sockets only
3531 * - stay away from userspace
3532 * - have bounded memory usage
3533 *
3534 * Use PF_MEMALLOC as this saves us from propagating the allocation
3535 * context down to all allocation sites.
3536 */
3537 current->flags |= PF_MEMALLOC;
3538 ret = __netif_receive_skb_core(skb, true);
3539 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3540 } else
3541 ret = __netif_receive_skb_core(skb, false);
3542
Linus Torvalds1da177e2005-04-16 15:20:36 -07003543 return ret;
3544}
Tom Herbert0a9627f2010-03-16 08:03:29 +00003545
3546/**
3547 * netif_receive_skb - process receive buffer from network
3548 * @skb: buffer to process
3549 *
3550 * netif_receive_skb() is the main receive data processing function.
3551 * It always succeeds. The buffer may be dropped during processing
3552 * for congestion control or by the protocol layers.
3553 *
3554 * This function may only be called from softirq context and interrupts
3555 * should be enabled.
3556 *
3557 * Return values (usually ignored):
3558 * NET_RX_SUCCESS: no congestion
3559 * NET_RX_DROP: packet was dropped
3560 */
3561int netif_receive_skb(struct sk_buff *skb)
3562{
Eric Dumazet588f0332011-11-15 04:12:55 +00003563 net_timestamp_check(netdev_tstamp_prequeue, skb);
Eric Dumazet3b098e22010-05-15 23:57:10 -07003564
Richard Cochranc1f19b52010-07-17 08:49:36 +00003565 if (skb_defer_rx_timestamp(skb))
3566 return NET_RX_SUCCESS;
3567
Eric Dumazetdf334542010-03-24 19:13:54 +00003568#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003569 if (static_key_false(&rps_needed)) {
Eric Dumazet3b098e22010-05-15 23:57:10 -07003570 struct rps_dev_flow voidflow, *rflow = &voidflow;
3571 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003572
Eric Dumazet3b098e22010-05-15 23:57:10 -07003573 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00003574
Eric Dumazet3b098e22010-05-15 23:57:10 -07003575 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07003576
Eric Dumazet3b098e22010-05-15 23:57:10 -07003577 if (cpu >= 0) {
3578 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3579 rcu_read_unlock();
Eric Dumazetadc93002011-11-17 03:13:26 +00003580 return ret;
Eric Dumazet3b098e22010-05-15 23:57:10 -07003581 }
Eric Dumazetadc93002011-11-17 03:13:26 +00003582 rcu_read_unlock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003583 }
Tom Herbert1e94d722010-03-18 17:45:44 -07003584#endif
Eric Dumazetadc93002011-11-17 03:13:26 +00003585 return __netif_receive_skb(skb);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003586}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003587EXPORT_SYMBOL(netif_receive_skb);
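
/* Illustrative sketch, not part of the kernel source: the softirq
 * context required above, i.e. a NAPI poll routine feeding frames in.
 * "struct foo_priv", foo_fetch_rx_skb() (assumed to return the next
 * completed frame with skb->protocol already set, or NULL) and
 * foo_enable_rx_irq() are invented driver-side helpers.
 */
static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
	int work_done = 0;

	while (work_done < budget) {
		struct sk_buff *skb = foo_fetch_rx_skb(priv);

		if (!skb)
			break;
		netif_receive_skb(skb);
		work_done++;
	}

	if (work_done < budget) {
		napi_complete(napi);
		foo_enable_rx_irq(priv);
	}
	return work_done;
}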
Linus Torvalds1da177e2005-04-16 15:20:36 -07003588
Eric Dumazet88751272010-04-19 05:07:33 +00003589/* The network device is going away; flush any packets still pending.
3590 * Called with irqs disabled.
3591 */
Changli Gao152102c2010-03-30 20:16:22 +00003592static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003593{
Changli Gao152102c2010-03-30 20:16:22 +00003594 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003595 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003596 struct sk_buff *skb, *tmp;
3597
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003598 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003599 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003600 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003601 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003602 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003603 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003604 }
Changli Gao6e7676c2010-04-27 15:07:33 -07003605 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003606 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003607
3608 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3609 if (skb->dev == dev) {
3610 __skb_unlink(skb, &sd->process_queue);
3611 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003612 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003613 }
3614 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003615}
3616
Herbert Xud565b0a2008-12-15 23:38:52 -08003617static int napi_gro_complete(struct sk_buff *skb)
3618{
Vlad Yasevich22061d82012-11-15 08:49:11 +00003619 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003620 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003621 struct list_head *head = &offload_base;
Herbert Xud565b0a2008-12-15 23:38:52 -08003622 int err = -ENOENT;
3623
Eric Dumazetc3c7c252012-12-06 13:54:59 +00003624 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3625
Herbert Xufc59f9a2009-04-14 15:11:06 -07003626 if (NAPI_GRO_CB(skb)->count == 1) {
3627 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003628 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07003629 }
Herbert Xud565b0a2008-12-15 23:38:52 -08003630
3631 rcu_read_lock();
3632 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003633 if (ptype->type != type || !ptype->callbacks.gro_complete)
Herbert Xud565b0a2008-12-15 23:38:52 -08003634 continue;
3635
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003636 err = ptype->callbacks.gro_complete(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003637 break;
3638 }
3639 rcu_read_unlock();
3640
3641 if (err) {
3642 WARN_ON(&ptype->list == head);
3643 kfree_skb(skb);
3644 return NET_RX_SUCCESS;
3645 }
3646
3647out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003648 return netif_receive_skb(skb);
3649}
3650
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003651/* napi->gro_list contains packets ordered by age, with the
3652 * youngest packets at the head of the list.
3653 * Complete skbs in reverse order to reduce latencies.
3654 */
3655void napi_gro_flush(struct napi_struct *napi, bool flush_old)
Herbert Xud565b0a2008-12-15 23:38:52 -08003656{
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003657 struct sk_buff *skb, *prev = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003658
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003659 /* scan list and build reverse chain */
3660 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3661 skb->prev = prev;
3662 prev = skb;
Herbert Xud565b0a2008-12-15 23:38:52 -08003663 }
3664
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003665 for (skb = prev; skb; skb = prev) {
3666 skb->next = NULL;
3667
3668 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3669 return;
3670
3671 prev = skb->prev;
3672 napi_gro_complete(skb);
3673 napi->gro_count--;
3674 }
3675
Herbert Xud565b0a2008-12-15 23:38:52 -08003676 napi->gro_list = NULL;
3677}
Eric Dumazet86cac582010-08-31 18:25:32 +00003678EXPORT_SYMBOL(napi_gro_flush);
Herbert Xud565b0a2008-12-15 23:38:52 -08003679
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003680static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3681{
3682 struct sk_buff *p;
3683 unsigned int maclen = skb->dev->hard_header_len;
3684
3685 for (p = napi->gro_list; p; p = p->next) {
3686 unsigned long diffs;
3687
3688 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3689 diffs |= p->vlan_tci ^ skb->vlan_tci;
3690 if (maclen == ETH_HLEN)
3691 diffs |= compare_ether_header(skb_mac_header(p),
3692 skb_gro_mac_header(skb));
3693 else if (!diffs)
3694 diffs = memcmp(skb_mac_header(p),
3695 skb_gro_mac_header(skb),
3696 maclen);
3697 NAPI_GRO_CB(p)->same_flow = !diffs;
3698 NAPI_GRO_CB(p)->flush = 0;
3699 }
3700}
3701
Rami Rosenbb728822012-11-28 21:55:25 +00003702static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003703{
3704 struct sk_buff **pp = NULL;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003705 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003706 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003707 struct list_head *head = &offload_base;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003708 int same_flow;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003709 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003710
Jarek Poplawskice9e76c2010-08-05 01:19:11 +00003711 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
Herbert Xud565b0a2008-12-15 23:38:52 -08003712 goto normal;
3713
David S. Miller21dc3302010-08-23 00:13:46 -07003714 if (skb_is_gso(skb) || skb_has_frag_list(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003715 goto normal;
3716
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003717 gro_list_prepare(napi, skb);
3718
Herbert Xud565b0a2008-12-15 23:38:52 -08003719 rcu_read_lock();
3720 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003721 if (ptype->type != type || !ptype->callbacks.gro_receive)
Herbert Xud565b0a2008-12-15 23:38:52 -08003722 continue;
3723
Herbert Xu86911732009-01-29 14:19:50 +00003724 skb_set_network_header(skb, skb_gro_offset(skb));
Eric Dumazetefd94502013-02-14 17:31:48 +00003725 skb_reset_mac_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003726 NAPI_GRO_CB(skb)->same_flow = 0;
3727 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003728 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003729
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003730 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003731 break;
3732 }
3733 rcu_read_unlock();
3734
3735 if (&ptype->list == head)
3736 goto normal;
3737
Herbert Xu0da2afd52008-12-26 14:57:42 -08003738 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003739 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003740
Herbert Xud565b0a2008-12-15 23:38:52 -08003741 if (pp) {
3742 struct sk_buff *nskb = *pp;
3743
3744 *pp = nskb->next;
3745 nskb->next = NULL;
3746 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003747 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003748 }
3749
Herbert Xu0da2afd52008-12-26 14:57:42 -08003750 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003751 goto ok;
3752
Herbert Xu4ae55442009-02-08 18:00:36 +00003753 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003754 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003755
Herbert Xu4ae55442009-02-08 18:00:36 +00003756 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003757 NAPI_GRO_CB(skb)->count = 1;
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003758 NAPI_GRO_CB(skb)->age = jiffies;
Herbert Xu86911732009-01-29 14:19:50 +00003759 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003760 skb->next = napi->gro_list;
3761 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003762 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003763
Herbert Xuad0f9902009-02-01 01:24:55 -08003764pull:
Herbert Xucb189782009-05-26 18:50:31 +00003765 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3766 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3767
3768 BUG_ON(skb->end - skb->tail < grow);
3769
3770 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3771
3772 skb->tail += grow;
3773 skb->data_len -= grow;
3774
3775 skb_shinfo(skb)->frags[0].page_offset += grow;
Eric Dumazet9e903e02011-10-18 21:00:24 +00003776 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
Herbert Xucb189782009-05-26 18:50:31 +00003777
Eric Dumazet9e903e02011-10-18 21:00:24 +00003778 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
Ian Campbellea2ab692011-08-22 23:44:58 +00003779 skb_frag_unref(skb, 0);
Herbert Xucb189782009-05-26 18:50:31 +00003780 memmove(skb_shinfo(skb)->frags,
3781 skb_shinfo(skb)->frags + 1,
Jarek Poplawskie5093ae2010-08-11 02:02:10 +00003782 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
Herbert Xucb189782009-05-26 18:50:31 +00003783 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003784 }
3785
Herbert Xud565b0a2008-12-15 23:38:52 -08003786ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003787 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003788
3789normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003790 ret = GRO_NORMAL;
3791 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003792}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003793
Rami Rosenbb728822012-11-28 21:55:25 +00003795static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003796{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003797 switch (ret) {
3798 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003799 if (netif_receive_skb(skb))
3800 ret = GRO_DROP;
3801 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003802
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003803 case GRO_DROP:
Herbert Xu5d38a072009-01-04 16:13:40 -08003804 kfree_skb(skb);
3805 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003806
Eric Dumazetdaa86542012-04-19 07:07:40 +00003807 case GRO_MERGED_FREE:
Eric Dumazetd7e88832012-04-30 08:10:34 +00003808 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3809 kmem_cache_free(skbuff_head_cache, skb);
3810 else
3811 __kfree_skb(skb);
Eric Dumazetdaa86542012-04-19 07:07:40 +00003812 break;
3813
Ben Hutchings5b252f02009-10-29 07:17:09 +00003814 case GRO_HELD:
3815 case GRO_MERGED:
3816 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003817 }
3818
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003819 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003820}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003821
Eric Dumazetca07e432012-10-06 22:28:06 +00003822static void skb_gro_reset_offset(struct sk_buff *skb)
Herbert Xu78a478d2009-05-26 18:50:21 +00003823{
Eric Dumazetca07e432012-10-06 22:28:06 +00003824 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3825 const skb_frag_t *frag0 = &pinfo->frags[0];
3826
Herbert Xu78a478d2009-05-26 18:50:21 +00003827 NAPI_GRO_CB(skb)->data_offset = 0;
3828 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003829 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003830
Herbert Xu78d3fd02009-05-26 18:50:23 +00003831 if (skb->mac_header == skb->tail &&
Eric Dumazetca07e432012-10-06 22:28:06 +00003832 pinfo->nr_frags &&
3833 !PageHighMem(skb_frag_page(frag0))) {
3834 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3835 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
Herbert Xu74895942009-05-26 18:50:27 +00003836 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003837}
Herbert Xu78a478d2009-05-26 18:50:21 +00003838
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003839gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003840{
Herbert Xu86911732009-01-29 14:19:50 +00003841 skb_gro_reset_offset(skb);
3842
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003843 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003844}
3845EXPORT_SYMBOL(napi_gro_receive);
3846
stephen hemmingerd0c2b0d2010-10-19 07:12:10 +00003847static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003848{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003849 __skb_pull(skb, skb_headlen(skb));
Eric Dumazet2a2a4592012-03-21 06:58:03 +00003850 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3851 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
Jesse Gross3701e512010-10-20 13:56:06 +00003852 skb->vlan_tci = 0;
Herbert Xu66c46d72011-01-29 20:44:54 -08003853 skb->dev = napi->dev;
Andy Gospodarek6d152e22011-02-02 14:53:25 -08003854 skb->skb_iif = 0;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003855
3856 napi->skb = skb;
3857}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003858
Herbert Xu76620aa2009-04-16 02:02:07 -07003859struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003860{
Herbert Xu5d38a072009-01-04 16:13:40 -08003861 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003862
3863 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003864 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3865 if (skb)
3866 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003867 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003868 return skb;
3869}
Herbert Xu76620aa2009-04-16 02:02:07 -07003870EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003871
Rami Rosenbb728822012-11-28 21:55:25 +00003872static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003873 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003874{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003875 switch (ret) {
3876 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003877 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003878 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003879
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003880 if (ret == GRO_HELD)
3881 skb_gro_pull(skb, -ETH_HLEN);
3882 else if (netif_receive_skb(skb))
3883 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003884 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003885
3886 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003887 case GRO_MERGED_FREE:
3888 napi_reuse_skb(napi, skb);
3889 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003890
3891 case GRO_MERGED:
3892 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003893 }
3894
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003895 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003896}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003897
Eric Dumazet4adb9c42012-05-18 20:49:06 +00003898static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003899{
Herbert Xu76620aa2009-04-16 02:02:07 -07003900 struct sk_buff *skb = napi->skb;
3901 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00003902 unsigned int hlen;
3903 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07003904
3905 napi->skb = NULL;
3906
3907 skb_reset_mac_header(skb);
3908 skb_gro_reset_offset(skb);
3909
Herbert Xua5b1cf22009-05-26 18:50:28 +00003910 off = skb_gro_offset(skb);
3911 hlen = off + sizeof(*eth);
3912 eth = skb_gro_header_fast(skb, off);
3913 if (skb_gro_header_hard(skb, hlen)) {
3914 eth = skb_gro_header_slow(skb, hlen, off);
3915 if (unlikely(!eth)) {
3916 napi_reuse_skb(napi, skb);
3917 skb = NULL;
3918 goto out;
3919 }
Herbert Xu76620aa2009-04-16 02:02:07 -07003920 }
3921
3922 skb_gro_pull(skb, sizeof(*eth));
3923
3924 /*
3925 * This works because the only protocols we care about don't require
3926 * special handling. We'll fix it up properly at the end.
3927 */
3928 skb->protocol = eth->h_proto;
3929
3930out:
3931 return skb;
3932}
Herbert Xu76620aa2009-04-16 02:02:07 -07003933
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003934gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07003935{
3936 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003937
3938 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003939 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003940
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003941 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08003942}
3943EXPORT_SYMBOL(napi_gro_frags);
3944
Eric Dumazete326bed2010-04-22 00:22:45 -07003945/*
 3946 * net_rps_action sends any pending IPIs for RPS.
3947 * Note: called with local irq disabled, but exits with local irq enabled.
3948 */
3949static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3950{
3951#ifdef CONFIG_RPS
3952 struct softnet_data *remsd = sd->rps_ipi_list;
3953
3954 if (remsd) {
3955 sd->rps_ipi_list = NULL;
3956
3957 local_irq_enable();
3958
 3959 /* Send pending IPIs to kick RPS processing on remote CPUs. */
3960 while (remsd) {
3961 struct softnet_data *next = remsd->rps_ipi_next;
3962
3963 if (cpu_online(remsd->cpu))
3964 __smp_call_function_single(remsd->cpu,
3965 &remsd->csd, 0);
3966 remsd = next;
3967 }
3968 } else
3969#endif
3970 local_irq_enable();
3971}
3972
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003973static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003974{
3975 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003976 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003977
Eric Dumazete326bed2010-04-22 00:22:45 -07003978#ifdef CONFIG_RPS
 3979 /* Check if we have pending IPIs; it is better to send them now,
 3980 * rather than waiting for net_rx_action() to end.
3981 */
3982 if (sd->rps_ipi_list) {
3983 local_irq_disable();
3984 net_rps_action_and_irq_enable(sd);
3985 }
3986#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003987 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07003988 local_irq_disable();
3989 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003990 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07003991 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003992
Changli Gao6e7676c2010-04-27 15:07:33 -07003993 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07003994 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07003995 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07003996 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00003997 input_queue_head_incr(sd);
3998 if (++work >= quota) {
3999 local_irq_enable();
4000 return work;
4001 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004002 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004003
Changli Gao6e7676c2010-04-27 15:07:33 -07004004 rps_lock(sd);
4005 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004006 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07004007 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4008 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004009
Changli Gao6e7676c2010-04-27 15:07:33 -07004010 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004011 /*
4012 * Inline a custom version of __napi_complete().
 4013 * Only the current cpu owns and manipulates this napi,
 4014 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
 4015 * We can use a plain write instead of clear_bit(),
 4016 * and we don't need an smp_mb() memory barrier.
4017 */
4018 list_del(&napi->poll_list);
4019 napi->state = 0;
4020
Changli Gao6e7676c2010-04-27 15:07:33 -07004021 quota = work + qlen;
4022 }
4023 rps_unlock(sd);
4024 }
4025 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004026
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004027 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004028}
4029
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004030/**
4031 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004032 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004033 *
4034 * The entry's receive function will be scheduled to run
4035 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08004036void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004037{
4038 unsigned long flags;
4039
4040 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004041 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004042 local_irq_restore(flags);
4043}
4044EXPORT_SYMBOL(__napi_schedule);
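
/*
 * Illustrative sketch (not part of this file): the usual way a driver
 * reaches __napi_schedule() is via napi_schedule_prep() from its RX
 * interrupt handler.  The example_priv structure and the
 * example_disable_rx_irq() helper below are hypothetical.
 *
 *	static irqreturn_t example_rx_irq(int irq, void *data)
 *	{
 *		struct example_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			example_disable_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 *
 * Deferring the heavy lifting to the poll routine keeps the hard irq
 * handler short and lets NAPI batch packet processing.
 */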
4045
Herbert Xud565b0a2008-12-15 23:38:52 -08004046void __napi_complete(struct napi_struct *n)
4047{
4048 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4049 BUG_ON(n->gro_list);
4050
4051 list_del(&n->poll_list);
4052 smp_mb__before_clear_bit();
4053 clear_bit(NAPI_STATE_SCHED, &n->state);
4054}
4055EXPORT_SYMBOL(__napi_complete);
4056
4057void napi_complete(struct napi_struct *n)
4058{
4059 unsigned long flags;
4060
4061 /*
4062 * don't let napi dequeue from the cpu poll list
 4063 * just in case it's running on a different cpu
4064 */
4065 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4066 return;
4067
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004068 napi_gro_flush(n, false);
Herbert Xud565b0a2008-12-15 23:38:52 -08004069 local_irq_save(flags);
4070 __napi_complete(n);
4071 local_irq_restore(flags);
4072}
4073EXPORT_SYMBOL(napi_complete);
4074
4075void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4076 int (*poll)(struct napi_struct *, int), int weight)
4077{
4078 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00004079 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004080 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08004081 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08004082 napi->poll = poll;
Eric Dumazet82dc3c62013-03-05 15:57:22 +00004083 if (weight > NAPI_POLL_WEIGHT)
4084 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4085 weight, dev->name);
Herbert Xud565b0a2008-12-15 23:38:52 -08004086 napi->weight = weight;
4087 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08004088 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08004089#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08004090 spin_lock_init(&napi->poll_lock);
4091 napi->poll_owner = -1;
4092#endif
4093 set_bit(NAPI_STATE_SCHED, &napi->state);
4094}
4095EXPORT_SYMBOL(netif_napi_add);
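
/*
 * Illustrative sketch (not part of this file): a driver typically calls
 * netif_napi_add() once at probe time and pairs it with napi_enable() in
 * its ndo_open handler.  example_priv, example_poll() and netdev are
 * hypothetical names.
 *
 *	netif_napi_add(netdev, &priv->napi, example_poll, NAPI_POLL_WEIGHT);
 *	...
 *	static int example_open(struct net_device *netdev)
 *	{
 *		struct example_priv *priv = netdev_priv(netdev);
 *
 *		napi_enable(&priv->napi);
 *		return 0;
 *	}
 *
 * Weights above NAPI_POLL_WEIGHT trigger the pr_err_once() warning above,
 * so sticking to the default keeps per-poll latency bounded.
 */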
4096
4097void netif_napi_del(struct napi_struct *napi)
4098{
4099 struct sk_buff *skb, *next;
4100
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08004101 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07004102 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08004103
4104 for (skb = napi->gro_list; skb; skb = next) {
4105 next = skb->next;
4106 skb->next = NULL;
4107 kfree_skb(skb);
4108 }
4109
4110 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00004111 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004112}
4113EXPORT_SYMBOL(netif_napi_del);
4114
Linus Torvalds1da177e2005-04-16 15:20:36 -07004115static void net_rx_action(struct softirq_action *h)
4116{
Eric Dumazete326bed2010-04-22 00:22:45 -07004117 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004118 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07004119 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07004120 void *have;
4121
Linus Torvalds1da177e2005-04-16 15:20:36 -07004122 local_irq_disable();
4123
Eric Dumazete326bed2010-04-22 00:22:45 -07004124 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004125 struct napi_struct *n;
4126 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004127
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004128 /* If the softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004129 * Allow this to run for 2 jiffies, which allows
4130 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004131 */
Eric Dumazetd1f41b62013-03-05 07:15:13 +00004132 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004133 goto softnet_break;
4134
4135 local_irq_enable();
4136
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004137 /* Even though interrupts have been re-enabled, this
4138 * access is safe because interrupts can only add new
4139 * entries to the tail of this list, and only ->poll()
4140 * calls can remove this head entry from the list.
4141 */
Eric Dumazete326bed2010-04-22 00:22:45 -07004142 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004143
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004144 have = netpoll_poll_lock(n);
4145
4146 weight = n->weight;
4147
David S. Miller0a7606c2007-10-29 21:28:47 -07004148 /* This NAPI_STATE_SCHED test is for avoiding a race
4149 * with netpoll's poll_napi(). Only the entity which
4150 * obtains the lock and sees NAPI_STATE_SCHED set will
4151 * actually make the ->poll() call. Therefore we avoid
Lucas De Marchi25985ed2011-03-30 22:57:33 -03004152 * accidentally calling ->poll() when NAPI is not scheduled.
David S. Miller0a7606c2007-10-29 21:28:47 -07004153 */
4154 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00004155 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07004156 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00004157 trace_napi_poll(n);
4158 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004159
4160 WARN_ON_ONCE(work > weight);
4161
4162 budget -= work;
4163
4164 local_irq_disable();
4165
4166 /* Drivers must not modify the NAPI state if they
4167 * consume the entire weight. In such cases this code
4168 * still "owns" the NAPI instance and therefore can
4169 * move the instance around on the list at-will.
4170 */
David S. Millerfed17f32008-01-07 21:00:40 -08004171 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07004172 if (unlikely(napi_disable_pending(n))) {
4173 local_irq_enable();
4174 napi_complete(n);
4175 local_irq_disable();
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004176 } else {
4177 if (n->gro_list) {
4178 /* flush too old packets
4179 * If HZ < 1000, flush all packets.
4180 */
4181 local_irq_enable();
4182 napi_gro_flush(n, HZ >= 1000);
4183 local_irq_disable();
4184 }
Eric Dumazete326bed2010-04-22 00:22:45 -07004185 list_move_tail(&n->poll_list, &sd->poll_list);
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004186 }
David S. Millerfed17f32008-01-07 21:00:40 -08004187 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004188
4189 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004190 }
4191out:
Eric Dumazete326bed2010-04-22 00:22:45 -07004192 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00004193
Chris Leechdb217332006-06-17 21:24:58 -07004194#ifdef CONFIG_NET_DMA
4195 /*
4196 * There may not be any more sk_buffs coming right now, so push
4197 * any pending DMA copies to hardware
4198 */
Dan Williams2ba05622009-01-06 11:38:14 -07004199 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07004200#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004201
Linus Torvalds1da177e2005-04-16 15:20:36 -07004202 return;
4203
4204softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00004205 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004206 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4207 goto out;
4208}
4209
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004210struct netdev_upper {
4211 struct net_device *dev;
4212 bool master;
4213 struct list_head list;
4214 struct rcu_head rcu;
4215 struct list_head search_list;
4216};
4217
4218static void __append_search_uppers(struct list_head *search_list,
4219 struct net_device *dev)
4220{
4221 struct netdev_upper *upper;
4222
4223 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4224 /* check if this upper is not already in search list */
4225 if (list_empty(&upper->search_list))
4226 list_add_tail(&upper->search_list, search_list);
4227 }
4228}
4229
4230static bool __netdev_search_upper_dev(struct net_device *dev,
4231 struct net_device *upper_dev)
4232{
4233 LIST_HEAD(search_list);
4234 struct netdev_upper *upper;
4235 struct netdev_upper *tmp;
4236 bool ret = false;
4237
4238 __append_search_uppers(&search_list, dev);
4239 list_for_each_entry(upper, &search_list, search_list) {
4240 if (upper->dev == upper_dev) {
4241 ret = true;
4242 break;
4243 }
4244 __append_search_uppers(&search_list, upper->dev);
4245 }
4246 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4247 INIT_LIST_HEAD(&upper->search_list);
4248 return ret;
4249}
4250
4251static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4252 struct net_device *upper_dev)
4253{
4254 struct netdev_upper *upper;
4255
4256 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4257 if (upper->dev == upper_dev)
4258 return upper;
4259 }
4260 return NULL;
4261}
4262
4263/**
4264 * netdev_has_upper_dev - Check if device is linked to an upper device
4265 * @dev: device
4266 * @upper_dev: upper device to check
4267 *
 4268 * Find out if a device is linked to the specified upper device and return true
 4269 * if it is. Note that this checks only the immediate upper device,
4270 * not through a complete stack of devices. The caller must hold the RTNL lock.
4271 */
4272bool netdev_has_upper_dev(struct net_device *dev,
4273 struct net_device *upper_dev)
4274{
4275 ASSERT_RTNL();
4276
4277 return __netdev_find_upper(dev, upper_dev);
4278}
4279EXPORT_SYMBOL(netdev_has_upper_dev);
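
/*
 * Illustrative sketch (not part of this file): a stacking driver might use
 * netdev_has_upper_dev() to refuse re-linking a device that is already
 * attached below the prospective master.  slave_dev and master_dev are
 * hypothetical, and the caller is assumed to hold RTNL.
 *
 *	ASSERT_RTNL();
 *	if (netdev_has_upper_dev(slave_dev, master_dev))
 *		return -EBUSY;
 *
 * Only the immediate upper link is checked here; the full chain is walked
 * by the link helpers below when a new link is created.
 */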
4280
4281/**
4282 * netdev_has_any_upper_dev - Check if device is linked to some device
4283 * @dev: device
4284 *
4285 * Find out if a device is linked to an upper device and return true in case
4286 * it is. The caller must hold the RTNL lock.
4287 */
4288bool netdev_has_any_upper_dev(struct net_device *dev)
4289{
4290 ASSERT_RTNL();
4291
4292 return !list_empty(&dev->upper_dev_list);
4293}
4294EXPORT_SYMBOL(netdev_has_any_upper_dev);
4295
4296/**
4297 * netdev_master_upper_dev_get - Get master upper device
4298 * @dev: device
4299 *
4300 * Find a master upper device and return pointer to it or NULL in case
4301 * it's not there. The caller must hold the RTNL lock.
4302 */
4303struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4304{
4305 struct netdev_upper *upper;
4306
4307 ASSERT_RTNL();
4308
4309 if (list_empty(&dev->upper_dev_list))
4310 return NULL;
4311
4312 upper = list_first_entry(&dev->upper_dev_list,
4313 struct netdev_upper, list);
4314 if (likely(upper->master))
4315 return upper->dev;
4316 return NULL;
4317}
4318EXPORT_SYMBOL(netdev_master_upper_dev_get);
4319
4320/**
4321 * netdev_master_upper_dev_get_rcu - Get master upper device
4322 * @dev: device
4323 *
4324 * Find a master upper device and return pointer to it or NULL in case
4325 * it's not there. The caller must hold the RCU read lock.
4326 */
4327struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4328{
4329 struct netdev_upper *upper;
4330
4331 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4332 struct netdev_upper, list);
4333 if (upper && likely(upper->master))
4334 return upper->dev;
4335 return NULL;
4336}
4337EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4338
4339static int __netdev_upper_dev_link(struct net_device *dev,
4340 struct net_device *upper_dev, bool master)
4341{
4342 struct netdev_upper *upper;
4343
4344 ASSERT_RTNL();
4345
4346 if (dev == upper_dev)
4347 return -EBUSY;
4348
 4349 /* To prevent loops, check that dev is not already an upper device of upper_dev. */
4350 if (__netdev_search_upper_dev(upper_dev, dev))
4351 return -EBUSY;
4352
4353 if (__netdev_find_upper(dev, upper_dev))
4354 return -EEXIST;
4355
4356 if (master && netdev_master_upper_dev_get(dev))
4357 return -EBUSY;
4358
4359 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4360 if (!upper)
4361 return -ENOMEM;
4362
4363 upper->dev = upper_dev;
4364 upper->master = master;
4365 INIT_LIST_HEAD(&upper->search_list);
4366
4367 /* Ensure that master upper link is always the first item in list. */
4368 if (master)
4369 list_add_rcu(&upper->list, &dev->upper_dev_list);
4370 else
4371 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4372 dev_hold(upper_dev);
4373
4374 return 0;
4375}
4376
4377/**
4378 * netdev_upper_dev_link - Add a link to the upper device
4379 * @dev: device
4380 * @upper_dev: new upper device
4381 *
 4382 * Adds a link to a device which is upper to this one. The caller must hold
4383 * the RTNL lock. On a failure a negative errno code is returned.
4384 * On success the reference counts are adjusted and the function
4385 * returns zero.
4386 */
4387int netdev_upper_dev_link(struct net_device *dev,
4388 struct net_device *upper_dev)
4389{
4390 return __netdev_upper_dev_link(dev, upper_dev, false);
4391}
4392EXPORT_SYMBOL(netdev_upper_dev_link);
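
/*
 * Illustrative sketch (not part of this file): a tunnel-like driver that
 * stacks on a lower device could record the relationship so the loop and
 * duplicate checks above apply.  lower_dev and tunnel_dev are hypothetical.
 *
 *	ASSERT_RTNL();
 *	err = netdev_upper_dev_link(lower_dev, tunnel_dev);
 *	if (err)
 *		goto unwind;
 *
 * and mirror it with netdev_upper_dev_unlink(lower_dev, tunnel_dev) on
 * teardown so the reference taken on the upper device is dropped.
 */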
4393
4394/**
4395 * netdev_master_upper_dev_link - Add a master link to the upper device
4396 * @dev: device
4397 * @upper_dev: new upper device
4398 *
 4399 * Adds a link to a device which is upper to this one. In this case, only
4400 * one master upper device can be linked, although other non-master devices
4401 * might be linked as well. The caller must hold the RTNL lock.
4402 * On a failure a negative errno code is returned. On success the reference
4403 * counts are adjusted and the function returns zero.
4404 */
4405int netdev_master_upper_dev_link(struct net_device *dev,
4406 struct net_device *upper_dev)
4407{
4408 return __netdev_upper_dev_link(dev, upper_dev, true);
4409}
4410EXPORT_SYMBOL(netdev_master_upper_dev_link);
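
/*
 * Illustrative sketch (not part of this file): a bonding/team style master
 * would link each slave with the master variant so that
 * netdev_master_upper_dev_get() finds it first in the list.  The error
 * label is hypothetical.
 *
 *	err = netdev_master_upper_dev_link(slave_dev, master_dev);
 *	if (err)
 *		goto err_upper_link;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, master_dev);
 *
 * Note the argument order: the first argument is the lower (slave) device,
 * the second is the upper (master) device.
 */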
4411
4412/**
4413 * netdev_upper_dev_unlink - Removes a link to upper device
4414 * @dev: device
 4415 * @upper_dev: upper device to unlink
4416 *
 4417 * Removes a link to a device which is upper to this one. The caller must hold
4418 * the RTNL lock.
4419 */
4420void netdev_upper_dev_unlink(struct net_device *dev,
4421 struct net_device *upper_dev)
4422{
4423 struct netdev_upper *upper;
4424
4425 ASSERT_RTNL();
4426
4427 upper = __netdev_find_upper(dev, upper_dev);
4428 if (!upper)
4429 return;
4430 list_del_rcu(&upper->list);
4431 dev_put(upper_dev);
4432 kfree_rcu(upper, rcu);
4433}
4434EXPORT_SYMBOL(netdev_upper_dev_unlink);
4435
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004436static void dev_change_rx_flags(struct net_device *dev, int flags)
4437{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004438 const struct net_device_ops *ops = dev->netdev_ops;
4439
4440 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4441 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004442}
4443
Wang Chendad9b332008-06-18 01:48:28 -07004444static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07004445{
Eric Dumazetb536db92011-11-30 21:42:26 +00004446 unsigned int old_flags = dev->flags;
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06004447 kuid_t uid;
4448 kgid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07004449
Patrick McHardy24023452007-07-14 18:51:31 -07004450 ASSERT_RTNL();
4451
Wang Chendad9b332008-06-18 01:48:28 -07004452 dev->flags |= IFF_PROMISC;
4453 dev->promiscuity += inc;
4454 if (dev->promiscuity == 0) {
4455 /*
4456 * Avoid overflow.
 4457 * If inc causes overflow, leave promisc untouched and return an error.
4458 */
4459 if (inc < 0)
4460 dev->flags &= ~IFF_PROMISC;
4461 else {
4462 dev->promiscuity -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004463 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4464 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07004465 return -EOVERFLOW;
4466 }
4467 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004468 if (dev->flags != old_flags) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004469 pr_info("device %s %s promiscuous mode\n",
4470 dev->name,
4471 dev->flags & IFF_PROMISC ? "entered" : "left");
David Howells8192b0c2008-11-14 10:39:10 +11004472 if (audit_enabled) {
4473 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004474 audit_log(current->audit_context, GFP_ATOMIC,
4475 AUDIT_ANOM_PROMISCUOUS,
4476 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4477 dev->name, (dev->flags & IFF_PROMISC),
4478 (old_flags & IFF_PROMISC),
Eric W. Biedermane1760bd2012-09-10 22:39:43 -07004479 from_kuid(&init_user_ns, audit_get_loginuid(current)),
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06004480 from_kuid(&init_user_ns, uid),
4481 from_kgid(&init_user_ns, gid),
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004482 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11004483 }
Patrick McHardy24023452007-07-14 18:51:31 -07004484
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004485 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07004486 }
Wang Chendad9b332008-06-18 01:48:28 -07004487 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004488}
4489
Linus Torvalds1da177e2005-04-16 15:20:36 -07004490/**
4491 * dev_set_promiscuity - update promiscuity count on a device
4492 * @dev: device
4493 * @inc: modifier
4494 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07004495 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004496 * remains above zero the interface remains promiscuous. Once it hits zero
4497 * the device reverts back to normal filtering operation. A negative inc
4498 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07004499 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004500 */
Wang Chendad9b332008-06-18 01:48:28 -07004501int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004502{
Eric Dumazetb536db92011-11-30 21:42:26 +00004503 unsigned int old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07004504 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004505
Wang Chendad9b332008-06-18 01:48:28 -07004506 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07004507 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07004508 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07004509 if (dev->flags != old_flags)
4510 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07004511 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004512}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004513EXPORT_SYMBOL(dev_set_promiscuity);
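
/*
 * Illustrative sketch (not part of this file): packet-capture style users
 * bump the promiscuity count while they need to see all traffic and drop
 * it symmetrically when done, rather than toggling IFF_PROMISC directly.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);
 *	rtnl_unlock();
 *
 * The counter lets several independent users request promiscuity without
 * stepping on each other.
 */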
Linus Torvalds1da177e2005-04-16 15:20:36 -07004514
4515/**
4516 * dev_set_allmulti - update allmulti count on a device
4517 * @dev: device
4518 * @inc: modifier
4519 *
 4520 * Add or remove reception of all multicast frames on a device. While the
 4521 * count in the device remains above zero the interface remains listening
 4522 * to all multicast addresses. Once it hits zero the device reverts back to normal
4523 * filtering operation. A negative @inc value is used to drop the counter
4524 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07004525 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004526 */
4527
Wang Chendad9b332008-06-18 01:48:28 -07004528int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004529{
Eric Dumazetb536db92011-11-30 21:42:26 +00004530 unsigned int old_flags = dev->flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004531
Patrick McHardy24023452007-07-14 18:51:31 -07004532 ASSERT_RTNL();
4533
Linus Torvalds1da177e2005-04-16 15:20:36 -07004534 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07004535 dev->allmulti += inc;
4536 if (dev->allmulti == 0) {
4537 /*
4538 * Avoid overflow.
 4539 * If inc causes overflow, leave allmulti untouched and return an error.
4540 */
4541 if (inc < 0)
4542 dev->flags &= ~IFF_ALLMULTI;
4543 else {
4544 dev->allmulti -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004545 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4546 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07004547 return -EOVERFLOW;
4548 }
4549 }
Patrick McHardy24023452007-07-14 18:51:31 -07004550 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004551 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07004552 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07004553 }
Wang Chendad9b332008-06-18 01:48:28 -07004554 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004555}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004556EXPORT_SYMBOL(dev_set_allmulti);
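
/*
 * Illustrative sketch (not part of this file): a protocol that needs every
 * multicast frame (a routing daemon helper, for instance) would take and
 * release the allmulti count the same way, under RTNL.
 *
 *	rtnl_lock();
 *	err = dev_set_allmulti(dev, 1);
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_allmulti(dev, -1);
 *	rtnl_unlock();
 *
 * As with promiscuity, the reference count means callers must pair every
 * +1 with a later -1.
 */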
Patrick McHardy4417da62007-06-27 01:28:10 -07004557
4558/*
 4559 * Upload unicast and multicast address lists to the device and
4560 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08004561 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07004562 * are present.
4563 */
4564void __dev_set_rx_mode(struct net_device *dev)
4565{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004566 const struct net_device_ops *ops = dev->netdev_ops;
4567
Patrick McHardy4417da62007-06-27 01:28:10 -07004568 /* dev_open will call this function so the list will stay sane. */
4569 if (!(dev->flags&IFF_UP))
4570 return;
4571
4572 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09004573 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07004574
Jiri Pirko01789342011-08-16 06:29:00 +00004575 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004576 /* Unicast address changes may only happen under the rtnl,
4577 * therefore calling __dev_set_promiscuity here is safe.
4578 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004579 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004580 __dev_set_promiscuity(dev, 1);
Joe Perches2d348d12011-07-25 16:17:35 -07004581 dev->uc_promisc = true;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004582 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004583 __dev_set_promiscuity(dev, -1);
Joe Perches2d348d12011-07-25 16:17:35 -07004584 dev->uc_promisc = false;
Patrick McHardy4417da62007-06-27 01:28:10 -07004585 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004586 }
Jiri Pirko01789342011-08-16 06:29:00 +00004587
4588 if (ops->ndo_set_rx_mode)
4589 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004590}
4591
4592void dev_set_rx_mode(struct net_device *dev)
4593{
David S. Millerb9e40852008-07-15 00:15:08 -07004594 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004595 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07004596 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004597}
4598
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004599/**
4600 * dev_get_flags - get flags reported to userspace
4601 * @dev: device
4602 *
4603 * Get the combination of flag bits exported through APIs to userspace.
4604 */
Eric Dumazet95c96172012-04-15 05:58:06 +00004605unsigned int dev_get_flags(const struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004606{
Eric Dumazet95c96172012-04-15 05:58:06 +00004607 unsigned int flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004608
4609 flags = (dev->flags & ~(IFF_PROMISC |
4610 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08004611 IFF_RUNNING |
4612 IFF_LOWER_UP |
4613 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07004614 (dev->gflags & (IFF_PROMISC |
4615 IFF_ALLMULTI));
4616
Stefan Rompfb00055a2006-03-20 17:09:11 -08004617 if (netif_running(dev)) {
4618 if (netif_oper_up(dev))
4619 flags |= IFF_RUNNING;
4620 if (netif_carrier_ok(dev))
4621 flags |= IFF_LOWER_UP;
4622 if (netif_dormant(dev))
4623 flags |= IFF_DORMANT;
4624 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004625
4626 return flags;
4627}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004628EXPORT_SYMBOL(dev_get_flags);
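
/*
 * Illustrative sketch (not part of this file): code that reports interface
 * state can read the combined flags instead of poking at operstate bits
 * directly.
 *
 *	unsigned int flags = dev_get_flags(dev);
 *
 *	if ((flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
 *		... the link is administratively up and operational ...
 *
 * IFF_RUNNING, IFF_LOWER_UP and IFF_DORMANT are masked out of dev->flags
 * and recomputed here from the carrier and operstate.
 */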
Linus Torvalds1da177e2005-04-16 15:20:36 -07004629
Patrick McHardybd380812010-02-26 06:34:53 +00004630int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004631{
Eric Dumazetb536db92011-11-30 21:42:26 +00004632 unsigned int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004633 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004634
Patrick McHardy24023452007-07-14 18:51:31 -07004635 ASSERT_RTNL();
4636
Linus Torvalds1da177e2005-04-16 15:20:36 -07004637 /*
4638 * Set the flags on our device.
4639 */
4640
4641 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4642 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4643 IFF_AUTOMEDIA)) |
4644 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4645 IFF_ALLMULTI));
4646
4647 /*
4648 * Load in the correct multicast list now the flags have changed.
4649 */
4650
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004651 if ((old_flags ^ flags) & IFF_MULTICAST)
4652 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07004653
Patrick McHardy4417da62007-06-27 01:28:10 -07004654 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004655
4656 /*
 4657 * Have we downed the interface? We handle IFF_UP ourselves
4658 * according to user attempts to set it, rather than blindly
4659 * setting it.
4660 */
4661
4662 ret = 0;
4663 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00004664 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004665
4666 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07004667 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004668 }
4669
Linus Torvalds1da177e2005-04-16 15:20:36 -07004670 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004671 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4672
Linus Torvalds1da177e2005-04-16 15:20:36 -07004673 dev->gflags ^= IFF_PROMISC;
4674 dev_set_promiscuity(dev, inc);
4675 }
4676
4677 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 4678 is important. Some (broken) drivers set IFF_PROMISC when
 4679 IFF_ALLMULTI is requested, without asking us and without reporting it.
4680 */
4681 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004682 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4683
Linus Torvalds1da177e2005-04-16 15:20:36 -07004684 dev->gflags ^= IFF_ALLMULTI;
4685 dev_set_allmulti(dev, inc);
4686 }
4687
Patrick McHardybd380812010-02-26 06:34:53 +00004688 return ret;
4689}
4690
4691void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4692{
4693 unsigned int changes = dev->flags ^ old_flags;
4694
4695 if (changes & IFF_UP) {
4696 if (dev->flags & IFF_UP)
4697 call_netdevice_notifiers(NETDEV_UP, dev);
4698 else
4699 call_netdevice_notifiers(NETDEV_DOWN, dev);
4700 }
4701
4702 if (dev->flags & IFF_UP &&
4703 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4704 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4705}
4706
4707/**
4708 * dev_change_flags - change device settings
4709 * @dev: device
4710 * @flags: device state flags
4711 *
4712 * Change settings on device based state flags. The flags are
4713 * in the userspace exported format.
4714 */
Eric Dumazetb536db92011-11-30 21:42:26 +00004715int dev_change_flags(struct net_device *dev, unsigned int flags)
Patrick McHardybd380812010-02-26 06:34:53 +00004716{
Eric Dumazetb536db92011-11-30 21:42:26 +00004717 int ret;
4718 unsigned int changes, old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004719
4720 ret = __dev_change_flags(dev, flags);
4721 if (ret < 0)
4722 return ret;
4723
4724 changes = old_flags ^ dev->flags;
Thomas Graf7c355f52007-06-05 16:03:03 -07004725 if (changes)
4726 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004727
Patrick McHardybd380812010-02-26 06:34:53 +00004728 __dev_notify_flags(dev, old_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004729 return ret;
4730}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004731EXPORT_SYMBOL(dev_change_flags);
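
/*
 * Illustrative sketch (not part of this file): bringing an interface up
 * from kernel code usually goes through this helper so notifiers and
 * rtnetlink messages fire consistently.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
 *	rtnl_unlock();
 *
 * Clearing IFF_UP the same way closes the device, since __dev_change_flags()
 * maps the IFF_UP transition onto __dev_open()/__dev_close().
 */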
Linus Torvalds1da177e2005-04-16 15:20:36 -07004732
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004733/**
4734 * dev_set_mtu - Change maximum transfer unit
4735 * @dev: device
4736 * @new_mtu: new transfer unit
4737 *
4738 * Change the maximum transfer size of the network device.
4739 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004740int dev_set_mtu(struct net_device *dev, int new_mtu)
4741{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004742 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004743 int err;
4744
4745 if (new_mtu == dev->mtu)
4746 return 0;
4747
4748 /* MTU must be positive. */
4749 if (new_mtu < 0)
4750 return -EINVAL;
4751
4752 if (!netif_device_present(dev))
4753 return -ENODEV;
4754
4755 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004756 if (ops->ndo_change_mtu)
4757 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004758 else
4759 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004760
Jiri Pirkoe3d8fab2012-12-03 01:16:32 +00004761 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004762 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004763 return err;
4764}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004765EXPORT_SYMBOL(dev_set_mtu);
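
/*
 * Illustrative sketch (not part of this file): a stacking driver that must
 * keep its own MTU within the lower device's limit could clamp and apply
 * it like this; lower_dev and wanted_mtu are hypothetical.
 *
 *	int mtu = min_t(int, wanted_mtu, lower_dev->mtu);
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, mtu);
 *	rtnl_unlock();
 *
 * On success the NETDEV_CHANGEMTU notifier fires, so upper layers such as
 * routes can react to the new value.
 */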
Linus Torvalds1da177e2005-04-16 15:20:36 -07004766
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004767/**
Vlad Dogarucbda10f2011-01-13 23:38:30 +00004768 * dev_set_group - Change group this device belongs to
4769 * @dev: device
4770 * @new_group: group this device should belong to
4771 */
4772void dev_set_group(struct net_device *dev, int new_group)
4773{
4774 dev->group = new_group;
4775}
4776EXPORT_SYMBOL(dev_set_group);
4777
4778/**
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004779 * dev_set_mac_address - Change Media Access Control Address
4780 * @dev: device
4781 * @sa: new address
4782 *
4783 * Change the hardware (MAC) address of the device
4784 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004785int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4786{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004787 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004788 int err;
4789
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004790 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004791 return -EOPNOTSUPP;
4792 if (sa->sa_family != dev->type)
4793 return -EINVAL;
4794 if (!netif_device_present(dev))
4795 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004796 err = ops->ndo_set_mac_address(dev, sa);
Jiri Pirkof6521512013-01-01 03:30:14 +00004797 if (err)
4798 return err;
Jiri Pirkofbdeca22013-01-01 03:30:16 +00004799 dev->addr_assign_type = NET_ADDR_SET;
Jiri Pirkof6521512013-01-01 03:30:14 +00004800 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04004801 add_device_randomness(dev->dev_addr, dev->addr_len);
Jiri Pirkof6521512013-01-01 03:30:14 +00004802 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004803}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004804EXPORT_SYMBOL(dev_set_mac_address);
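
/*
 * Illustrative sketch (not part of this file): changing a MAC from kernel
 * code means filling a struct sockaddr whose family matches dev->type.
 * new_mac is a hypothetical ETH_ALEN byte array.
 *
 *	struct sockaddr sa;
 *	int err;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 *
 * A successful change marks the address NET_ADDR_SET and raises
 * NETDEV_CHANGEADDR, which is also what the SIOCSIFHWADDR ioctl path
 * ends up doing.
 */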
Linus Torvalds1da177e2005-04-16 15:20:36 -07004805
Jiri Pirko4bf84c32012-12-27 23:49:37 +00004806/**
4807 * dev_change_carrier - Change device carrier
4808 * @dev: device
Randy Dunlap691b3b72013-03-04 12:32:43 +00004809 * @new_carrier: new value
Jiri Pirko4bf84c32012-12-27 23:49:37 +00004810 *
4811 * Change device carrier
4812 */
4813int dev_change_carrier(struct net_device *dev, bool new_carrier)
4814{
4815 const struct net_device_ops *ops = dev->netdev_ops;
4816
4817 if (!ops->ndo_change_carrier)
4818 return -EOPNOTSUPP;
4819 if (!netif_device_present(dev))
4820 return -ENODEV;
4821 return ops->ndo_change_carrier(dev, new_carrier);
4822}
4823EXPORT_SYMBOL(dev_change_carrier);
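
/*
 * Illustrative sketch (not part of this file): this helper sits behind the
 * rtnetlink IFLA_CARRIER attribute; a management path could force carrier
 * the same way for devices whose driver implements ndo_change_carrier.
 *
 *	rtnl_lock();
 *	err = dev_change_carrier(dev, false);
 *	rtnl_unlock();
 *
 * Drivers without the ndo return -EOPNOTSUPP, so callers should treat that
 * as "not supported" rather than as a hard failure.
 */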
4824
Linus Torvalds1da177e2005-04-16 15:20:36 -07004825/**
4826 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004827 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004828 *
4829 * Returns a suitable unique value for a new device interface
4830 * number. The caller must hold the rtnl semaphore or the
4831 * dev_base_lock to be sure it remains unique.
4832 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07004833static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004834{
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00004835 int ifindex = net->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004836 for (;;) {
4837 if (++ifindex <= 0)
4838 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004839 if (!__dev_get_by_index(net, ifindex))
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00004840 return net->ifindex = ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004841 }
4842}
4843
Linus Torvalds1da177e2005-04-16 15:20:36 -07004844/* Delayed registration/unregistration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08004845static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004846
Stephen Hemminger6f05f622007-03-08 20:46:03 -08004847static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004848{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004849 list_add_tail(&dev->todo_list, &net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004850}
4851
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004852static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004853{
Krishna Kumare93737b2009-12-08 22:26:02 +00004854 struct net_device *dev, *tmp;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004855
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004856 BUG_ON(dev_boot_phase);
4857 ASSERT_RTNL();
4858
Krishna Kumare93737b2009-12-08 22:26:02 +00004859 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004860 /* Some devices call this without ever having registered,
Krishna Kumare93737b2009-12-08 22:26:02 +00004861 * as part of initialization unwind. Remove those
 4862 * devices and proceed with the remaining ones.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004863 */
4864 if (dev->reg_state == NETREG_UNINITIALIZED) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004865 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4866 dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004867
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004868 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00004869 list_del(&dev->unreg_list);
4870 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004871 }
Eric Dumazet449f4542011-05-19 12:24:16 +00004872 dev->dismantle = true;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004873 BUG_ON(dev->reg_state != NETREG_REGISTERED);
Octavian Purdila44345722010-12-13 12:44:07 +00004874 }
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004875
Octavian Purdila44345722010-12-13 12:44:07 +00004876 /* If device is running, close it first. */
4877 dev_close_many(head);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004878
Octavian Purdila44345722010-12-13 12:44:07 +00004879 list_for_each_entry(dev, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004880 /* And unlink it from device chain. */
4881 unlist_netdevice(dev);
4882
4883 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004884 }
4885
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004886 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004887
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004888 list_for_each_entry(dev, head, unreg_list) {
4889 /* Shutdown queueing discipline. */
4890 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004891
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004893 /* Notify protocols that we are about to destroy
 4894 this device. They should clean up all of their state.
4895 */
4896 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4897
Patrick McHardya2835762010-02-26 06:34:51 +00004898 if (!dev->rtnl_link_ops ||
4899 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4900 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4901
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004902 /*
4903 * Flush the unicast and multicast chains
4904 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00004905 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00004906 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004907
4908 if (dev->netdev_ops->ndo_uninit)
4909 dev->netdev_ops->ndo_uninit(dev);
4910
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004911 /* The notifier chain MUST have detached all upper devices from us. */
4912 WARN_ON(netdev_has_any_upper_dev(dev));
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004913
4914 /* Remove entries from kobject tree */
4915 netdev_unregister_kobject(dev);
Alexander Duyck024e9672013-01-10 08:57:46 +00004916#ifdef CONFIG_XPS
4917 /* Remove XPS queueing entries */
4918 netif_reset_xps_queues_gt(dev, 0);
4919#endif
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004920 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004921
Eric W. Biederman850a5452011-10-13 22:25:23 +00004922 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004923
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004924 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004925 dev_put(dev);
4926}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004927
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004928static void rollback_registered(struct net_device *dev)
4929{
4930 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004931
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004932 list_add(&dev->unreg_list, &single);
4933 rollback_registered_many(&single);
Eric Dumazetceaaec92011-02-17 22:59:19 +00004934 list_del(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004935}
4936
Michał Mirosławc8f44af2011-11-15 15:29:55 +00004937static netdev_features_t netdev_fix_features(struct net_device *dev,
4938 netdev_features_t features)
Herbert Xub63365a2008-10-23 01:11:29 -07004939{
Michał Mirosław57422dc2011-01-22 12:14:12 +00004940 /* Fix illegal checksum combinations */
4941 if ((features & NETIF_F_HW_CSUM) &&
4942 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04004943 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
Michał Mirosław57422dc2011-01-22 12:14:12 +00004944 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4945 }
4946
Herbert Xub63365a2008-10-23 01:11:29 -07004947 /* TSO requires that SG is present as well. */
Ben Hutchingsea2d3682011-04-12 14:38:37 +00004948 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04004949 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
Ben Hutchingsea2d3682011-04-12 14:38:37 +00004950 features &= ~NETIF_F_ALL_TSO;
Herbert Xub63365a2008-10-23 01:11:29 -07004951 }
4952
Pravin B Shelarec5f0612013-03-07 09:28:01 +00004953 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
4954 !(features & NETIF_F_IP_CSUM)) {
4955 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
4956 features &= ~NETIF_F_TSO;
4957 features &= ~NETIF_F_TSO_ECN;
4958 }
4959
4960 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
4961 !(features & NETIF_F_IPV6_CSUM)) {
4962 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
4963 features &= ~NETIF_F_TSO6;
4964 }
4965
Ben Hutchings31d8b9e2011-04-12 14:47:15 +00004966 /* TSO ECN requires that TSO is present as well. */
4967 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
4968 features &= ~NETIF_F_TSO_ECN;
4969
Michał Mirosław212b5732011-02-15 16:59:16 +00004970 /* Software GSO depends on SG. */
4971 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04004972 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
Michał Mirosław212b5732011-02-15 16:59:16 +00004973 features &= ~NETIF_F_GSO;
4974 }
4975
Michał Mirosławacd11302011-01-24 15:45:15 -08004976 /* UFO needs SG and checksumming */
Herbert Xub63365a2008-10-23 01:11:29 -07004977 if (features & NETIF_F_UFO) {
Michał Mirosław79032642010-11-30 06:38:00 +00004978 /* maybe split UFO into V4 and V6? */
4979 if (!((features & NETIF_F_GEN_CSUM) ||
4980 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
4981 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04004982 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08004983 "Dropping NETIF_F_UFO since no checksum offload features.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07004984 features &= ~NETIF_F_UFO;
4985 }
4986
4987 if (!(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04004988 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08004989 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07004990 features &= ~NETIF_F_UFO;
4991 }
4992 }
4993
4994 return features;
4995}
Herbert Xub63365a2008-10-23 01:11:29 -07004996
Michał Mirosław6cb6a272011-04-02 22:48:47 -07004997int __netdev_update_features(struct net_device *dev)
Michał Mirosław5455c692011-02-15 16:59:17 +00004998{
Michał Mirosławc8f44af2011-11-15 15:29:55 +00004999 netdev_features_t features;
Michał Mirosław5455c692011-02-15 16:59:17 +00005000 int err = 0;
5001
Michał Mirosław87267482011-04-12 09:56:38 +00005002 ASSERT_RTNL();
5003
Michał Mirosław5455c692011-02-15 16:59:17 +00005004 features = netdev_get_wanted_features(dev);
5005
5006 if (dev->netdev_ops->ndo_fix_features)
5007 features = dev->netdev_ops->ndo_fix_features(dev, features);
5008
5009 /* driver might be less strict about feature dependencies */
5010 features = netdev_fix_features(dev, features);
5011
5012 if (dev->features == features)
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005013 return 0;
Michał Mirosław5455c692011-02-15 16:59:17 +00005014
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005015 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5016 &dev->features, &features);
Michał Mirosław5455c692011-02-15 16:59:17 +00005017
5018 if (dev->netdev_ops->ndo_set_features)
5019 err = dev->netdev_ops->ndo_set_features(dev, features);
5020
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005021 if (unlikely(err < 0)) {
Michał Mirosław5455c692011-02-15 16:59:17 +00005022 netdev_err(dev,
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005023 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5024 err, &features, &dev->features);
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005025 return -1;
5026 }
5027
5028 if (!err)
5029 dev->features = features;
5030
5031 return 1;
5032}
5033
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005034/**
5035 * netdev_update_features - recalculate device features
5036 * @dev: the device to check
5037 *
5038 * Recalculate dev->features set and send notifications if it
5039 * has changed. Should be called after driver or hardware dependent
5040 * conditions might have changed that influence the features.
5041 */
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005042void netdev_update_features(struct net_device *dev)
5043{
5044 if (__netdev_update_features(dev))
5045 netdev_features_change(dev);
Michał Mirosław5455c692011-02-15 16:59:17 +00005046}
5047EXPORT_SYMBOL(netdev_update_features);
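
/*
 * Illustrative sketch (not part of this file): a driver whose
 * ndo_fix_features() result depends on some internal state should call
 * netdev_update_features() after that state changes, with RTNL held
 * (an ethtool operation, for instance).  example_priv and its
 * loopback_mode field are hypothetical.
 *
 *	static int example_set_loopback(struct net_device *netdev, bool on)
 *	{
 *		struct example_priv *priv = netdev_priv(netdev);
 *
 *		priv->loopback_mode = on;
 *		netdev_update_features(netdev);
 *		return 0;
 *	}
 *
 * The helper recomputes dev->features and sends NETDEV_FEAT_CHANGE only
 * when something actually changed.
 */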
5048
Linus Torvalds1da177e2005-04-16 15:20:36 -07005049/**
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005050 * netdev_change_features - recalculate device features
5051 * @dev: the device to check
5052 *
5053 * Recalculate dev->features set and send notifications even
5054 * if they have not changed. Should be called instead of
5055 * netdev_update_features() if also dev->vlan_features might
5056 * have changed to allow the changes to be propagated to stacked
5057 * VLAN devices.
5058 */
5059void netdev_change_features(struct net_device *dev)
5060{
5061 __netdev_update_features(dev);
5062 netdev_features_change(dev);
5063}
5064EXPORT_SYMBOL(netdev_change_features);
5065
5066/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005067 * netif_stacked_transfer_operstate - transfer operstate
5068 * @rootdev: the root or lower level device to transfer state from
5069 * @dev: the device to transfer operstate to
5070 *
5071 * Transfer operational state from root to device. This is normally
5072 * called when a stacking relationship exists between the root
5073 * device and the device(a leaf device).
5074 */
5075void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5076 struct net_device *dev)
5077{
5078 if (rootdev->operstate == IF_OPER_DORMANT)
5079 netif_dormant_on(dev);
5080 else
5081 netif_dormant_off(dev);
5082
5083 if (netif_carrier_ok(rootdev)) {
5084 if (!netif_carrier_ok(dev))
5085 netif_carrier_on(dev);
5086 } else {
5087 if (netif_carrier_ok(dev))
5088 netif_carrier_off(dev);
5089 }
5090}
5091EXPORT_SYMBOL(netif_stacked_transfer_operstate);
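
/* Usage sketch (illustration only): a stacking driver mirroring its lower
 * device's state from a netdevice notifier. my_find_upper() is a hypothetical
 * driver-private lookup; on this kernel the notifier's ptr argument is the
 * net_device itself.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *lower = ptr;
 *		struct net_device *upper = my_find_upper(lower);
 *
 *		if (upper && event == NETDEV_CHANGE)
 *			netif_stacked_transfer_operstate(lower, upper);
 *		return NOTIFY_DONE;
 *	}
 */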
5092
Tom Herbertbf264142010-11-26 08:36:09 +00005093#ifdef CONFIG_RPS
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005094static int netif_alloc_rx_queues(struct net_device *dev)
5095{
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005096 unsigned int i, count = dev->num_rx_queues;
Tom Herbertbd25fa72010-10-18 18:00:16 +00005097 struct netdev_rx_queue *rx;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005098
Tom Herbertbd25fa72010-10-18 18:00:16 +00005099 BUG_ON(count < 1);
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005100
Tom Herbertbd25fa72010-10-18 18:00:16 +00005101 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005102 if (!rx)
Tom Herbertbd25fa72010-10-18 18:00:16 +00005103 return -ENOMEM;
Joe Perches62b59422013-02-04 16:48:16 +00005104
Tom Herbertbd25fa72010-10-18 18:00:16 +00005105 dev->_rx = rx;
5106
Tom Herbertbd25fa72010-10-18 18:00:16 +00005107 for (i = 0; i < count; i++)
Tom Herbertfe822242010-11-09 10:47:38 +00005108 rx[i].dev = dev;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005109 return 0;
5110}
Tom Herbertbf264142010-11-26 08:36:09 +00005111#endif
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005112
Changli Gaoaa942102010-12-04 02:31:41 +00005113static void netdev_init_one_queue(struct net_device *dev,
5114 struct netdev_queue *queue, void *_unused)
5115{
5116 /* Initialize queue lock */
5117 spin_lock_init(&queue->_xmit_lock);
5118 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5119 queue->xmit_lock_owner = -1;
Changli Gaob236da62010-12-14 03:09:15 +00005120 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
Changli Gaoaa942102010-12-04 02:31:41 +00005121 queue->dev = dev;
Tom Herbert114cf582011-11-28 16:33:09 +00005122#ifdef CONFIG_BQL
5123 dql_init(&queue->dql, HZ);
5124#endif
Changli Gaoaa942102010-12-04 02:31:41 +00005125}
5126
Tom Herberte6484932010-10-18 18:04:39 +00005127static int netif_alloc_netdev_queues(struct net_device *dev)
5128{
5129 unsigned int count = dev->num_tx_queues;
5130 struct netdev_queue *tx;
5131
5132 BUG_ON(count < 1);
5133
5134 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005135 if (!tx)
Tom Herberte6484932010-10-18 18:04:39 +00005136 return -ENOMEM;
Joe Perches62b59422013-02-04 16:48:16 +00005137
Tom Herberte6484932010-10-18 18:04:39 +00005138 dev->_tx = tx;
Tom Herbert1d24eb42010-11-21 13:17:27 +00005139
Tom Herberte6484932010-10-18 18:04:39 +00005140 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5141 spin_lock_init(&dev->tx_global_lock);
Changli Gaoaa942102010-12-04 02:31:41 +00005142
5143 return 0;
Tom Herberte6484932010-10-18 18:04:39 +00005144}
5145
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005146/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005147 * register_netdevice - register a network device
5148 * @dev: device to register
5149 *
5150 * Take a completed network device structure and add it to the kernel
5151 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5152 * chain. 0 is returned on success. A negative errno code is returned
5153 * on a failure to set up the device, or if the name is a duplicate.
5154 *
5155 * Callers must hold the rtnl semaphore. You may want
5156 * register_netdev() instead of this.
5157 *
5158 * BUGS:
5159 * The locking appears insufficient to guarantee two parallel registers
5160 * will not get the same name.
5161 */
5162
5163int register_netdevice(struct net_device *dev)
5164{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005165 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005166 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005167
5168 BUG_ON(dev_boot_phase);
5169 ASSERT_RTNL();
5170
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005171 might_sleep();
5172
Linus Torvalds1da177e2005-04-16 15:20:36 -07005173 /* When net_device's are persistent, this will be fatal. */
5174 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005175 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005176
David S. Millerf1f28aa2008-07-15 00:08:33 -07005177 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07005178 netdev_set_addr_lockdep_class(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005179
Linus Torvalds1da177e2005-04-16 15:20:36 -07005180 dev->iflink = -1;
5181
Gao feng828de4f2012-09-13 20:58:27 +00005182 ret = dev_get_valid_name(net, dev, dev->name);
Peter Pan(潘卫平)0696c3a2011-05-12 15:46:56 +00005183 if (ret < 0)
5184 goto out;
5185
Linus Torvalds1da177e2005-04-16 15:20:36 -07005186 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005187 if (dev->netdev_ops->ndo_init) {
5188 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005189 if (ret) {
5190 if (ret > 0)
5191 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08005192 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005193 }
5194 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005195
Patrick McHardyf6469682013-04-19 02:04:27 +00005196 if (((dev->hw_features | dev->features) &
5197 NETIF_F_HW_VLAN_CTAG_FILTER) &&
Michał Mirosławd2ed2732013-01-29 15:14:16 +00005198 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5199 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5200 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5201 ret = -EINVAL;
5202 goto err_uninit;
5203 }
5204
Pavel Emelyanov9c7dafb2012-08-08 21:52:46 +00005205 ret = -EBUSY;
5206 if (!dev->ifindex)
5207 dev->ifindex = dev_new_index(net);
5208 else if (__dev_get_by_index(net, dev->ifindex))
5209 goto err_uninit;
5210
Linus Torvalds1da177e2005-04-16 15:20:36 -07005211 if (dev->iflink == -1)
5212 dev->iflink = dev->ifindex;
5213
Michał Mirosław5455c692011-02-15 16:59:17 +00005214 /* Transfer changeable features to wanted_features and enable
5215 * software offloads (GSO and GRO).
5216 */
5217 dev->hw_features |= NETIF_F_SOFT_FEATURES;
Michał Mirosław14d12322011-02-22 16:52:28 +00005218 dev->features |= NETIF_F_SOFT_FEATURES;
5219 dev->wanted_features = dev->features & dev->hw_features;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005220
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005221 /* Turn on no cache copy if HW is doing checksum */
Michał Mirosław34324dc2011-11-15 15:29:55 +00005222 if (!(dev->flags & IFF_LOOPBACK)) {
5223 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5224 if (dev->features & NETIF_F_ALL_CSUM) {
5225 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5226 dev->features |= NETIF_F_NOCACHE_COPY;
5227 }
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005228 }
5229
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005230 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
Brandon Philips16c3ea72010-09-15 09:24:24 +00005231 */
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005232 dev->vlan_features |= NETIF_F_HIGHDMA;
Brandon Philips16c3ea72010-09-15 09:24:24 +00005233
Pravin B Shelaree579672013-03-07 09:28:08 +00005234 /* Make NETIF_F_SG inheritable to tunnel devices.
5235 */
5236 dev->hw_enc_features |= NETIF_F_SG;
5237
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005238 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5239 ret = notifier_to_errno(ret);
5240 if (ret)
5241 goto err_uninit;
5242
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005243 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005244 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005245 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005246 dev->reg_state = NETREG_REGISTERED;
5247
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005248 __netdev_update_features(dev);
Michał Mirosław8e9b59b2011-02-22 16:52:28 +00005249
Linus Torvalds1da177e2005-04-16 15:20:36 -07005250 /*
5251 * Default initial state at registry is that the
5252 * device is present.
5253 */
5254
5255 set_bit(__LINK_STATE_PRESENT, &dev->state);
5256
Ben Hutchings8f4cccb2012-08-20 22:16:51 +01005257 linkwatch_init_dev(dev);
5258
Linus Torvalds1da177e2005-04-16 15:20:36 -07005259 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005260 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005261 list_netdevice(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04005262 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005263
Jiri Pirko948b3372013-01-08 01:38:25 +00005264	/* If the device has a permanent device address, the driver should
5265	 * set dev_addr, and addr_assign_type should be set to
5266	 * NET_ADDR_PERM (the default value).
5267 */
5268 if (dev->addr_assign_type == NET_ADDR_PERM)
5269 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5270
Linus Torvalds1da177e2005-04-16 15:20:36 -07005271	/* Notify protocols that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005272 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005273 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005274 if (ret) {
5275 rollback_registered(dev);
5276 dev->reg_state = NETREG_UNREGISTERED;
5277 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005278 /*
5279 * Prevent userspace races by waiting until the network
5280 * device is fully setup before sending notifications.
5281 */
Patrick McHardya2835762010-02-26 06:34:51 +00005282 if (!dev->rtnl_link_ops ||
5283 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5284 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005285
5286out:
5287 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005288
5289err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005290 if (dev->netdev_ops->ndo_uninit)
5291 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005292 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005293}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005294EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005295
5296/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005297 * init_dummy_netdev - init a dummy network device for NAPI
5298 * @dev: device to init
5299 *
5300 * This takes a network device structure and initializes the minimum
5301 * set of fields so it can be used to schedule NAPI polls without
5302 * registering a full blown interface. This is to be used by drivers
5303 * that need to tie several hardware interfaces to a single NAPI
5304 * poll scheduler due to HW limitations.
5305 */
5306int init_dummy_netdev(struct net_device *dev)
5307{
5308 /* Clear everything. Note we don't initialize spinlocks
5309	 * as they aren't supposed to be taken by any of the
5310 * NAPI code and this dummy netdev is supposed to be
5311 * only ever used for NAPI polls
5312 */
5313 memset(dev, 0, sizeof(struct net_device));
5314
5315 /* make sure we BUG if trying to hit standard
5316 * register/unregister code path
5317 */
5318 dev->reg_state = NETREG_DUMMY;
5319
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005320 /* NAPI wants this */
5321 INIT_LIST_HEAD(&dev->napi_list);
5322
5323 /* a dummy interface is started by default */
5324 set_bit(__LINK_STATE_PRESENT, &dev->state);
5325 set_bit(__LINK_STATE_START, &dev->state);
5326
Eric Dumazet29b44332010-10-11 10:22:12 +00005327	/* Note : We don't allocate pcpu_refcnt for dummy devices,
5328	 * because users of this 'device' don't need to change
5329 * its refcount.
5330 */
5331
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005332 return 0;
5333}
5334EXPORT_SYMBOL_GPL(init_dummy_netdev);
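
/* Usage sketch (illustration only): tying several hardware rings to one NAPI
 * context through a dummy, never-registered netdev. The "myhw_" names and the
 * weight of 64 are hypothetical.
 *
 *	struct myhw {
 *		struct net_device napi_dev;	// dummy, never registered
 *		struct napi_struct napi;
 *	};
 *
 *	static void myhw_napi_setup(struct myhw *hw)
 *	{
 *		init_dummy_netdev(&hw->napi_dev);
 *		netif_napi_add(&hw->napi_dev, &hw->napi, myhw_poll, 64);
 *		napi_enable(&hw->napi);
 *	}
 */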
5335
5336
5337/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005338 * register_netdev - register a network device
5339 * @dev: device to register
5340 *
5341 * Take a completed network device structure and add it to the kernel
5342 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5343 * chain. 0 is returned on success. A negative errno code is returned
5344 * on a failure to set up the device, or if the name is a duplicate.
5345 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005346 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005347 * and expands the device name if you passed a format string to
5348 * alloc_netdev.
5349 */
5350int register_netdev(struct net_device *dev)
5351{
5352 int err;
5353
5354 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005355 err = register_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005356 rtnl_unlock();
5357 return err;
5358}
5359EXPORT_SYMBOL(register_netdev);
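
/* Usage sketch (illustration only): the common probe-time pairing of
 * allocation and registration. The "myeth_" names are hypothetical; on
 * failure the device must still be released with free_netdev().
 *
 *	static int myeth_probe(struct pci_dev *pdev,
 *			       const struct pci_device_id *id)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_etherdev(sizeof(struct myeth_priv));
 *		if (!dev)
 *			return -ENOMEM;
 *		dev->netdev_ops = &myeth_netdev_ops;
 *		err = register_netdev(dev);	// takes/releases the RTNL lock
 *		if (err)
 *			free_netdev(dev);
 *		return err;
 *	}
 */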
5360
Eric Dumazet29b44332010-10-11 10:22:12 +00005361int netdev_refcnt_read(const struct net_device *dev)
5362{
5363 int i, refcnt = 0;
5364
5365 for_each_possible_cpu(i)
5366 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5367 return refcnt;
5368}
5369EXPORT_SYMBOL(netdev_refcnt_read);
5370
Ben Hutchings2c530402012-07-10 10:55:09 +00005371/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005372 * netdev_wait_allrefs - wait until all references are gone.
Randy Dunlap3de7a372012-08-18 14:36:44 +00005373 * @dev: target net_device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005374 *
5375 * This is called when unregistering network devices.
5376 *
5377 * Any protocol or device that holds a reference should register
5378 * for netdevice notification, and cleanup and put back the
5379 * reference if they receive an UNREGISTER event.
5380 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005381 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005382 */
5383static void netdev_wait_allrefs(struct net_device *dev)
5384{
5385 unsigned long rebroadcast_time, warning_time;
Eric Dumazet29b44332010-10-11 10:22:12 +00005386 int refcnt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005387
Eric Dumazete014deb2009-11-17 05:59:21 +00005388 linkwatch_forget_dev(dev);
5389
Linus Torvalds1da177e2005-04-16 15:20:36 -07005390 rebroadcast_time = warning_time = jiffies;
Eric Dumazet29b44332010-10-11 10:22:12 +00005391 refcnt = netdev_refcnt_read(dev);
5392
5393 while (refcnt != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005394 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005395 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005396
5397 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005398 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005399
Eric Dumazet748e2d92012-08-22 21:50:59 +00005400 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005401 rcu_barrier();
Eric Dumazet748e2d92012-08-22 21:50:59 +00005402 rtnl_lock();
5403
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005404 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005405 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5406 &dev->state)) {
5407 /* We must not have linkwatch events
5408 * pending on unregister. If this
5409 * happens, we simply run the queue
5410 * unscheduled, resulting in a noop
5411 * for this device.
5412 */
5413 linkwatch_run_queue();
5414 }
5415
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005416 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005417
5418 rebroadcast_time = jiffies;
5419 }
5420
5421 msleep(250);
5422
Eric Dumazet29b44332010-10-11 10:22:12 +00005423 refcnt = netdev_refcnt_read(dev);
5424
Linus Torvalds1da177e2005-04-16 15:20:36 -07005425 if (time_after(jiffies, warning_time + 10 * HZ)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005426 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5427 dev->name, refcnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005428 warning_time = jiffies;
5429 }
5430 }
5431}
5432
5433/* The sequence is:
5434 *
5435 * rtnl_lock();
5436 * ...
5437 * register_netdevice(x1);
5438 * register_netdevice(x2);
5439 * ...
5440 * unregister_netdevice(y1);
5441 * unregister_netdevice(y2);
5442 * ...
5443 * rtnl_unlock();
5444 * free_netdev(y1);
5445 * free_netdev(y2);
5446 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07005447 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07005448 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005449 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07005450 * without deadlocking with linkwatch via keventd.
5451 * 2) Since we run with the RTNL semaphore not held, we can sleep
5452 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07005453 *
5454 * We must not return until all unregister events added during
5455 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005456 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005457void netdev_run_todo(void)
5458{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005459 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005460
Linus Torvalds1da177e2005-04-16 15:20:36 -07005461 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005462 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07005463
5464 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005465
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005466
5467 /* Wait for rcu callbacks to finish before next phase */
Eric W. Biederman850a5452011-10-13 22:25:23 +00005468 if (!list_empty(&list))
5469 rcu_barrier();
5470
Linus Torvalds1da177e2005-04-16 15:20:36 -07005471 while (!list_empty(&list)) {
5472 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00005473 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005474 list_del(&dev->todo_list);
5475
Eric Dumazet748e2d92012-08-22 21:50:59 +00005476 rtnl_lock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005477 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric Dumazet748e2d92012-08-22 21:50:59 +00005478 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005479
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005480 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005481 pr_err("network todo '%s' but state %d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07005482 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005483 dump_stack();
5484 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005485 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005486
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005487 dev->reg_state = NETREG_UNREGISTERED;
5488
Changli Gao152102c2010-03-30 20:16:22 +00005489 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07005490
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005491 netdev_wait_allrefs(dev);
5492
5493 /* paranoia */
Eric Dumazet29b44332010-10-11 10:22:12 +00005494 BUG_ON(netdev_refcnt_read(dev));
Eric Dumazet33d480c2011-08-11 19:30:52 +00005495 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5496 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07005497 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005498
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005499 if (dev->destructor)
5500 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07005501
5502 /* Free network device */
5503 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005504 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005505}
5506
Ben Hutchings3cfde792010-07-09 09:11:52 +00005507/* Convert net_device_stats to rtnl_link_stats64. They have the same
5508 * fields in the same order, with only the type differing.
5509 */
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005510void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5511 const struct net_device_stats *netdev_stats)
Ben Hutchings3cfde792010-07-09 09:11:52 +00005512{
5513#if BITS_PER_LONG == 64
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005514 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5515 memcpy(stats64, netdev_stats, sizeof(*stats64));
Ben Hutchings3cfde792010-07-09 09:11:52 +00005516#else
5517 size_t i, n = sizeof(*stats64) / sizeof(u64);
5518 const unsigned long *src = (const unsigned long *)netdev_stats;
5519 u64 *dst = (u64 *)stats64;
5520
5521 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5522 sizeof(*stats64) / sizeof(u64));
5523 for (i = 0; i < n; i++)
5524 dst[i] = src[i];
5525#endif
5526}
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005527EXPORT_SYMBOL(netdev_stats_to_stats64);
Ben Hutchings3cfde792010-07-09 09:11:52 +00005528
Eric Dumazetd83345a2009-11-16 03:36:51 +00005529/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005530 * dev_get_stats - get network device statistics
5531 * @dev: device to get statistics from
Eric Dumazet28172732010-07-07 14:58:56 -07005532 * @storage: place to store stats
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005533 *
Ben Hutchingsd7753512010-07-09 09:12:41 +00005534 * Get network statistics from device. Return @storage.
5535 * The device driver may provide its own method by setting
5536 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5537 * otherwise the internal statistics structure is used.
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005538 */
Ben Hutchingsd7753512010-07-09 09:12:41 +00005539struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5540 struct rtnl_link_stats64 *storage)
Eric Dumazet7004bf22009-05-18 00:34:33 +00005541{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005542 const struct net_device_ops *ops = dev->netdev_ops;
5543
Eric Dumazet28172732010-07-07 14:58:56 -07005544 if (ops->ndo_get_stats64) {
5545 memset(storage, 0, sizeof(*storage));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005546 ops->ndo_get_stats64(dev, storage);
5547 } else if (ops->ndo_get_stats) {
Ben Hutchings3cfde792010-07-09 09:11:52 +00005548 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005549 } else {
5550 netdev_stats_to_stats64(storage, &dev->stats);
Eric Dumazet28172732010-07-07 14:58:56 -07005551 }
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005552 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
Eric Dumazet28172732010-07-07 14:58:56 -07005553 return storage;
Rusty Russellc45d2862007-03-28 14:29:08 -07005554}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005555EXPORT_SYMBOL(dev_get_stats);
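
/* Usage sketch (illustration only): snapshotting counters for a device. The
 * caller supplies the rtnl_link_stats64 storage, so the call works the same
 * whether the driver implements ndo_get_stats64, ndo_get_stats, or neither.
 *
 *	struct rtnl_link_stats64 st;
 *
 *	dev_get_stats(dev, &st);
 *	netdev_info(dev, "rx %llu packets, tx %llu packets\n",
 *		    st.rx_packets, st.tx_packets);
 */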
Rusty Russellc45d2862007-03-28 14:29:08 -07005556
Eric Dumazet24824a02010-10-02 06:11:55 +00005557struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
David S. Millerdc2b4842008-07-08 17:18:23 -07005558{
Eric Dumazet24824a02010-10-02 06:11:55 +00005559 struct netdev_queue *queue = dev_ingress_queue(dev);
David S. Millerdc2b4842008-07-08 17:18:23 -07005560
Eric Dumazet24824a02010-10-02 06:11:55 +00005561#ifdef CONFIG_NET_CLS_ACT
5562 if (queue)
5563 return queue;
5564 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5565 if (!queue)
5566 return NULL;
5567 netdev_init_one_queue(dev, queue, NULL);
Eric Dumazet24824a02010-10-02 06:11:55 +00005568 queue->qdisc = &noop_qdisc;
5569 queue->qdisc_sleeping = &noop_qdisc;
5570 rcu_assign_pointer(dev->ingress_queue, queue);
5571#endif
5572 return queue;
David S. Millerbb949fb2008-07-08 16:55:56 -07005573}
5574
Eric Dumazet2c60db02012-09-16 09:17:26 +00005575static const struct ethtool_ops default_ethtool_ops;
5576
Stanislaw Gruszkad07d7502013-01-10 23:19:10 +00005577void netdev_set_default_ethtool_ops(struct net_device *dev,
5578 const struct ethtool_ops *ops)
5579{
5580 if (dev->ethtool_ops == &default_ethtool_ops)
5581 dev->ethtool_ops = ops;
5582}
5583EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5584
Linus Torvalds1da177e2005-04-16 15:20:36 -07005585/**
Tom Herbert36909ea2011-01-09 19:36:31 +00005586 * alloc_netdev_mqs - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005587 * @sizeof_priv: size of private data to allocate space for
5588 * @name: device name format string
5589 * @setup: callback to initialize device
Tom Herbert36909ea2011-01-09 19:36:31 +00005590 * @txqs: the number of TX subqueues to allocate
5591 * @rxqs: the number of RX subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005592 *
5593 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005594 * and performs basic initialization. Also allocates subqueue structs
Tom Herbert36909ea2011-01-09 19:36:31 +00005595 * for each queue on the device.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005596 */
Tom Herbert36909ea2011-01-09 19:36:31 +00005597struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5598 void (*setup)(struct net_device *),
5599 unsigned int txqs, unsigned int rxqs)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005600{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005601 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07005602 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005603 struct net_device *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005604
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005605 BUG_ON(strlen(name) >= sizeof(dev->name));
5606
Tom Herbert36909ea2011-01-09 19:36:31 +00005607 if (txqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005608 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
Tom Herbert55513fb2010-10-18 17:55:58 +00005609 return NULL;
5610 }
5611
Tom Herbert36909ea2011-01-09 19:36:31 +00005612#ifdef CONFIG_RPS
5613 if (rxqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005614 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
Tom Herbert36909ea2011-01-09 19:36:31 +00005615 return NULL;
5616 }
5617#endif
5618
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005619 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005620 if (sizeof_priv) {
5621 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005622 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005623 alloc_size += sizeof_priv;
5624 }
5625 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005626 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005627
Paolo 'Blaisorblade' Giarrusso31380de2006-04-06 22:38:28 -07005628 p = kzalloc(alloc_size, GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005629 if (!p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005630 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005631
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005632 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005633 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005634
Eric Dumazet29b44332010-10-11 10:22:12 +00005635 dev->pcpu_refcnt = alloc_percpu(int);
5636 if (!dev->pcpu_refcnt)
Tom Herberte6484932010-10-18 18:04:39 +00005637 goto free_p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005638
Linus Torvalds1da177e2005-04-16 15:20:36 -07005639 if (dev_addr_init(dev))
Eric Dumazet29b44332010-10-11 10:22:12 +00005640 goto free_pcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005641
Jiri Pirko22bedad32010-04-01 21:22:57 +00005642 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005643 dev_uc_init(dev);
Jiri Pirkoccffad252009-05-22 23:22:17 +00005644
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005645 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005646
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07005647 dev->gso_max_size = GSO_MAX_SIZE;
Ben Hutchings30b678d2012-07-30 15:57:00 +00005648 dev->gso_max_segs = GSO_MAX_SEGS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005649
Herbert Xud565b0a2008-12-15 23:38:52 -08005650 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005651 INIT_LIST_HEAD(&dev->unreg_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00005652 INIT_LIST_HEAD(&dev->link_watch_list);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005653 INIT_LIST_HEAD(&dev->upper_dev_list);
Eric Dumazet93f154b2009-05-18 22:19:19 -07005654 dev->priv_flags = IFF_XMIT_DST_RELEASE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005655 setup(dev);
David S. Miller8d3bdbd2011-02-08 15:02:50 -08005656
5657 dev->num_tx_queues = txqs;
5658 dev->real_num_tx_queues = txqs;
5659 if (netif_alloc_netdev_queues(dev))
5660 goto free_all;
5661
5662#ifdef CONFIG_RPS
5663 dev->num_rx_queues = rxqs;
5664 dev->real_num_rx_queues = rxqs;
5665 if (netif_alloc_rx_queues(dev))
5666 goto free_all;
5667#endif
5668
Linus Torvalds1da177e2005-04-16 15:20:36 -07005669 strcpy(dev->name, name);
Vlad Dogarucbda10f2011-01-13 23:38:30 +00005670 dev->group = INIT_NETDEV_GROUP;
Eric Dumazet2c60db02012-09-16 09:17:26 +00005671 if (!dev->ethtool_ops)
5672 dev->ethtool_ops = &default_ethtool_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005673 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005674
David S. Miller8d3bdbd2011-02-08 15:02:50 -08005675free_all:
5676 free_netdev(dev);
5677 return NULL;
5678
Eric Dumazet29b44332010-10-11 10:22:12 +00005679free_pcpu:
5680 free_percpu(dev->pcpu_refcnt);
Tom Herberted9af2e2010-11-09 10:47:30 +00005681 kfree(dev->_tx);
Tom Herbertfe822242010-11-09 10:47:38 +00005682#ifdef CONFIG_RPS
5683 kfree(dev->_rx);
5684#endif
5685
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005686free_p:
5687 kfree(p);
5688 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005689}
Tom Herbert36909ea2011-01-09 19:36:31 +00005690EXPORT_SYMBOL(alloc_netdev_mqs);
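
/* Usage sketch (illustration only): allocating a device with four TX and four
 * RX queues. "myeth_setup" and "struct myeth_priv" are hypothetical;
 * single-queue callers normally use the alloc_netdev()/alloc_etherdev()
 * wrappers built on top of this function.
 *
 *	dev = alloc_netdev_mqs(sizeof(struct myeth_priv), "eth%d",
 *			       myeth_setup, 4, 4);
 *	if (!dev)
 *		return -ENOMEM;
 */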
Linus Torvalds1da177e2005-04-16 15:20:36 -07005691
5692/**
5693 * free_netdev - free network device
5694 * @dev: device
5695 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005696 * This function does the last stage of destroying an allocated device
5697 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005698 * If this is the last reference then it will be freed.
5699 */
5700void free_netdev(struct net_device *dev)
5701{
Herbert Xud565b0a2008-12-15 23:38:52 -08005702 struct napi_struct *p, *n;
5703
Denis V. Lunevf3005d72008-04-16 02:02:18 -07005704 release_net(dev_net(dev));
5705
David S. Millere8a04642008-07-17 00:34:19 -07005706 kfree(dev->_tx);
Tom Herbertfe822242010-11-09 10:47:38 +00005707#ifdef CONFIG_RPS
5708 kfree(dev->_rx);
5709#endif
David S. Millere8a04642008-07-17 00:34:19 -07005710
Eric Dumazet33d480c2011-08-11 19:30:52 +00005711 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
Eric Dumazet24824a02010-10-02 06:11:55 +00005712
Jiri Pirkof001fde2009-05-05 02:48:28 +00005713 /* Flush device addresses */
5714 dev_addr_flush(dev);
5715
Herbert Xud565b0a2008-12-15 23:38:52 -08005716 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5717 netif_napi_del(p);
5718
Eric Dumazet29b44332010-10-11 10:22:12 +00005719 free_percpu(dev->pcpu_refcnt);
5720 dev->pcpu_refcnt = NULL;
5721
Stephen Hemminger3041a062006-05-26 13:25:24 -07005722 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005723 if (dev->reg_state == NETREG_UNINITIALIZED) {
5724 kfree((char *)dev - dev->padded);
5725 return;
5726 }
5727
5728 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5729 dev->reg_state = NETREG_RELEASED;
5730
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07005731 /* will free via device release */
5732 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005733}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005734EXPORT_SYMBOL(free_netdev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005735
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005736/**
5737 * synchronize_net - Synchronize with packet receive processing
5738 *
5739 * Wait for packets currently being received to be done.
5740 * Does not block later packets from starting.
5741 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005742void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005743{
5744 might_sleep();
Eric Dumazetbe3fc412011-05-23 23:07:32 +00005745 if (rtnl_is_locked())
5746 synchronize_rcu_expedited();
5747 else
5748 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005749}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005750EXPORT_SYMBOL(synchronize_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005751
5752/**
Eric Dumazet44a08732009-10-27 07:03:04 +00005753 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07005754 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00005755 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08005756 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07005757 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005758 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00005759 * If @head is not NULL, the device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005760 *
5761 * Callers must hold the rtnl semaphore. You may want
5762 * unregister_netdev() instead of this.
5763 */
5764
Eric Dumazet44a08732009-10-27 07:03:04 +00005765void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005766{
Herbert Xua6620712007-12-12 19:21:56 -08005767 ASSERT_RTNL();
5768
Eric Dumazet44a08732009-10-27 07:03:04 +00005769 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005770 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00005771 } else {
5772 rollback_registered(dev);
5773 /* Finish processing unregister after unlock */
5774 net_set_todo(dev);
5775 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005776}
Eric Dumazet44a08732009-10-27 07:03:04 +00005777EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005778
5779/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005780 * unregister_netdevice_many - unregister many devices
5781 * @head: list of devices
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005782 */
5783void unregister_netdevice_many(struct list_head *head)
5784{
5785 struct net_device *dev;
5786
5787 if (!list_empty(head)) {
5788 rollback_registered_many(head);
5789 list_for_each_entry(dev, head, unreg_list)
5790 net_set_todo(dev);
5791 }
5792}
Eric Dumazet63c80992009-10-27 07:06:49 +00005793EXPORT_SYMBOL(unregister_netdevice_many);
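
/* Usage sketch (illustration only): batching several unregisters so that the
 * notifier and RCU grace periods are shared, as the rtnl_link dellink path
 * does. dev1/dev2 stand for already-registered devices.
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */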
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005794
5795/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005796 * unregister_netdev - remove device from the kernel
5797 * @dev: device
5798 *
5799 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005800 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005801 *
5802 * This is just a wrapper for unregister_netdevice that takes
5803 * the rtnl semaphore. In general you want to use this and not
5804 * unregister_netdevice.
5805 */
5806void unregister_netdev(struct net_device *dev)
5807{
5808 rtnl_lock();
5809 unregister_netdevice(dev);
5810 rtnl_unlock();
5811}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005812EXPORT_SYMBOL(unregister_netdev);
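
/* Usage sketch (illustration only): the usual driver remove path paired with
 * register_netdev(). "myeth_remove" is a hypothetical PCI remove callback.
 *
 *	static void myeth_remove(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		unregister_netdev(dev);		// takes the RTNL lock itself
 *		free_netdev(dev);
 *	}
 */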
5813
Eric W. Biedermance286d32007-09-12 13:53:49 +02005814/**
5815 * dev_change_net_namespace - move device to a different network namespace
5816 * @dev: device
5817 * @net: network namespace
5818 * @pat: If not NULL name pattern to try if the current device name
5819 * is already taken in the destination network namespace.
5820 *
5821 * This function shuts down a device interface and moves it
5822 * to a new network namespace. On success 0 is returned, on
5823 * a failure a negative errno code is returned.
5824 *
5825 * Callers must hold the rtnl semaphore.
5826 */
5827
5828int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5829{
Eric W. Biedermance286d32007-09-12 13:53:49 +02005830 int err;
5831
5832 ASSERT_RTNL();
5833
5834 /* Don't allow namespace local devices to be moved. */
5835 err = -EINVAL;
5836 if (dev->features & NETIF_F_NETNS_LOCAL)
5837 goto out;
5838
5839	/* Ensure the device has been registered */
Eric W. Biedermance286d32007-09-12 13:53:49 +02005840 if (dev->reg_state != NETREG_REGISTERED)
5841 goto out;
5842
5843	/* Get out if there is nothing to do */
5844 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09005845 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005846 goto out;
5847
5848 /* Pick the destination device name, and ensure
5849 * we can use it in the destination network namespace.
5850 */
5851 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00005852 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005853 /* We get here if we can't use the current device name */
5854 if (!pat)
5855 goto out;
Gao feng828de4f2012-09-13 20:58:27 +00005856 if (dev_get_valid_name(net, dev, pat) < 0)
Eric W. Biedermance286d32007-09-12 13:53:49 +02005857 goto out;
5858 }
5859
5860 /*
5861 * And now a mini version of register_netdevice unregister_netdevice.
5862 */
5863
5864 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07005865 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005866
5867 /* And unlink it from device chain */
5868 err = -ENODEV;
5869 unlist_netdevice(dev);
5870
5871 synchronize_net();
5872
5873 /* Shutdown queueing discipline. */
5874 dev_shutdown(dev);
5875
5876	/* Notify protocols that we are about to destroy
5877 this device. They should clean all the things.
David Lamparter3b27e102010-09-17 03:22:19 +00005878
5879 Note that dev->reg_state stays at NETREG_REGISTERED.
5880	   This is intentional: this way 8021q and macvlan know
5881 the device is just moving and can keep their slaves up.
Eric W. Biedermance286d32007-09-12 13:53:49 +02005882 */
5883 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Gao feng6549dd42012-08-23 15:36:55 +00005884 rcu_barrier();
5885 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric W. Biedermand2237d32011-10-21 06:24:20 +00005886 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005887
5888 /*
5889 * Flush the unicast and multicast chains
5890 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005891 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005892 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005893
Serge Hallyn4e66ae22012-12-03 16:17:12 +00005894 /* Send a netdev-removed uevent to the old namespace */
5895 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
5896
Eric W. Biedermance286d32007-09-12 13:53:49 +02005897 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005898 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005899
Eric W. Biedermance286d32007-09-12 13:53:49 +02005900 /* If there is an ifindex conflict assign a new one */
5901 if (__dev_get_by_index(net, dev->ifindex)) {
5902 int iflink = (dev->iflink == dev->ifindex);
5903 dev->ifindex = dev_new_index(net);
5904 if (iflink)
5905 dev->iflink = dev->ifindex;
5906 }
5907
Serge Hallyn4e66ae22012-12-03 16:17:12 +00005908 /* Send a netdev-add uevent to the new namespace */
5909 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
5910
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005911 /* Fixup kobjects */
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07005912 err = device_rename(&dev->dev, dev->name);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005913 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005914
5915 /* Add the device back in the hashes */
5916 list_netdevice(dev);
5917
5918	/* Notify protocols that a new device appeared. */
5919 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5920
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005921 /*
5922 * Prevent userspace races by waiting until the network
5923 * device is fully setup before sending notifications.
5924 */
5925 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5926
Eric W. Biedermance286d32007-09-12 13:53:49 +02005927 synchronize_net();
5928 err = 0;
5929out:
5930 return err;
5931}
Johannes Berg463d0182009-07-14 00:33:35 +02005932EXPORT_SYMBOL_GPL(dev_change_net_namespace);
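
/* Usage sketch (illustration only): moving a device into another namespace
 * under RTNL, as the RTM_NEWLINK netns-change path does. "target_net" is
 * assumed to be a struct net reference already held by the caller; "eth%d"
 * is only used if the current name clashes in the destination namespace.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "eth%d");
 *	rtnl_unlock();
 */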
Eric W. Biedermance286d32007-09-12 13:53:49 +02005933
Linus Torvalds1da177e2005-04-16 15:20:36 -07005934static int dev_cpu_callback(struct notifier_block *nfb,
5935 unsigned long action,
5936 void *ocpu)
5937{
5938 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005939 struct sk_buff *skb;
5940 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5941 struct softnet_data *sd, *oldsd;
5942
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005943 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005944 return NOTIFY_OK;
5945
5946 local_irq_disable();
5947 cpu = smp_processor_id();
5948 sd = &per_cpu(softnet_data, cpu);
5949 oldsd = &per_cpu(softnet_data, oldcpu);
5950
5951 /* Find end of our completion_queue. */
5952 list_skb = &sd->completion_queue;
5953 while (*list_skb)
5954 list_skb = &(*list_skb)->next;
5955 /* Append completion queue from offline CPU. */
5956 *list_skb = oldsd->completion_queue;
5957 oldsd->completion_queue = NULL;
5958
Linus Torvalds1da177e2005-04-16 15:20:36 -07005959 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00005960 if (oldsd->output_queue) {
5961 *sd->output_queue_tailp = oldsd->output_queue;
5962 sd->output_queue_tailp = oldsd->output_queue_tailp;
5963 oldsd->output_queue = NULL;
5964 oldsd->output_queue_tailp = &oldsd->output_queue;
5965 }
Heiko Carstens264524d2011-06-06 20:50:03 +00005966 /* Append NAPI poll list from offline CPU. */
5967 if (!list_empty(&oldsd->poll_list)) {
5968 list_splice_init(&oldsd->poll_list, &sd->poll_list);
5969 raise_softirq_irqoff(NET_RX_SOFTIRQ);
5970 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005971
5972 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5973 local_irq_enable();
5974
5975 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00005976 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5977 netif_rx(skb);
5978 input_queue_head_incr(oldsd);
5979 }
Tom Herbertfec5e652010-04-16 16:01:27 -07005980 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005981 netif_rx(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00005982 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07005983 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005984
5985 return NOTIFY_OK;
5986}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005987
5988
Herbert Xu7f353bf2007-08-10 15:47:58 -07005989/**
Herbert Xub63365a2008-10-23 01:11:29 -07005990 * netdev_increment_features - increment feature set by one
5991 * @all: current feature set
5992 * @one: new feature set
5993 * @mask: mask feature set
Herbert Xu7f353bf2007-08-10 15:47:58 -07005994 *
5995 * Computes a new feature set after adding a device with feature set
Herbert Xub63365a2008-10-23 01:11:29 -07005996 * @one to the master device with current feature set @all. Will not
5997 * enable anything that is off in @mask. Returns the new feature set.
Herbert Xu7f353bf2007-08-10 15:47:58 -07005998 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005999netdev_features_t netdev_increment_features(netdev_features_t all,
6000 netdev_features_t one, netdev_features_t mask)
Herbert Xu7f353bf2007-08-10 15:47:58 -07006001{
Michał Mirosław1742f182011-04-22 06:31:16 +00006002 if (mask & NETIF_F_GEN_CSUM)
6003 mask |= NETIF_F_ALL_CSUM;
6004 mask |= NETIF_F_VLAN_CHALLENGED;
6005
6006 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6007 all &= one | ~NETIF_F_ALL_FOR_ALL;
6008
Michał Mirosław1742f182011-04-22 06:31:16 +00006009 /* If one device supports hw checksumming, set for all. */
6010 if (all & NETIF_F_GEN_CSUM)
6011 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
Herbert Xu7f353bf2007-08-10 15:47:58 -07006012
6013 return all;
6014}
Herbert Xub63365a2008-10-23 01:11:29 -07006015EXPORT_SYMBOL(netdev_increment_features);
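
/* Usage sketch (illustration only): a master device folding its slaves'
 * feature sets together, in the style of bonding/bridge. The slave list,
 * "struct my_slave" and "priv" are hypothetical driver-private names.
 *
 *	struct my_slave *slave;
 *	netdev_features_t mask = master->features;
 *	netdev_features_t all = (mask & ~NETIF_F_ONE_FOR_ALL) |
 *				NETIF_F_ALL_FOR_ALL;
 *
 *	list_for_each_entry(slave, &priv->slaves, list)
 *		all = netdev_increment_features(all,
 *						slave->dev->features, mask);
 *	// "all" is then fed back to the master, e.g. from ndo_fix_features()
 */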
Herbert Xu7f353bf2007-08-10 15:47:58 -07006016
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006017static struct hlist_head *netdev_create_hash(void)
6018{
6019 int i;
6020 struct hlist_head *hash;
6021
6022 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6023 if (hash != NULL)
6024 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6025 INIT_HLIST_HEAD(&hash[i]);
6026
6027 return hash;
6028}
6029
Eric W. Biederman881d9662007-09-17 11:56:21 -07006030/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07006031static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006032{
Rustad, Mark D734b6542012-07-18 09:06:07 +00006033 if (net != &init_net)
6034 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07006035
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006036 net->dev_name_head = netdev_create_hash();
6037 if (net->dev_name_head == NULL)
6038 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006039
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006040 net->dev_index_head = netdev_create_hash();
6041 if (net->dev_index_head == NULL)
6042 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006043
6044 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006045
6046err_idx:
6047 kfree(net->dev_name_head);
6048err_name:
6049 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006050}
6051
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006052/**
6053 * netdev_drivername - network driver for the device
6054 * @dev: network device
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006055 *
6056 * Determine network driver for device.
6057 */
David S. Miller3019de12011-06-06 16:41:33 -07006058const char *netdev_drivername(const struct net_device *dev)
Arjan van de Ven6579e572008-07-21 13:31:48 -07006059{
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07006060 const struct device_driver *driver;
6061 const struct device *parent;
David S. Miller3019de12011-06-06 16:41:33 -07006062 const char *empty = "";
Arjan van de Ven6579e572008-07-21 13:31:48 -07006063
6064 parent = dev->dev.parent;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006065 if (!parent)
David S. Miller3019de12011-06-06 16:41:33 -07006066 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006067
6068 driver = parent->driver;
6069 if (driver && driver->name)
David S. Miller3019de12011-06-06 16:41:33 -07006070 return driver->name;
6071 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006072}
6073
Joe Perchesb004ff42012-09-12 20:12:19 -07006074static int __netdev_printk(const char *level, const struct net_device *dev,
Joe Perches256df2f2010-06-27 01:02:35 +00006075 struct va_format *vaf)
6076{
6077 int r;
6078
Joe Perchesb004ff42012-09-12 20:12:19 -07006079 if (dev && dev->dev.parent) {
Joe Perches666f3552012-09-12 20:14:11 -07006080 r = dev_printk_emit(level[1] - '0',
6081 dev->dev.parent,
6082 "%s %s %s: %pV",
6083 dev_driver_string(dev->dev.parent),
6084 dev_name(dev->dev.parent),
6085 netdev_name(dev), vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006086 } else if (dev) {
Joe Perches256df2f2010-06-27 01:02:35 +00006087 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006088 } else {
Joe Perches256df2f2010-06-27 01:02:35 +00006089 r = printk("%s(NULL net_device): %pV", level, vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006090 }
Joe Perches256df2f2010-06-27 01:02:35 +00006091
6092 return r;
6093}
6094
6095int netdev_printk(const char *level, const struct net_device *dev,
6096 const char *format, ...)
6097{
6098 struct va_format vaf;
6099 va_list args;
6100 int r;
6101
6102 va_start(args, format);
6103
6104 vaf.fmt = format;
6105 vaf.va = &args;
6106
6107 r = __netdev_printk(level, dev, &vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006108
Joe Perches256df2f2010-06-27 01:02:35 +00006109 va_end(args);
6110
6111 return r;
6112}
6113EXPORT_SYMBOL(netdev_printk);
6114
6115#define define_netdev_printk_level(func, level) \
6116int func(const struct net_device *dev, const char *fmt, ...) \
6117{ \
6118 int r; \
6119 struct va_format vaf; \
6120 va_list args; \
6121 \
6122 va_start(args, fmt); \
6123 \
6124 vaf.fmt = fmt; \
6125 vaf.va = &args; \
6126 \
6127 r = __netdev_printk(level, dev, &vaf); \
Joe Perchesb004ff42012-09-12 20:12:19 -07006128 \
Joe Perches256df2f2010-06-27 01:02:35 +00006129 va_end(args); \
6130 \
6131 return r; \
6132} \
6133EXPORT_SYMBOL(func);
6134
6135define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6136define_netdev_printk_level(netdev_alert, KERN_ALERT);
6137define_netdev_printk_level(netdev_crit, KERN_CRIT);
6138define_netdev_printk_level(netdev_err, KERN_ERR);
6139define_netdev_printk_level(netdev_warn, KERN_WARNING);
6140define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6141define_netdev_printk_level(netdev_info, KERN_INFO);
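
/* Usage sketch (illustration only): the per-level helpers generated above
 * take a net_device plus a printf-style format and prefix the message with
 * the driver and device names; "speed" and the queue index are hypothetical.
 *
 *	netdev_info(dev, "link up, %u Mb/s\n", speed);
 *	netdev_warn(dev, "TX timeout on queue %d\n", 0);
 */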
6142
Pavel Emelyanov46650792007-10-08 20:38:39 -07006143static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006144{
6145 kfree(net->dev_name_head);
6146 kfree(net->dev_index_head);
6147}
6148
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006149static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07006150 .init = netdev_init,
6151 .exit = netdev_exit,
6152};
6153
Pavel Emelyanov46650792007-10-08 20:38:39 -07006154static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02006155{
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006156 struct net_device *dev, *aux;
Eric W. Biedermance286d32007-09-12 13:53:49 +02006157 /*
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006158 * Push all migratable network devices back to the
Eric W. Biedermance286d32007-09-12 13:53:49 +02006159 * initial network namespace
6160 */
6161 rtnl_lock();
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006162 for_each_netdev_safe(net, dev, aux) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006163 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006164 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02006165
6166 /* Ignore unmoveable devices (i.e. loopback) */
6167 if (dev->features & NETIF_F_NETNS_LOCAL)
6168 continue;
6169
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006170 /* Leave virtual devices for the generic cleanup */
6171 if (dev->rtnl_link_ops)
6172 continue;
Eric W. Biedermand0c082c2008-11-05 15:59:38 -08006173
Lucas De Marchi25985ed2011-03-30 22:57:33 -03006174 /* Push remaining network devices to init_net */
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006175 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6176 err = dev_change_net_namespace(dev, &init_net, fb_name);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006177 if (err) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00006178 pr_emerg("%s: failed to move %s to init_net: %d\n",
6179 __func__, dev->name, err);
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006180 BUG();
Eric W. Biedermance286d32007-09-12 13:53:49 +02006181 }
6182 }
6183 rtnl_unlock();
6184}
6185
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006186static void __net_exit default_device_exit_batch(struct list_head *net_list)
6187{
6188 /* At exit all network devices most be removed from a network
Uwe Kleine-Königb5950762010-11-01 15:38:34 -04006189 * namespace. Do this in the reverse order of registration.
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006190 * Do this across as many network namespaces as possible to
6191 * improve batching efficiency.
6192 */
6193 struct net_device *dev;
6194 struct net *net;
6195 LIST_HEAD(dev_kill_list);
6196
6197 rtnl_lock();
6198 list_for_each_entry(net, net_list, exit_list) {
6199 for_each_netdev_reverse(net, dev) {
6200 if (dev->rtnl_link_ops)
6201 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6202 else
6203 unregister_netdevice_queue(dev, &dev_kill_list);
6204 }
6205 }
6206 unregister_netdevice_many(&dev_kill_list);
Eric Dumazetceaaec92011-02-17 22:59:19 +00006207 list_del(&dev_kill_list);
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006208 rtnl_unlock();
6209}
6210
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006211static struct pernet_operations __net_initdata default_device_ops = {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006212 .exit = default_device_exit,
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006213 .exit_batch = default_device_exit_batch,
Eric W. Biedermance286d32007-09-12 13:53:49 +02006214};
6215
Linus Torvalds1da177e2005-04-16 15:20:36 -07006216/*
6217 * Initialize the DEV module. At boot time this walks the device list and
6218 * unhooks any devices that fail to initialise (normally hardware not
6219 * present) and leaves us with a valid list of present and active devices.
6220 *
6221 */
6222
6223/*
6224 * This is called single threaded during boot, so no need
6225 * to take the rtnl semaphore.
6226 */
6227static int __init net_dev_init(void)
6228{
6229 int i, rc = -ENOMEM;
6230
6231 BUG_ON(!dev_boot_phase);
6232
Linus Torvalds1da177e2005-04-16 15:20:36 -07006233 if (dev_proc_init())
6234 goto out;
6235
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006236 if (netdev_kobject_init())
Linus Torvalds1da177e2005-04-16 15:20:36 -07006237 goto out;
6238
6239 INIT_LIST_HEAD(&ptype_all);
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08006240 for (i = 0; i < PTYPE_HASH_SIZE; i++)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006241 INIT_LIST_HEAD(&ptype_base[i]);
6242
Vlad Yasevich62532da2012-11-15 08:49:10 +00006243 INIT_LIST_HEAD(&offload_base);
6244
Eric W. Biederman881d9662007-09-17 11:56:21 -07006245 if (register_pernet_subsys(&netdev_net_ops))
6246 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006247
6248 /*
6249 * Initialise the packet receive queues.
6250 */
6251
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07006252 for_each_possible_cpu(i) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006253 struct softnet_data *sd = &per_cpu(softnet_data, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006254
Changli Gaodee42872010-05-02 05:42:16 +00006255 memset(sd, 0, sizeof(*sd));
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006256 skb_queue_head_init(&sd->input_pkt_queue);
Changli Gao6e7676c2010-04-27 15:07:33 -07006257 skb_queue_head_init(&sd->process_queue);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006258 sd->completion_queue = NULL;
6259 INIT_LIST_HEAD(&sd->poll_list);
Changli Gaoa9cbd582010-04-26 23:06:24 +00006260 sd->output_queue = NULL;
6261 sd->output_queue_tailp = &sd->output_queue;
Eric Dumazetdf334542010-03-24 19:13:54 +00006262#ifdef CONFIG_RPS
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006263 sd->csd.func = rps_trigger_softirq;
6264 sd->csd.info = sd;
6265 sd->csd.flags = 0;
6266 sd->cpu = i;
Tom Herbert1e94d722010-03-18 17:45:44 -07006267#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00006268
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006269 sd->backlog.poll = process_backlog;
6270 sd->backlog.weight = weight_p;
6271 sd->backlog.gro_list = NULL;
6272 sd->backlog.gro_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006273 }
6274
Linus Torvalds1da177e2005-04-16 15:20:36 -07006275 dev_boot_phase = 0;
6276
Eric W. Biederman505d4f72008-11-07 22:54:20 -08006277 /* The loopback device is special if any other network devices
6278 * is present in a network namespace the loopback device must
6279 * be present. Since we now dynamically allocate and free the
6280 * loopback device ensure this invariant is maintained by
6281 * keeping the loopback device as the first device on the
6282 * list of network devices. Ensuring the loopback devices
6283 * is the first device that appears and the last network device
6284 * that disappears.
6285 */
6286 if (register_pernet_device(&loopback_net_ops))
6287 goto out;
6288
6289 if (register_pernet_device(&default_device_ops))
6290 goto out;
6291
Carlos R. Mafra962cf362008-05-15 11:15:37 -03006292 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6293 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006294
6295 hotcpu_notifier(dev_cpu_callback, 0);
6296 dst_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006297 rc = 0;
6298out:
6299 return rc;
6300}
6301
6302subsys_initcall(net_dev_init);