/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *	Authors:	Ross Biro
 *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *			Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain :	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *	       sure which should go first, but I bet it won't make much
 *	       difference if we are running VLANs.  The good news is that
 *	       this protocol won't be in the list unless compiled in, so
 *	       the average user (w/out VLANs) will not be adversely affected.
 *							--BLG
 *
 *		0800	IP
 *		8100	802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

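/*
 * Illustrative reader pattern (a sketch, not part of the original file),
 * following the locking rules described above; do_something() is a
 * hypothetical placeholder:
 *
 *	read_lock(&dev_base_lock);
 *	for_each_netdev(net, dev)
 *		do_something(dev);
 *	read_unlock(&dev_base_lock);
 *
 * Pure readers may instead use an rcu_read_lock() section together with
 * for_each_netdev_rcu().
 */
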
seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);

	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, and therefore cannot guarantee that all
 *	CPUs that are in the middle of receiving packets will see the new
 *	packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

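/*
 * Usage sketch (illustrative only, not part of the original file): a module
 * that wants to tap every incoming frame typically embeds a packet_type and
 * registers it with dev_add_pack().  my_tap_rcv() and my_tap are hypothetical
 * names; the handler is expected to consume or free the skb it is given.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type	= htons(ETH_P_ALL),
 *		.func	= my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);
 *	...
 *	dev_remove_pack(&my_tap);
 */
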
/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep, and therefore cannot guarantee that all
 *	CPUs that are in the middle of receiving packets will see the new
 *	offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;

	spin_lock(&offload_lock);
	list_add_rcu(&po->list, head);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(__dev_remove_offload);

/**
 *	dev_remove_offload - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

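/*
 * Illustrative example (an assumption derived from the parsing above, not
 * taken verbatim from this file): a boot command line such as
 *
 *	netdev=5,0x300,0,0,eth0
 *
 * is parsed by netdev_boot_setup() into irq 5 and base_addr 0x300 for "eth0",
 * and is later picked up by netdev_boot_setup_check() when the device probes.
 */
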
/*******************************************************************************

			Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

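/*
 * Reference-handling sketch (illustrative only, not part of the original
 * file): a caller of dev_get_by_name() owns a reference on the returned
 * device and must drop it with dev_put(); the _rcu variant is only valid
 * inside an rcu_read_lock() section.  "eth0" is an arbitrary example name.
 *
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		...use dev...
 *		dev_put(dev);
 *	}
 */
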
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
				    unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

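/*
 * Illustrative examples (not part of the original file): dev_valid_name()
 * accepts names such as "eth0" or the format string "bond%d", but rejects
 * "", ".", "..", names containing '/' or whitespace, and names that are
 * IFNAMSIZ characters or longer.
 */
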
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

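/*
 * Illustrative example (not part of the original file): with eth0 and eth1
 * already registered in the device's namespace, dev_alloc_name(dev, "eth%d")
 * writes "eth2" into dev->name and returns 2; a negative errno is returned
 * when no unit number is free or the format string is malformed.
 */
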
static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

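/*
 * Illustrative example (not part of the original file): renaming is done
 * under RTNL; "lan0" is an arbitrary placeholder, and a format string such
 * as "eth%d" may be passed instead to let the kernel pick a free unit.
 *
 *	rtnl_lock();
 *	err = dev_change_name(dev, "lan0");
 *	rtnl_unlock();
 */
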
/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 *	Generate traffic such that interested network peers are aware of
 *	@dev, such as by generating a gratuitous ARP. This may be used when
 *	a device wants to inform the rest of the network about some sort of
 *	reconfiguration such as a failover event or virtual machine
 *	migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

Patrick McHardybd380812010-02-26 06:34:53 +00001227static int __dev_open(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001228{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001229 const struct net_device_ops *ops = dev->netdev_ops;
Johannes Berg3b8bcfd2009-05-30 01:39:53 +02001230 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001231
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001232 ASSERT_RTNL();
1233
Linus Torvalds1da177e2005-04-16 15:20:36 -07001234 if (!netif_device_present(dev))
1235 return -ENODEV;
1236
Neil Hormanca99ca12013-02-05 08:05:43 +00001237 /* Block netpoll from trying to do any rx path servicing.
1238 * If we don't do this there is a chance ndo_poll_controller
1239 * or ndo_poll may be running while we open the device
1240 */
1241 ret = netpoll_rx_disable(dev);
1242 if (ret)
1243 return ret;
1244
Johannes Berg3b8bcfd2009-05-30 01:39:53 +02001245 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1246 ret = notifier_to_errno(ret);
1247 if (ret)
1248 return ret;
1249
Linus Torvalds1da177e2005-04-16 15:20:36 -07001250 set_bit(__LINK_STATE_START, &dev->state);
Jeff Garzikbada3392007-10-23 20:19:37 -07001251
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001252 if (ops->ndo_validate_addr)
1253 ret = ops->ndo_validate_addr(dev);
Jeff Garzikbada3392007-10-23 20:19:37 -07001254
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001255 if (!ret && ops->ndo_open)
1256 ret = ops->ndo_open(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001257
Neil Hormanca99ca12013-02-05 08:05:43 +00001258 netpoll_rx_enable(dev);
1259
Jeff Garzikbada3392007-10-23 20:19:37 -07001260 if (ret)
1261 clear_bit(__LINK_STATE_START, &dev->state);
1262 else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001263 dev->flags |= IFF_UP;
David S. Millerb4bd07c2009-02-06 22:06:43 -08001264 net_dmaengine_get();
Patrick McHardy4417da62007-06-27 01:28:10 -07001265 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001266 dev_activate(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04001267 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001268 }
Jeff Garzikbada3392007-10-23 20:19:37 -07001269
Linus Torvalds1da177e2005-04-16 15:20:36 -07001270 return ret;
1271}
Patrick McHardybd380812010-02-26 06:34:53 +00001272
1273/**
1274 * dev_open - prepare an interface for use.
1275 * @dev: device to open
1276 *
1277 * Takes a device from down to up state. The device's private open
1278 * function is invoked and then the multicast lists are loaded. Finally
1279 * the device is moved into the up state and a %NETDEV_UP message is
1280 * sent to the netdev notifier chain.
1281 *
1282 * Calling this function on an active interface is a nop. On a failure
1283 * a negative errno code is returned.
1284 */
1285int dev_open(struct net_device *dev)
1286{
1287 int ret;
1288
Patrick McHardybd380812010-02-26 06:34:53 +00001289 if (dev->flags & IFF_UP)
1290 return 0;
1291
Patrick McHardybd380812010-02-26 06:34:53 +00001292 ret = __dev_open(dev);
1293 if (ret < 0)
1294 return ret;
1295
Patrick McHardybd380812010-02-26 06:34:53 +00001296 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1297 call_netdevice_notifiers(NETDEV_UP, dev);
1298
1299 return ret;
1300}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001301EXPORT_SYMBOL(dev_open);
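/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * file): administratively bouncing an interface from kernel code.
 * dev_open() and dev_close() must run under the RTNL lock; the name
 * example_bounce_interface is an assumption.
 */
static int example_bounce_interface(struct net_device *dev)
{
	int err;

	rtnl_lock();
	dev_close(dev);		/* no-op if the device is already down */
	err = dev_open(dev);	/* returns 0 if the device is already up */
	rtnl_unlock();

	return err;
}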
Linus Torvalds1da177e2005-04-16 15:20:36 -07001302
Octavian Purdila44345722010-12-13 12:44:07 +00001303static int __dev_close_many(struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001304{
Octavian Purdila44345722010-12-13 12:44:07 +00001305 struct net_device *dev;
Patrick McHardybd380812010-02-26 06:34:53 +00001306
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001307 ASSERT_RTNL();
David S. Miller9d5010d2007-09-12 14:33:25 +02001308 might_sleep();
1309
Octavian Purdila44345722010-12-13 12:44:07 +00001310 list_for_each_entry(dev, head, unreg_list) {
Octavian Purdila44345722010-12-13 12:44:07 +00001311 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001312
Octavian Purdila44345722010-12-13 12:44:07 +00001313 clear_bit(__LINK_STATE_START, &dev->state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001314
Octavian Purdila44345722010-12-13 12:44:07 +00001315 /* Synchronize to scheduled poll. We cannot touch the poll list; it
1316 * may even be on a different cpu. So just clear netif_running().
1317 *
1318 * dev->stop() will invoke napi_disable() on all of its
1319 * napi_struct instances on this device.
1320 */
1321 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1322 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001323
Octavian Purdila44345722010-12-13 12:44:07 +00001324 dev_deactivate_many(head);
1325
1326 list_for_each_entry(dev, head, unreg_list) {
1327 const struct net_device_ops *ops = dev->netdev_ops;
1328
1329 /*
1330 * Call the device-specific close. This cannot fail and
1331 * is only done if the device is UP.
1332 *
1333 * We allow it to be called even after a DETACH hot-plug
1334 * event.
1335 */
1336 if (ops->ndo_stop)
1337 ops->ndo_stop(dev);
1338
Octavian Purdila44345722010-12-13 12:44:07 +00001339 dev->flags &= ~IFF_UP;
Octavian Purdila44345722010-12-13 12:44:07 +00001340 net_dmaengine_put();
1341 }
1342
1343 return 0;
1344}
1345
1346static int __dev_close(struct net_device *dev)
1347{
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001348 int retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001349 LIST_HEAD(single);
1350
Neil Hormanca99ca12013-02-05 08:05:43 +00001351 /* Temporarily disable netpoll until the interface is down */
1352 retval = netpoll_rx_disable(dev);
1353 if (retval)
1354 return retval;
1355
Octavian Purdila44345722010-12-13 12:44:07 +00001356 list_add(&dev->unreg_list, &single);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001357 retval = __dev_close_many(&single);
1358 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001359
1360 netpoll_rx_enable(dev);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001361 return retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001362}
1363
Eric Dumazet3fbd8752011-01-19 21:23:22 +00001364static int dev_close_many(struct list_head *head)
Octavian Purdila44345722010-12-13 12:44:07 +00001365{
1366 struct net_device *dev, *tmp;
1367 LIST_HEAD(tmp_list);
1368
1369 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1370 if (!(dev->flags & IFF_UP))
1371 list_move(&dev->unreg_list, &tmp_list);
1372
1373 __dev_close_many(head);
Matti Linnanvuorid8b2a4d2008-02-12 23:10:11 -08001374
Octavian Purdila44345722010-12-13 12:44:07 +00001375 list_for_each_entry(dev, head, unreg_list) {
1376 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1377 call_netdevice_notifiers(NETDEV_DOWN, dev);
1378 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001379
Octavian Purdila44345722010-12-13 12:44:07 +00001380 /* rollback_registered_many needs the complete original list */
1381 list_splice(&tmp_list, head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001382 return 0;
1383}
Patrick McHardybd380812010-02-26 06:34:53 +00001384
1385/**
1386 * dev_close - shutdown an interface.
1387 * @dev: device to shutdown
1388 *
1389 * This function moves an active device into down state. A
1390 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1391 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1392 * chain.
1393 */
1394int dev_close(struct net_device *dev)
1395{
Neil Hormanca99ca12013-02-05 08:05:43 +00001396 int ret = 0;
Eric Dumazete14a5992011-05-10 12:26:06 -07001397 if (dev->flags & IFF_UP) {
1398 LIST_HEAD(single);
Patrick McHardybd380812010-02-26 06:34:53 +00001399
Neil Hormanca99ca12013-02-05 08:05:43 +00001400 /* Block netpoll rx while the interface is going down */
1401 ret = netpoll_rx_disable(dev);
1402 if (ret)
1403 return ret;
1404
Eric Dumazete14a5992011-05-10 12:26:06 -07001405 list_add(&dev->unreg_list, &single);
1406 dev_close_many(&single);
1407 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001408
1409 netpoll_rx_enable(dev);
Eric Dumazete14a5992011-05-10 12:26:06 -07001410 }
Neil Hormanca99ca12013-02-05 08:05:43 +00001411 return ret;
Patrick McHardybd380812010-02-26 06:34:53 +00001412}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001413EXPORT_SYMBOL(dev_close);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001414
1415
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001416/**
1417 * dev_disable_lro - disable Large Receive Offload on a device
1418 * @dev: device
1419 *
1420 * Disable Large Receive Offload (LRO) on a net device. Must be
1421 * called under RTNL. This is needed if received packets may be
1422 * forwarded to another interface.
1423 */
1424void dev_disable_lro(struct net_device *dev)
1425{
Neil Hormanf11970e2011-05-24 08:31:09 +00001426 /*
1427 * If we're trying to disable LRO on a vlan device,
1428 * use the underlying physical device instead.
1429 */
1430 if (is_vlan_dev(dev))
1431 dev = vlan_dev_real_dev(dev);
1432
Michał Mirosławbc5787c62011-11-15 15:29:55 +00001433 dev->wanted_features &= ~NETIF_F_LRO;
1434 netdev_update_features(dev);
Michał Mirosław27660512011-03-18 16:56:34 +00001435
Michał Mirosław22d59692011-04-21 12:42:15 +00001436 if (unlikely(dev->features & NETIF_F_LRO))
1437 netdev_WARN(dev, "failed to disable LRO!\n");
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001438}
1439EXPORT_SYMBOL(dev_disable_lro);
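/*
 * Illustrative sketch (hypothetical caller): a forwarding setup path that
 * disables LRO before frames received on this device may be forwarded.
 * The function name is an assumption; dev_disable_lro() itself must be
 * called with the RTNL lock held, as asserted below.
 */
static void example_prepare_for_forwarding(struct net_device *dev)
{
	ASSERT_RTNL();

	/* LRO-merged frames must not be forwarded to another interface */
	dev_disable_lro(dev);
}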
1440
1441
Eric W. Biederman881d9662007-09-17 11:56:21 -07001442static int dev_boot_phase = 1;
1443
Linus Torvalds1da177e2005-04-16 15:20:36 -07001444/**
1445 * register_netdevice_notifier - register a network notifier block
1446 * @nb: notifier
1447 *
1448 * Register a notifier to be called when network device events occur.
1449 * The notifier passed is linked into the kernel structures and must
1450 * not be reused until it has been unregistered. A negative errno code
1451 * is returned on a failure.
1452 *
1453 * When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001454 * to the new notifier to allow it to have a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001455 * view of the network device list.
1456 */
1457
1458int register_netdevice_notifier(struct notifier_block *nb)
1459{
1460 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001461 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001462 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001463 int err;
1464
1465 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001466 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001467 if (err)
1468 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001469 if (dev_boot_phase)
1470 goto unlock;
1471 for_each_net(net) {
1472 for_each_netdev(net, dev) {
1473 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1474 err = notifier_to_errno(err);
1475 if (err)
1476 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001477
Eric W. Biederman881d9662007-09-17 11:56:21 -07001478 if (!(dev->flags & IFF_UP))
1479 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001480
Eric W. Biederman881d9662007-09-17 11:56:21 -07001481 nb->notifier_call(nb, NETDEV_UP, dev);
1482 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001483 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001484
1485unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001486 rtnl_unlock();
1487 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001488
1489rollback:
1490 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001491 for_each_net(net) {
1492 for_each_netdev(net, dev) {
1493 if (dev == last)
RongQing.Li8f891482011-11-30 23:43:07 -05001494 goto outroll;
Herbert Xufcc5a032007-07-30 17:03:38 -07001495
Eric W. Biederman881d9662007-09-17 11:56:21 -07001496 if (dev->flags & IFF_UP) {
1497 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1498 nb->notifier_call(nb, NETDEV_DOWN, dev);
1499 }
1500 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001501 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001502 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001503
RongQing.Li8f891482011-11-30 23:43:07 -05001504outroll:
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001505 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001506 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001508EXPORT_SYMBOL(register_netdevice_notifier);
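/*
 * Illustrative sketch (hypothetical module code): registering a netdev
 * notifier and reacting to a couple of events.  In this kernel the
 * notifier's third argument is the struct net_device pointer itself.
 * The example_* names are assumptions for illustration only.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		pr_info("example: %s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("example: %s is about to go down\n", dev->name);
		break;
	default:
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

static int __init example_notifier_init(void)
{
	/* existing devices are replayed as NETDEV_REGISTER/NETDEV_UP */
	return register_netdevice_notifier(&example_netdev_nb);
}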
Linus Torvalds1da177e2005-04-16 15:20:36 -07001509
1510/**
1511 * unregister_netdevice_notifier - unregister a network notifier block
1512 * @nb: notifier
1513 *
1514 * Unregister a notifier previously registered by
1515 * register_netdevice_notifier(). The notifier is unlinked from the
1516 * kernel structures and may then be reused. A negative errno code
1517 * is returned on a failure.
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001518 *
1519 * After unregistering, unregister and down device events are synthesized
1520 * for all devices on the device list and sent to the removed notifier,
1521 * removing the need for special-case cleanup code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001522 */
1523
1524int unregister_netdevice_notifier(struct notifier_block *nb)
1525{
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001526 struct net_device *dev;
1527 struct net *net;
Herbert Xu9f514952006-03-25 01:24:25 -08001528 int err;
1529
1530 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001531 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001532 if (err)
1533 goto unlock;
1534
1535 for_each_net(net) {
1536 for_each_netdev(net, dev) {
1537 if (dev->flags & IFF_UP) {
1538 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1539 nb->notifier_call(nb, NETDEV_DOWN, dev);
1540 }
1541 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001542 }
1543 }
1544unlock:
Herbert Xu9f514952006-03-25 01:24:25 -08001545 rtnl_unlock();
1546 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001547}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001548EXPORT_SYMBOL(unregister_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549
1550/**
1551 * call_netdevice_notifiers - call all network notifier blocks
1552 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001553 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001554 *
1555 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001556 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001557 */
1558
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001559int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001560{
Jiri Pirkoab930472010-04-20 01:45:37 -07001561 ASSERT_RTNL();
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001562 return raw_notifier_call_chain(&netdev_chain, val, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001563}
stephen hemmingeredf947f2011-03-24 13:24:01 +00001564EXPORT_SYMBOL(call_netdevice_notifiers);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001565
Ingo Molnarc5905af2012-02-24 08:31:31 +01001566static struct static_key netstamp_needed __read_mostly;
Eric Dumazetb90e5792011-11-28 11:16:50 +00001567#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01001568/* We are not allowed to call static_key_slow_dec() from irq context
Eric Dumazetb90e5792011-11-28 11:16:50 +00001569 * If net_disable_timestamp() is called from irq context, defer the
Ingo Molnarc5905af2012-02-24 08:31:31 +01001570 * static_key_slow_dec() calls.
Eric Dumazetb90e5792011-11-28 11:16:50 +00001571 */
1572static atomic_t netstamp_needed_deferred;
1573#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001574
1575void net_enable_timestamp(void)
1576{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001577#ifdef HAVE_JUMP_LABEL
1578 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1579
1580 if (deferred) {
1581 while (--deferred)
Ingo Molnarc5905af2012-02-24 08:31:31 +01001582 static_key_slow_dec(&netstamp_needed);
Eric Dumazetb90e5792011-11-28 11:16:50 +00001583 return;
1584 }
1585#endif
1586 WARN_ON(in_interrupt());
Ingo Molnarc5905af2012-02-24 08:31:31 +01001587 static_key_slow_inc(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001588}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001589EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001590
1591void net_disable_timestamp(void)
1592{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001593#ifdef HAVE_JUMP_LABEL
1594 if (in_interrupt()) {
1595 atomic_inc(&netstamp_needed_deferred);
1596 return;
1597 }
1598#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001599 static_key_slow_dec(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001600}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001601EXPORT_SYMBOL(net_disable_timestamp);
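/*
 * Illustrative sketch (assumed consumer): a feature that needs software
 * packet timestamps takes a reference with net_enable_timestamp() and
 * drops it with net_disable_timestamp(); the calls must balance.  The
 * example_* names are hypothetical.
 */
static void example_timestamping_start(void)
{
	/* must not be called from interrupt context */
	net_enable_timestamp();
}

static void example_timestamping_stop(void)
{
	/* safe even from irq context: the decrement is deferred if needed */
	net_disable_timestamp();
}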
Linus Torvalds1da177e2005-04-16 15:20:36 -07001602
Eric Dumazet3b098e22010-05-15 23:57:10 -07001603static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001604{
Eric Dumazet588f0332011-11-15 04:12:55 +00001605 skb->tstamp.tv64 = 0;
Ingo Molnarc5905af2012-02-24 08:31:31 +01001606 if (static_key_false(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001607 __net_timestamp(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001608}
1609
Eric Dumazet588f0332011-11-15 04:12:55 +00001610#define net_timestamp_check(COND, SKB) \
Ingo Molnarc5905af2012-02-24 08:31:31 +01001611 if (static_key_false(&netstamp_needed)) { \
Eric Dumazet588f0332011-11-15 04:12:55 +00001612 if ((COND) && !(SKB)->tstamp.tv64) \
1613 __net_timestamp(SKB); \
1614 } \
Eric Dumazet3b098e22010-05-15 23:57:10 -07001615
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001616static inline bool is_skb_forwardable(struct net_device *dev,
1617 struct sk_buff *skb)
1618{
1619 unsigned int len;
1620
1621 if (!(dev->flags & IFF_UP))
1622 return false;
1623
1624 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1625 if (skb->len <= len)
1626 return true;
1627
1628 /* if TSO is enabled, we don't care about the length, as the packet
1629 * could be forwarded without being segmented beforehand
1630 */
1631 if (skb_is_gso(skb))
1632 return true;
1633
1634 return false;
1635}
1636
Arnd Bergmann44540962009-11-26 06:07:08 +00001637/**
1638 * dev_forward_skb - loopback an skb to another netif
1639 *
1640 * @dev: destination network device
1641 * @skb: buffer to forward
1642 *
1643 * return values:
1644 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001645 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001646 *
1647 * dev_forward_skb can be used for injecting an skb from the
1648 * start_xmit function of one device into the receive queue
1649 * of another device.
1650 *
1651 * The receiving device may be in another namespace, so
1652 * we have to clear all information in the skb that could
1653 * impact namespace isolation.
1654 */
1655int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1656{
Michael S. Tsirkin48c83012011-08-31 08:03:29 +00001657 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1658 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1659 atomic_long_inc(&dev->rx_dropped);
1660 kfree_skb(skb);
1661 return NET_RX_DROP;
1662 }
1663 }
1664
Arnd Bergmann44540962009-11-26 06:07:08 +00001665 skb_orphan(skb);
Ben Greearc736eef2010-07-22 09:54:47 +00001666 nf_reset(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001667
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001668 if (unlikely(!is_skb_forwardable(dev, skb))) {
Eric Dumazetcaf586e2010-09-30 21:06:55 +00001669 atomic_long_inc(&dev->rx_dropped);
Eric Dumazet6ec82562010-05-06 00:53:53 -07001670 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001671 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001672 }
Benjamin LaHaise3b9785c2012-03-27 15:55:44 +00001673 skb->skb_iif = 0;
David S. Miller59b99972012-05-10 23:03:34 -04001674 skb->dev = dev;
1675 skb_dst_drop(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001676 skb->tstamp.tv64 = 0;
1677 skb->pkt_type = PACKET_HOST;
1678 skb->protocol = eth_type_trans(skb, dev);
David S. Miller59b99972012-05-10 23:03:34 -04001679 skb->mark = 0;
1680 secpath_reset(skb);
1681 nf_reset(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001682 return netif_rx(skb);
1683}
1684EXPORT_SYMBOL_GPL(dev_forward_skb);
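/*
 * Illustrative sketch (hypothetical paired-device driver): an xmit path
 * that loops every frame to a peer device with dev_forward_skb().  The
 * example_pair_priv layout and the assumption that ->peer was set up when
 * the devices were created are illustrative only.
 */
struct example_pair_priv {
	struct net_device *peer;
};

static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct example_pair_priv *priv = netdev_priv(dev);

	if (!priv->peer) {
		kfree_skb(skb);
		return NETDEV_TX_OK;
	}

	/* dev_forward_skb() consumes the skb on both success and drop */
	dev_forward_skb(priv->peer, skb);
	return NETDEV_TX_OK;
}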
1685
Changli Gao71d9dec2010-12-15 19:57:25 +00001686static inline int deliver_skb(struct sk_buff *skb,
1687 struct packet_type *pt_prev,
1688 struct net_device *orig_dev)
1689{
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00001690 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1691 return -ENOMEM;
Changli Gao71d9dec2010-12-15 19:57:25 +00001692 atomic_inc(&skb->users);
1693 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1694}
1695
Eric Leblondc0de08d2012-08-16 22:02:58 +00001696static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1697{
Eric Leblonda3d744e2012-11-06 02:10:10 +00001698 if (!ptype->af_packet_priv || !skb->sk)
Eric Leblondc0de08d2012-08-16 22:02:58 +00001699 return false;
1700
1701 if (ptype->id_match)
1702 return ptype->id_match(ptype, skb->sk);
1703 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1704 return true;
1705
1706 return false;
1707}
1708
Linus Torvalds1da177e2005-04-16 15:20:36 -07001709/*
1710 * Support routine. Sends outgoing frames to any network
1711 * taps currently in use.
1712 */
1713
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001714static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001715{
1716 struct packet_type *ptype;
Changli Gao71d9dec2010-12-15 19:57:25 +00001717 struct sk_buff *skb2 = NULL;
1718 struct packet_type *pt_prev = NULL;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001719
Linus Torvalds1da177e2005-04-16 15:20:36 -07001720 rcu_read_lock();
1721 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1722 /* Never send packets back to the socket
1723 * they originated from - MvS (miquels@drinkel.ow.org)
1724 */
1725 if ((ptype->dev == dev || !ptype->dev) &&
Eric Leblondc0de08d2012-08-16 22:02:58 +00001726 (!skb_loop_sk(ptype, skb))) {
Changli Gao71d9dec2010-12-15 19:57:25 +00001727 if (pt_prev) {
1728 deliver_skb(skb2, pt_prev, skb->dev);
1729 pt_prev = ptype;
1730 continue;
1731 }
1732
1733 skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001734 if (!skb2)
1735 break;
1736
Eric Dumazet70978182010-12-20 21:22:51 +00001737 net_timestamp_set(skb2);
1738
Linus Torvalds1da177e2005-04-16 15:20:36 -07001739 /* skb->nh should be correctly
1740 set by sender, so that the second statement is
1741 just protection against buggy protocols.
1742 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001743 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001744
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001745 if (skb_network_header(skb2) < skb2->data ||
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001746 skb2->network_header > skb2->tail) {
Joe Perchese87cc472012-05-13 21:56:26 +00001747 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1748 ntohs(skb2->protocol),
1749 dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001750 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001751 }
1752
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001753 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001754 skb2->pkt_type = PACKET_OUTGOING;
Changli Gao71d9dec2010-12-15 19:57:25 +00001755 pt_prev = ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001756 }
1757 }
Changli Gao71d9dec2010-12-15 19:57:25 +00001758 if (pt_prev)
1759 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001760 rcu_read_unlock();
1761}
1762
Ben Hutchings2c530402012-07-10 10:55:09 +00001763/**
1764 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
John Fastabend4f57c082011-01-17 08:06:04 +00001765 * @dev: Network device
1766 * @txq: number of queues available
1767 *
1768 * If real_num_tx_queues is changed the tc mappings may no longer be
1769 * valid. To resolve this, verify that each tc mapping remains valid
1770 * and, if not, zero the mapping. With no priorities mapping to an
1771 * offset/count pair, that pair will no longer be used. In the worst
1772 * case, if TC0 is invalid, nothing can be done, so priority mappings
1773 * are disabled. It is expected that drivers will fix this mapping if
1774 * they can before calling netif_set_real_num_tx_queues.
1775 */
Eric Dumazetbb134d22011-01-20 19:18:08 +00001776static void netif_setup_tc(struct net_device *dev, unsigned int txq)
John Fastabend4f57c082011-01-17 08:06:04 +00001777{
1778 int i;
1779 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1780
1781 /* If TC0 is invalidated disable TC mapping */
1782 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001783 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
John Fastabend4f57c082011-01-17 08:06:04 +00001784 dev->num_tc = 0;
1785 return;
1786 }
1787
1788 /* Invalidated prio to tc mappings set to TC0 */
1789 for (i = 1; i < TC_BITMASK + 1; i++) {
1790 int q = netdev_get_prio_tc_map(dev, i);
1791
1792 tc = &dev->tc_to_txq[q];
1793 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001794 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1795 i, q);
John Fastabend4f57c082011-01-17 08:06:04 +00001796 netdev_set_prio_tc_map(dev, i, 0);
1797 }
1798 }
1799}
1800
Alexander Duyck537c00d2013-01-10 08:57:02 +00001801#ifdef CONFIG_XPS
1802static DEFINE_MUTEX(xps_map_mutex);
1803#define xmap_dereference(P) \
1804 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1805
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001806static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1807 int cpu, u16 index)
1808{
1809 struct xps_map *map = NULL;
1810 int pos;
1811
1812 if (dev_maps)
1813 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1814
1815 for (pos = 0; map && pos < map->len; pos++) {
1816 if (map->queues[pos] == index) {
1817 if (map->len > 1) {
1818 map->queues[pos] = map->queues[--map->len];
1819 } else {
1820 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1821 kfree_rcu(map, rcu);
1822 map = NULL;
1823 }
1824 break;
1825 }
1826 }
1827
1828 return map;
1829}
1830
Alexander Duyck024e9672013-01-10 08:57:46 +00001831static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
Alexander Duyck537c00d2013-01-10 08:57:02 +00001832{
1833 struct xps_dev_maps *dev_maps;
Alexander Duyck024e9672013-01-10 08:57:46 +00001834 int cpu, i;
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001835 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001836
1837 mutex_lock(&xps_map_mutex);
1838 dev_maps = xmap_dereference(dev->xps_maps);
1839
1840 if (!dev_maps)
1841 goto out_no_maps;
1842
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001843 for_each_possible_cpu(cpu) {
Alexander Duyck024e9672013-01-10 08:57:46 +00001844 for (i = index; i < dev->num_tx_queues; i++) {
1845 if (!remove_xps_queue(dev_maps, cpu, i))
1846 break;
1847 }
1848 if (i == dev->num_tx_queues)
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001849 active = true;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001850 }
1851
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001852 if (!active) {
Alexander Duyck537c00d2013-01-10 08:57:02 +00001853 RCU_INIT_POINTER(dev->xps_maps, NULL);
1854 kfree_rcu(dev_maps, rcu);
1855 }
1856
Alexander Duyck024e9672013-01-10 08:57:46 +00001857 for (i = index; i < dev->num_tx_queues; i++)
1858 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1859 NUMA_NO_NODE);
1860
Alexander Duyck537c00d2013-01-10 08:57:02 +00001861out_no_maps:
1862 mutex_unlock(&xps_map_mutex);
1863}
1864
Alexander Duyck01c5f862013-01-10 08:57:35 +00001865static struct xps_map *expand_xps_map(struct xps_map *map,
1866 int cpu, u16 index)
1867{
1868 struct xps_map *new_map;
1869 int alloc_len = XPS_MIN_MAP_ALLOC;
1870 int i, pos;
1871
1872 for (pos = 0; map && pos < map->len; pos++) {
1873 if (map->queues[pos] != index)
1874 continue;
1875 return map;
1876 }
1877
1878 /* Need to add queue to this CPU's existing map */
1879 if (map) {
1880 if (pos < map->alloc_len)
1881 return map;
1882
1883 alloc_len = map->alloc_len * 2;
1884 }
1885
1886 /* Need to allocate a new map to store the queue for this CPU */
1887 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1888 cpu_to_node(cpu));
1889 if (!new_map)
1890 return NULL;
1891
1892 for (i = 0; i < pos; i++)
1893 new_map->queues[i] = map->queues[i];
1894 new_map->alloc_len = alloc_len;
1895 new_map->len = pos;
1896
1897 return new_map;
1898}
1899
Alexander Duyck537c00d2013-01-10 08:57:02 +00001900int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1901{
Alexander Duyck01c5f862013-01-10 08:57:35 +00001902 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001903 struct xps_map *map, *new_map;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001904 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001905 int cpu, numa_node_id = -2;
1906 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001907
1908 mutex_lock(&xps_map_mutex);
1909
1910 dev_maps = xmap_dereference(dev->xps_maps);
1911
Alexander Duyck01c5f862013-01-10 08:57:35 +00001912 /* allocate memory for queue storage */
1913 for_each_online_cpu(cpu) {
1914 if (!cpumask_test_cpu(cpu, mask))
1915 continue;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001916
Alexander Duyck01c5f862013-01-10 08:57:35 +00001917 if (!new_dev_maps)
1918 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1919 if (!new_dev_maps)
1920 return -ENOMEM;
1921
1922 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1923 NULL;
1924
1925 map = expand_xps_map(map, cpu, index);
1926 if (!map)
1927 goto error;
1928
1929 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1930 }
1931
1932 if (!new_dev_maps)
1933 goto out_no_new_maps;
1934
1935 for_each_possible_cpu(cpu) {
1936 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1937 /* add queue to CPU maps */
1938 int pos = 0;
1939
1940 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1941 while ((pos < map->len) && (map->queues[pos] != index))
1942 pos++;
1943
1944 if (pos == map->len)
1945 map->queues[map->len++] = index;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001946#ifdef CONFIG_NUMA
Alexander Duyck537c00d2013-01-10 08:57:02 +00001947 if (numa_node_id == -2)
1948 numa_node_id = cpu_to_node(cpu);
1949 else if (numa_node_id != cpu_to_node(cpu))
1950 numa_node_id = -1;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001951#endif
Alexander Duyck01c5f862013-01-10 08:57:35 +00001952 } else if (dev_maps) {
1953 /* fill in the new device map from the old device map */
1954 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1955 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
Alexander Duyck537c00d2013-01-10 08:57:02 +00001956 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00001957
Alexander Duyck537c00d2013-01-10 08:57:02 +00001958 }
1959
Alexander Duyck01c5f862013-01-10 08:57:35 +00001960 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1961
Alexander Duyck537c00d2013-01-10 08:57:02 +00001962 /* Cleanup old maps */
Alexander Duyck01c5f862013-01-10 08:57:35 +00001963 if (dev_maps) {
1964 for_each_possible_cpu(cpu) {
1965 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1966 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1967 if (map && map != new_map)
1968 kfree_rcu(map, rcu);
1969 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001970
Alexander Duyck537c00d2013-01-10 08:57:02 +00001971 kfree_rcu(dev_maps, rcu);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001972 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001973
Alexander Duyck01c5f862013-01-10 08:57:35 +00001974 dev_maps = new_dev_maps;
1975 active = true;
1976
1977out_no_new_maps:
1978 /* update Tx queue numa node */
Alexander Duyck537c00d2013-01-10 08:57:02 +00001979 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1980 (numa_node_id >= 0) ? numa_node_id :
1981 NUMA_NO_NODE);
1982
Alexander Duyck01c5f862013-01-10 08:57:35 +00001983 if (!dev_maps)
1984 goto out_no_maps;
1985
1986 /* removes queue from unused CPUs */
1987 for_each_possible_cpu(cpu) {
1988 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1989 continue;
1990
1991 if (remove_xps_queue(dev_maps, cpu, index))
1992 active = true;
1993 }
1994
1995 /* free map if not active */
1996 if (!active) {
1997 RCU_INIT_POINTER(dev->xps_maps, NULL);
1998 kfree_rcu(dev_maps, rcu);
1999 }
2000
2001out_no_maps:
Alexander Duyck537c00d2013-01-10 08:57:02 +00002002 mutex_unlock(&xps_map_mutex);
2003
2004 return 0;
2005error:
Alexander Duyck01c5f862013-01-10 08:57:35 +00002006 /* remove any maps that we added */
2007 for_each_possible_cpu(cpu) {
2008 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2009 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2010 NULL;
2011 if (new_map && new_map != map)
2012 kfree(new_map);
2013 }
2014
Alexander Duyck537c00d2013-01-10 08:57:02 +00002015 mutex_unlock(&xps_map_mutex);
2016
Alexander Duyck537c00d2013-01-10 08:57:02 +00002017 kfree(new_dev_maps);
2018 return -ENOMEM;
2019}
2020EXPORT_SYMBOL(netif_set_xps_queue);
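/*
 * Illustrative sketch (hypothetical driver configuration): pinning TX
 * queue @qid of a device to a single CPU with netif_set_xps_queue().
 * The function name is an assumption; only the cpumask helpers and the
 * XPS call itself are real APIs.
 */
static int example_set_xps_for_cpu(struct net_device *dev, u16 qid, int cpu)
{
	cpumask_var_t mask;
	int err;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_set_cpu(cpu, mask);
	err = netif_set_xps_queue(dev, mask, qid);

	free_cpumask_var(mask);
	return err;
}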
2021
2022#endif
John Fastabendf0796d52010-07-01 13:21:57 +00002023/*
2024 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2025 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2026 */
Tom Herberte6484932010-10-18 18:04:39 +00002027int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
John Fastabendf0796d52010-07-01 13:21:57 +00002028{
Tom Herbert1d24eb42010-11-21 13:17:27 +00002029 int rc;
2030
Tom Herberte6484932010-10-18 18:04:39 +00002031 if (txq < 1 || txq > dev->num_tx_queues)
2032 return -EINVAL;
John Fastabendf0796d52010-07-01 13:21:57 +00002033
Ben Hutchings5c565802011-02-15 19:39:21 +00002034 if (dev->reg_state == NETREG_REGISTERED ||
2035 dev->reg_state == NETREG_UNREGISTERING) {
Tom Herberte6484932010-10-18 18:04:39 +00002036 ASSERT_RTNL();
2037
Tom Herbert1d24eb42010-11-21 13:17:27 +00002038 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2039 txq);
Tom Herbertbf264142010-11-26 08:36:09 +00002040 if (rc)
2041 return rc;
2042
John Fastabend4f57c082011-01-17 08:06:04 +00002043 if (dev->num_tc)
2044 netif_setup_tc(dev, txq);
2045
Alexander Duyck024e9672013-01-10 08:57:46 +00002046 if (txq < dev->real_num_tx_queues) {
Tom Herberte6484932010-10-18 18:04:39 +00002047 qdisc_reset_all_tx_gt(dev, txq);
Alexander Duyck024e9672013-01-10 08:57:46 +00002048#ifdef CONFIG_XPS
2049 netif_reset_xps_queues_gt(dev, txq);
2050#endif
2051 }
John Fastabendf0796d52010-07-01 13:21:57 +00002052 }
Tom Herberte6484932010-10-18 18:04:39 +00002053
2054 dev->real_num_tx_queues = txq;
2055 return 0;
John Fastabendf0796d52010-07-01 13:21:57 +00002056}
2057EXPORT_SYMBOL(netif_set_real_num_tx_queues);
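/*
 * Illustrative sketch (hypothetical reconfiguration path): changing the
 * number of TX queues a registered multiqueue device actually uses.
 * Must run under the RTNL lock; the function name and the channel
 * parameter are assumptions.
 */
static int example_set_tx_channels(struct net_device *dev,
				   unsigned int channels)
{
	ASSERT_RTNL();

	if (channels < 1 || channels > dev->num_tx_queues)
		return -EINVAL;

	/* stale skbs on queues >= channels are flushed by the core */
	return netif_set_real_num_tx_queues(dev, channels);
}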
Denis Vlasenko56079432006-03-29 15:57:29 -08002058
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002059#ifdef CONFIG_RPS
2060/**
2061 * netif_set_real_num_rx_queues - set actual number of RX queues used
2062 * @dev: Network device
2063 * @rxq: Actual number of RX queues
2064 *
2065 * This must be called either with the rtnl_lock held or before
2066 * registration of the net device. Returns 0 on success, or a
Ben Hutchings4e7f7952010-10-08 10:33:39 -07002067 * negative error code. If called before registration, it always
2068 * succeeds.
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002069 */
2070int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2071{
2072 int rc;
2073
Tom Herbertbd25fa72010-10-18 18:00:16 +00002074 if (rxq < 1 || rxq > dev->num_rx_queues)
2075 return -EINVAL;
2076
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002077 if (dev->reg_state == NETREG_REGISTERED) {
2078 ASSERT_RTNL();
2079
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002080 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2081 rxq);
2082 if (rc)
2083 return rc;
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002084 }
2085
2086 dev->real_num_rx_queues = rxq;
2087 return 0;
2088}
2089EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2090#endif
2091
Ben Hutchings2c530402012-07-10 10:55:09 +00002092/**
2093 * netif_get_num_default_rss_queues - default number of RSS queues
Yuval Mintz16917b82012-07-01 03:18:50 +00002094 *
2095 * This routine should set an upper limit on the number of RSS queues
2096 * used by default by multiqueue devices.
2097 */
Ben Hutchingsa55b1382012-07-10 10:54:38 +00002098int netif_get_num_default_rss_queues(void)
Yuval Mintz16917b82012-07-01 03:18:50 +00002099{
2100 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2101}
2102EXPORT_SYMBOL(netif_get_num_default_rss_queues);
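/*
 * Illustrative sketch (hypothetical probe-time policy): capping a
 * device's RX queue count with netif_get_num_default_rss_queues() and
 * applying it via netif_set_real_num_rx_queues().  The helper name and
 * the hw_max_queues parameter are assumptions.
 */
static int example_pick_rx_queues(struct net_device *dev,
				  unsigned int hw_max_queues)
{
	unsigned int n;

	n = min_t(unsigned int, hw_max_queues,
		  netif_get_num_default_rss_queues());

	/* before register_netdev() this always succeeds */
	return netif_set_real_num_rx_queues(dev, n);
}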
2103
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002104static inline void __netif_reschedule(struct Qdisc *q)
2105{
2106 struct softnet_data *sd;
2107 unsigned long flags;
2108
2109 local_irq_save(flags);
2110 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00002111 q->next_sched = NULL;
2112 *sd->output_queue_tailp = q;
2113 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002114 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2115 local_irq_restore(flags);
2116}
2117
David S. Miller37437bb2008-07-16 02:15:04 -07002118void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08002119{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002120 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2121 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08002122}
2123EXPORT_SYMBOL(__netif_schedule);
2124
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002125void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08002126{
David S. Miller3578b0c2010-08-03 00:24:04 -07002127 if (atomic_dec_and_test(&skb->users)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002128 struct softnet_data *sd;
2129 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08002130
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002131 local_irq_save(flags);
2132 sd = &__get_cpu_var(softnet_data);
2133 skb->next = sd->completion_queue;
2134 sd->completion_queue = skb;
2135 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2136 local_irq_restore(flags);
2137 }
Denis Vlasenko56079432006-03-29 15:57:29 -08002138}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002139EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08002140
2141void dev_kfree_skb_any(struct sk_buff *skb)
2142{
2143 if (in_irq() || irqs_disabled())
2144 dev_kfree_skb_irq(skb);
2145 else
2146 dev_kfree_skb(skb);
2147}
2148EXPORT_SYMBOL(dev_kfree_skb_any);
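/*
 * Illustrative sketch (hypothetical driver cleanup): releasing completed
 * TX skbs from a path that may run in hard-irq or in process context.
 * dev_kfree_skb_any() picks the safe variant; the ring array and the
 * function name are assumptions.
 */
static void example_clean_tx_ring(struct sk_buff **ring, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!ring[i])
			continue;
		dev_kfree_skb_any(ring[i]);	/* safe in any context */
		ring[i] = NULL;
	}
}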
2149
2150
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002151/**
2152 * netif_device_detach - mark device as removed
2153 * @dev: network device
2154 *
2155 * Mark device as removed from system and therefore no longer available.
2156 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002157void netif_device_detach(struct net_device *dev)
2158{
2159 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2160 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002161 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002162 }
2163}
2164EXPORT_SYMBOL(netif_device_detach);
2165
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002166/**
2167 * netif_device_attach - mark device as attached
2168 * @dev: network device
2169 *
2170 * Mark device as attached to the system and restart it if needed.
2171 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002172void netif_device_attach(struct net_device *dev)
2173{
2174 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2175 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002176 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002177 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002178 }
2179}
2180EXPORT_SYMBOL(netif_device_attach);
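/*
 * Illustrative sketch (hypothetical driver suspend/resume): marking the
 * netdev as absent while the hardware sleeps so the stack stops queuing
 * packets to it.  The example_* names and the power-management wiring
 * are assumptions.
 */
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if running */
	/* ... put the hardware to sleep ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... wake the hardware up ... */
	netif_device_attach(dev);	/* restarts queues and the watchdog */
	return 0;
}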
2181
Ben Hutchings36c92472012-01-17 07:57:56 +00002182static void skb_warn_bad_offload(const struct sk_buff *skb)
2183{
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002184 static const netdev_features_t null_features = 0;
Ben Hutchings36c92472012-01-17 07:57:56 +00002185 struct net_device *dev = skb->dev;
2186 const char *driver = "";
2187
2188 if (dev && dev->dev.parent)
2189 driver = dev_driver_string(dev->dev.parent);
2190
2191 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2192 "gso_type=%d ip_summed=%d\n",
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002193 driver, dev ? &dev->features : &null_features,
2194 skb->sk ? &skb->sk->sk_route_caps : &null_features,
Ben Hutchings36c92472012-01-17 07:57:56 +00002195 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2196 skb_shinfo(skb)->gso_type, skb->ip_summed);
2197}
2198
Linus Torvalds1da177e2005-04-16 15:20:36 -07002199/*
2200 * Invalidate hardware checksum when packet is to be mangled, and
2201 * complete checksum manually on outgoing path.
2202 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07002203int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002204{
Al Virod3bc23e2006-11-14 21:24:49 -08002205 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07002206 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207
Patrick McHardy84fa7932006-08-29 16:44:56 -07002208 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07002209 goto out_set_summed;
2210
2211 if (unlikely(skb_shinfo(skb)->gso_size)) {
Ben Hutchings36c92472012-01-17 07:57:56 +00002212 skb_warn_bad_offload(skb);
2213 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002214 }
2215
Eric Dumazetcef401d2013-01-25 20:34:37 +00002216 /* Before computing a checksum, we should make sure no frag could
2217 * be modified by an external entity: the checksum could otherwise be wrong.
2218 */
2219 if (skb_has_shared_frag(skb)) {
2220 ret = __skb_linearize(skb);
2221 if (ret)
2222 goto out;
2223 }
2224
Michał Mirosław55508d62010-12-14 15:24:08 +00002225 offset = skb_checksum_start_offset(skb);
Herbert Xua0308472007-10-15 01:47:15 -07002226 BUG_ON(offset >= skb_headlen(skb));
2227 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2228
2229 offset += skb->csum_offset;
2230 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2231
2232 if (skb_cloned(skb) &&
2233 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002234 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2235 if (ret)
2236 goto out;
2237 }
2238
Herbert Xua0308472007-10-15 01:47:15 -07002239 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07002240out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002241 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002242out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002243 return ret;
2244}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002245EXPORT_SYMBOL(skb_checksum_help);
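/*
 * Illustrative sketch (hypothetical xmit helper): falling back to a
 * software checksum when the hardware cannot offload this packet.  The
 * hw_can_csum flag stands in for a driver-specific capability check and
 * is an assumption.
 */
static int example_tx_checksum(struct sk_buff *skb, bool hw_can_csum)
{
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;		/* nothing to complete */

	if (hw_can_csum)
		return 0;		/* hardware will fill it in */

	/* complete the checksum in software before handing to the NIC */
	return skb_checksum_help(skb);
}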
Linus Torvalds1da177e2005-04-16 15:20:36 -07002246
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002247/**
2248 * skb_mac_gso_segment - mac layer segmentation handler.
2249 * @skb: buffer to segment
2250 * @features: features for the output path (see dev->features)
2251 */
2252struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2253 netdev_features_t features)
2254{
2255 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2256 struct packet_offload *ptype;
2257 __be16 type = skb->protocol;
2258
2259 while (type == htons(ETH_P_8021Q)) {
2260 int vlan_depth = ETH_HLEN;
2261 struct vlan_hdr *vh;
2262
2263 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2264 return ERR_PTR(-EINVAL);
2265
2266 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2267 type = vh->h_vlan_encapsulated_proto;
2268 vlan_depth += VLAN_HLEN;
2269 }
2270
2271 __skb_pull(skb, skb->mac_len);
2272
2273 rcu_read_lock();
2274 list_for_each_entry_rcu(ptype, &offload_base, list) {
2275 if (ptype->type == type && ptype->callbacks.gso_segment) {
2276 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2277 int err;
2278
2279 err = ptype->callbacks.gso_send_check(skb);
2280 segs = ERR_PTR(err);
2281 if (err || skb_gso_ok(skb, features))
2282 break;
2283 __skb_push(skb, (skb->data -
2284 skb_network_header(skb)));
2285 }
2286 segs = ptype->callbacks.gso_segment(skb, features);
2287 break;
2288 }
2289 }
2290 rcu_read_unlock();
2291
2292 __skb_push(skb, skb->data - skb_mac_header(skb));
2293
2294 return segs;
2295}
2296EXPORT_SYMBOL(skb_mac_gso_segment);
2297
2298
Cong Wang12b00042013-02-05 16:36:38 +00002299/* openvswitch calls this on the rx path, so we need a different check.
2300 */
2301static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2302{
2303 if (tx_path)
2304 return skb->ip_summed != CHECKSUM_PARTIAL;
2305 else
2306 return skb->ip_summed == CHECKSUM_NONE;
2307}
2308
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002309/**
Cong Wang12b00042013-02-05 16:36:38 +00002310 * __skb_gso_segment - Perform segmentation on skb.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002311 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07002312 * @features: features for the output path (see dev->features)
Cong Wang12b00042013-02-05 16:36:38 +00002313 * @tx_path: whether it is called in TX path
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002314 *
2315 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07002316 *
2317 * It may return NULL if the skb requires no segmentation. This is
2318 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002319 */
Cong Wang12b00042013-02-05 16:36:38 +00002320struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2321 netdev_features_t features, bool tx_path)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002322{
Cong Wang12b00042013-02-05 16:36:38 +00002323 if (unlikely(skb_needs_check(skb, tx_path))) {
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002324 int err;
2325
Ben Hutchings36c92472012-01-17 07:57:56 +00002326 skb_warn_bad_offload(skb);
Herbert Xu67fd1a72009-01-19 16:26:44 -08002327
Herbert Xua430a432006-07-08 13:34:56 -07002328 if (skb_header_cloned(skb) &&
2329 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2330 return ERR_PTR(err);
2331 }
2332
Pravin B Shelar68c33162013-02-14 14:02:41 +00002333 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002334 skb_reset_mac_header(skb);
2335 skb_reset_mac_len(skb);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002336
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002337 return skb_mac_gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002338}
Cong Wang12b00042013-02-05 16:36:38 +00002339EXPORT_SYMBOL(__skb_gso_segment);
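/*
 * Illustrative sketch (hypothetical software GSO fallback): segmenting
 * with skb_gso_segment() and walking the resulting list.  The xmit_one
 * callback is an assumption standing in for whatever transmits a single
 * segment; error handling is simplified.
 */
static int example_gso_fallback(struct sk_buff *skb,
				netdev_features_t features,
				int (*xmit_one)(struct sk_buff *skb))
{
	struct sk_buff *segs, *nskb;
	int err = 0;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return xmit_one(skb);	/* header check only, send as-is */

	consume_skb(skb);
	while (segs) {
		nskb = segs;
		segs = segs->next;
		nskb->next = NULL;
		if (!err)
			err = xmit_one(nskb);
		else
			kfree_skb(nskb);
	}
	return err;
}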
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002340
Herbert Xufb286bb2005-11-10 13:01:24 -08002341/* Take action when hardware reception checksum errors are detected. */
2342#ifdef CONFIG_BUG
2343void netdev_rx_csum_fault(struct net_device *dev)
2344{
2345 if (net_ratelimit()) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00002346 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08002347 dump_stack();
2348 }
2349}
2350EXPORT_SYMBOL(netdev_rx_csum_fault);
2351#endif
2352
Linus Torvalds1da177e2005-04-16 15:20:36 -07002353/* Actually, we should eliminate this check as soon as we know that:
2354 * 1. An IOMMU is present and allows mapping all of the memory.
2355 * 2. No high memory really exists on this machine.
2356 */
2357
Eric Dumazet9092c652010-04-02 13:34:49 -07002358static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002359{
Herbert Xu3d3a8532006-06-27 13:33:10 -07002360#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07002361 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002362 if (!(dev->features & NETIF_F_HIGHDMA)) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002363 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2364 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2365 if (PageHighMem(skb_frag_page(frag)))
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002366 return 1;
Ian Campbellea2ab692011-08-22 23:44:58 +00002367 }
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002368 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002369
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002370 if (PCI_DMA_BUS_IS_PHYS) {
2371 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002372
Eric Dumazet9092c652010-04-02 13:34:49 -07002373 if (!pdev)
2374 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002375 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002376 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2377 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002378 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2379 return 1;
2380 }
2381 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07002382#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002383 return 0;
2384}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002385
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002386struct dev_gso_cb {
2387 void (*destructor)(struct sk_buff *skb);
2388};
2389
2390#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2391
2392static void dev_gso_skb_destructor(struct sk_buff *skb)
2393{
2394 struct dev_gso_cb *cb;
2395
2396 do {
2397 struct sk_buff *nskb = skb->next;
2398
2399 skb->next = nskb->next;
2400 nskb->next = NULL;
2401 kfree_skb(nskb);
2402 } while (skb->next);
2403
2404 cb = DEV_GSO_CB(skb);
2405 if (cb->destructor)
2406 cb->destructor(skb);
2407}
2408
2409/**
2410 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2411 * @skb: buffer to segment
Jesse Gross91ecb632011-01-09 06:23:33 +00002412 * @features: device features as applicable to this skb
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002413 *
2414 * This function segments the given skb and stores the list of segments
2415 * in skb->next.
2416 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002417static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002418{
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002419 struct sk_buff *segs;
2420
Herbert Xu576a30e2006-06-27 13:22:38 -07002421 segs = skb_gso_segment(skb, features);
2422
2423 /* Verifying header integrity only. */
2424 if (!segs)
2425 return 0;
2426
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07002427 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002428 return PTR_ERR(segs);
2429
2430 skb->next = segs;
2431 DEV_GSO_CB(skb)->destructor = skb->destructor;
2432 skb->destructor = dev_gso_skb_destructor;
2433
2434 return 0;
2435}
2436
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002437static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
Jesse Gross03634662011-01-09 06:23:35 +00002438{
2439 return ((features & NETIF_F_GEN_CSUM) ||
2440 ((features & NETIF_F_V4_CSUM) &&
2441 protocol == htons(ETH_P_IP)) ||
2442 ((features & NETIF_F_V6_CSUM) &&
2443 protocol == htons(ETH_P_IPV6)) ||
2444 ((features & NETIF_F_FCOE_CRC) &&
2445 protocol == htons(ETH_P_FCOE)));
2446}
2447
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002448static netdev_features_t harmonize_features(struct sk_buff *skb,
2449 __be16 protocol, netdev_features_t features)
Jesse Grossf01a5232011-01-09 06:23:31 +00002450{
Ed Cashinc0d680e2012-09-19 15:49:00 +00002451 if (skb->ip_summed != CHECKSUM_NONE &&
2452 !can_checksum_protocol(features, protocol)) {
Jesse Grossf01a5232011-01-09 06:23:31 +00002453 features &= ~NETIF_F_ALL_CSUM;
2454 features &= ~NETIF_F_SG;
2455 } else if (illegal_highdma(skb->dev, skb)) {
2456 features &= ~NETIF_F_SG;
2457 }
2458
2459 return features;
2460}
2461
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002462netdev_features_t netif_skb_features(struct sk_buff *skb)
Jesse Gross58e998c2010-10-29 12:14:55 +00002463{
2464 __be16 protocol = skb->protocol;
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002465 netdev_features_t features = skb->dev->features;
Jesse Gross58e998c2010-10-29 12:14:55 +00002466
Ben Hutchings30b678d2012-07-30 15:57:00 +00002467 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2468 features &= ~NETIF_F_GSO_MASK;
2469
Jesse Gross58e998c2010-10-29 12:14:55 +00002470 if (protocol == htons(ETH_P_8021Q)) {
2471 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2472 protocol = veh->h_vlan_encapsulated_proto;
Jesse Grossf01a5232011-01-09 06:23:31 +00002473 } else if (!vlan_tx_tag_present(skb)) {
2474 return harmonize_features(skb, protocol, features);
2475 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002476
Jesse Gross6ee400a2011-01-17 20:46:00 +00002477 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
Jesse Grossf01a5232011-01-09 06:23:31 +00002478
2479 if (protocol != htons(ETH_P_8021Q)) {
2480 return harmonize_features(skb, protocol, features);
2481 } else {
2482 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
Jesse Gross6ee400a2011-01-17 20:46:00 +00002483 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
Jesse Grossf01a5232011-01-09 06:23:31 +00002484 return harmonize_features(skb, protocol, features);
2485 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002486}
Jesse Grossf01a5232011-01-09 06:23:31 +00002487EXPORT_SYMBOL(netif_skb_features);
Jesse Gross58e998c2010-10-29 12:14:55 +00002488
John Fastabend6afff0c2010-06-16 14:18:12 +00002489/*
2490 * Returns true if either:
2491 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
Rami Rosend1a53df2012-08-27 23:39:24 +00002492 * 2. skb is fragmented and the device does not support SG.
John Fastabend6afff0c2010-06-16 14:18:12 +00002493 */
2494static inline int skb_needs_linearize(struct sk_buff *skb,
Jesse Gross02932ce2011-01-09 06:23:34 +00002495 int features)
John Fastabend6afff0c2010-06-16 14:18:12 +00002496{
Jesse Gross02932ce2011-01-09 06:23:34 +00002497 return skb_is_nonlinear(skb) &&
2498 ((skb_has_frag_list(skb) &&
2499 !(features & NETIF_F_FRAGLIST)) ||
Jesse Grosse1e78db2010-10-29 12:14:53 +00002500 (skb_shinfo(skb)->nr_frags &&
Jesse Gross02932ce2011-01-09 06:23:34 +00002501 !(features & NETIF_F_SG)));
John Fastabend6afff0c2010-06-16 14:18:12 +00002502}
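/*
 * Illustrative sketch (hypothetical pre-transmit step): querying the
 * per-skb feature set with netif_skb_features() and falling back to
 * linearization or a software checksum when needed.  The function name
 * is an assumption; the checks mirror dev_hard_start_xmit() below.
 */
static int example_prepare_skb(struct sk_buff *skb)
{
	netdev_features_t features = netif_skb_features(skb);

	if (skb_needs_linearize(skb, features) && __skb_linearize(skb))
		return -ENOMEM;

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !(features & NETIF_F_ALL_CSUM))
		return skb_checksum_help(skb);

	return 0;
}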
2503
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002504int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2505 struct netdev_queue *txq)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002506{
Stephen Hemminger00829822008-11-20 20:14:53 -08002507 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00002508 int rc = NETDEV_TX_OK;
Koki Sanagiec764bf2011-05-30 21:48:34 +00002509 unsigned int skb_len;
Stephen Hemminger00829822008-11-20 20:14:53 -08002510
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002511 if (likely(!skb->next)) {
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002512 netdev_features_t features;
Jesse Grossfc741212011-01-09 06:23:32 +00002513
Eric Dumazet93f154b2009-05-18 22:19:19 -07002514 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03002515 * If the device doesn't need skb->dst, release it right now while
Eric Dumazet93f154b2009-05-18 22:19:19 -07002516 * it's hot in this cpu's cache
2517 */
Eric Dumazetadf30902009-06-02 05:19:30 +00002518 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2519 skb_dst_drop(skb);
2520
Jesse Grossfc741212011-01-09 06:23:32 +00002521 features = netif_skb_features(skb);
2522
Jesse Gross7b9c6092010-10-20 13:56:04 +00002523 if (vlan_tx_tag_present(skb) &&
Jesse Grossfc741212011-01-09 06:23:32 +00002524 !(features & NETIF_F_HW_VLAN_TX)) {
Jesse Gross7b9c6092010-10-20 13:56:04 +00002525 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2526 if (unlikely(!skb))
2527 goto out;
2528
2529 skb->vlan_tci = 0;
2530 }
2531
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002532 /* If encapsulation offload request, verify we are testing
2533 * hardware encapsulation features instead of standard
2534 * features for the netdev
2535 */
2536 if (skb->encapsulation)
2537 features &= dev->hw_enc_features;
2538
Jesse Grossfc741212011-01-09 06:23:32 +00002539 if (netif_needs_gso(skb, features)) {
Jesse Gross91ecb632011-01-09 06:23:33 +00002540 if (unlikely(dev_gso_segment(skb, features)))
David S. Miller9ccb8972010-04-22 01:02:07 -07002541 goto out_kfree_skb;
2542 if (skb->next)
2543 goto gso;
John Fastabend6afff0c2010-06-16 14:18:12 +00002544 } else {
Jesse Gross02932ce2011-01-09 06:23:34 +00002545 if (skb_needs_linearize(skb, features) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002546 __skb_linearize(skb))
2547 goto out_kfree_skb;
2548
2549 /* If packet is not checksummed and device does not
2550 * support checksumming for this protocol, complete
2551 * checksumming here.
2552 */
2553 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002554 if (skb->encapsulation)
2555 skb_set_inner_transport_header(skb,
2556 skb_checksum_start_offset(skb));
2557 else
2558 skb_set_transport_header(skb,
2559 skb_checksum_start_offset(skb));
Jesse Gross03634662011-01-09 06:23:35 +00002560 if (!(features & NETIF_F_ALL_CSUM) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002561 skb_checksum_help(skb))
2562 goto out_kfree_skb;
2563 }
David S. Miller9ccb8972010-04-22 01:02:07 -07002564 }
2565
Eric Dumazetb40863c2012-09-18 20:44:49 +00002566 if (!list_empty(&ptype_all))
2567 dev_queue_xmit_nit(skb, dev);
2568
Koki Sanagiec764bf2011-05-30 21:48:34 +00002569 skb_len = skb->len;
Patrick Ohlyac45f602009-02-12 05:03:37 +00002570 rc = ops->ndo_start_xmit(skb, dev);
Koki Sanagiec764bf2011-05-30 21:48:34 +00002571 trace_net_dev_xmit(skb, rc, dev, skb_len);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002572 if (rc == NETDEV_TX_OK)
Eric Dumazet08baf562009-05-25 22:58:01 -07002573 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00002574 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002575 }
2576
Herbert Xu576a30e2006-06-27 13:22:38 -07002577gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002578 do {
2579 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002580
2581 skb->next = nskb->next;
2582 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00002583
2584 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03002585		 * If the device doesn't need nskb->dst, release it right now while
Krishna Kumar068a2de2009-12-09 20:59:58 +00002586		 * it's hot in this CPU cache
2587 */
2588 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2589 skb_dst_drop(nskb);
2590
Eric Dumazetb40863c2012-09-18 20:44:49 +00002591 if (!list_empty(&ptype_all))
2592 dev_queue_xmit_nit(nskb, dev);
2593
Koki Sanagiec764bf2011-05-30 21:48:34 +00002594 skb_len = nskb->len;
Stephen Hemminger00829822008-11-20 20:14:53 -08002595 rc = ops->ndo_start_xmit(nskb, dev);
Koki Sanagiec764bf2011-05-30 21:48:34 +00002596 trace_net_dev_xmit(nskb, rc, dev, skb_len);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002597 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002598 if (rc & ~NETDEV_TX_MASK)
2599 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07002600 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002601 skb->next = nskb;
2602 return rc;
2603 }
Eric Dumazet08baf562009-05-25 22:58:01 -07002604 txq_trans_update(txq);
Tom Herbert734664982011-11-28 16:32:44 +00002605 if (unlikely(netif_xmit_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07002606 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002607 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002608
Patrick McHardy572a9d72009-11-10 06:14:14 +00002609out_kfree_gso_skb:
2610 if (likely(skb->next == NULL))
2611 skb->destructor = DEV_GSO_CB(skb)->destructor;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002612out_kfree_skb:
2613 kfree_skb(skb);
Jesse Gross7b9c6092010-10-20 13:56:04 +00002614out:
Patrick McHardy572a9d72009-11-10 06:14:14 +00002615 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002616}
2617
Eric Dumazet1def9232013-01-10 12:36:42 +00002618static void qdisc_pkt_len_init(struct sk_buff *skb)
2619{
2620 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2621
2622 qdisc_skb_cb(skb)->pkt_len = skb->len;
2623
2624 /* To get more precise estimation of bytes sent on wire,
2625 * we add to pkt_len the headers size of all segments
2626 */
2627 if (shinfo->gso_size) {
Eric Dumazet757b8b12013-01-15 21:14:21 -08002628 unsigned int hdr_len;
Eric Dumazet1def9232013-01-10 12:36:42 +00002629
Eric Dumazet757b8b12013-01-15 21:14:21 -08002630 /* mac layer + network layer */
2631 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2632
2633 /* + transport layer */
Eric Dumazet1def9232013-01-10 12:36:42 +00002634 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2635 hdr_len += tcp_hdrlen(skb);
2636 else
2637 hdr_len += sizeof(struct udphdr);
2638 qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
2639 }
2640}
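/*
 * Illustrative note (editor's addition, not part of the original source):
 * for a TCP GSO skb carrying 3 segments behind a 14-byte Ethernet header,
 * a 20-byte IPv4 header and a 32-byte TCP header, hdr_len works out to
 * 14 + 20 + 32 = 66, so qdisc_pkt_len_init() adds (3 - 1) * 66 = 132 bytes
 * to pkt_len on top of skb->len, approximating what actually hits the wire
 * once the skb is segmented.  The figures are assumptions chosen only to
 * make the arithmetic concrete.
 */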
2641
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002642static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2643 struct net_device *dev,
2644 struct netdev_queue *txq)
2645{
2646 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002647 bool contended;
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002648 int rc;
2649
Eric Dumazet1def9232013-01-10 12:36:42 +00002650 qdisc_pkt_len_init(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002651 qdisc_calculate_pkt_len(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002652 /*
2653 * Heuristic to force contended enqueues to serialize on a
2654 * separate lock before trying to get qdisc main lock.
2655 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2656 * and dequeue packets faster.
2657 */
Eric Dumazeta2da5702011-01-20 03:48:19 +00002658 contended = qdisc_is_running(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002659 if (unlikely(contended))
2660 spin_lock(&q->busylock);
2661
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002662 spin_lock(root_lock);
2663 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2664 kfree_skb(skb);
2665 rc = NET_XMIT_DROP;
2666 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07002667 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002668 /*
2669 * This is a work-conserving queue; there are no old skbs
2670 * waiting to be sent out; and the qdisc is not running -
2671 * xmit the skb directly.
2672 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002673 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2674 skb_dst_force(skb);
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002675
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002676 qdisc_bstats_update(q, skb);
2677
Eric Dumazet79640a42010-06-02 05:09:29 -07002678 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2679 if (unlikely(contended)) {
2680 spin_unlock(&q->busylock);
2681 contended = false;
2682 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002683 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002684 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07002685 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002686
2687 rc = NET_XMIT_SUCCESS;
2688 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002689 skb_dst_force(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002690 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
Eric Dumazet79640a42010-06-02 05:09:29 -07002691 if (qdisc_run_begin(q)) {
2692 if (unlikely(contended)) {
2693 spin_unlock(&q->busylock);
2694 contended = false;
2695 }
2696 __qdisc_run(q);
2697 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002698 }
2699 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07002700 if (unlikely(contended))
2701 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002702 return rc;
2703}
2704
Neil Horman5bc14212011-11-22 05:10:51 +00002705#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2706static void skb_update_prio(struct sk_buff *skb)
2707{
Igor Maravic6977a792011-11-25 07:44:54 +00002708 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
Neil Horman5bc14212011-11-22 05:10:51 +00002709
Eric Dumazet91c68ce2012-07-08 21:45:10 +00002710 if (!skb->priority && skb->sk && map) {
2711 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2712
2713 if (prioidx < map->priomap_len)
2714 skb->priority = map->priomap[prioidx];
2715 }
Neil Horman5bc14212011-11-22 05:10:51 +00002716}
2717#else
2718#define skb_update_prio(skb)
2719#endif
2720
Eric Dumazet745e20f2010-09-29 13:23:09 -07002721static DEFINE_PER_CPU(int, xmit_recursion);
David S. Miller11a766c2010-10-25 12:51:55 -07002722#define RECURSION_LIMIT 10
Eric Dumazet745e20f2010-09-29 13:23:09 -07002723
Dave Jonesd29f7492008-07-22 14:09:06 -07002724/**
Michel Machado95603e22012-06-12 10:16:35 +00002725 * dev_loopback_xmit - loop back @skb
2726 * @skb: buffer to transmit
2727 */
2728int dev_loopback_xmit(struct sk_buff *skb)
2729{
2730 skb_reset_mac_header(skb);
2731 __skb_pull(skb, skb_network_offset(skb));
2732 skb->pkt_type = PACKET_LOOPBACK;
2733 skb->ip_summed = CHECKSUM_UNNECESSARY;
2734 WARN_ON(!skb_dst(skb));
2735 skb_dst_force(skb);
2736 netif_rx_ni(skb);
2737 return 0;
2738}
2739EXPORT_SYMBOL(dev_loopback_xmit);
2740
2741/**
Dave Jonesd29f7492008-07-22 14:09:06 -07002742 * dev_queue_xmit - transmit a buffer
2743 * @skb: buffer to transmit
2744 *
2745 * Queue a buffer for transmission to a network device. The caller must
2746 * have set the device and priority and built the buffer before calling
2747 * this function. The function can be called from an interrupt.
2748 *
2749 * A negative errno code is returned on a failure. A success does not
2750 * guarantee the frame will be transmitted as it may be dropped due
2751 * to congestion or traffic shaping.
2752 *
2753 * -----------------------------------------------------------------------------------
2754 * I notice this method can also return errors from the queue disciplines,
2755 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2756 * be positive.
2757 *
2758 * Regardless of the return value, the skb is consumed, so it is currently
2759 * difficult to retry a send to this method. (You can bump the ref count
2760 * before sending to hold a reference for retry if you are careful.)
2761 *
2762 * When calling this method, interrupts MUST be enabled. This is because
2763 * the BH enable code must have IRQs enabled so that it will not deadlock.
2764 * --BLG
2765 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002766int dev_queue_xmit(struct sk_buff *skb)
2767{
2768 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002769 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002770 struct Qdisc *q;
2771 int rc = -ENOMEM;
2772
Eric Dumazet6d1ccff2013-02-05 20:22:20 +00002773 skb_reset_mac_header(skb);
2774
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002775 /* Disable soft irqs for various locks below. Also
2776 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002777 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002778 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002779
Neil Horman5bc14212011-11-22 05:10:51 +00002780 skb_update_prio(skb);
2781
Amerigo Wang8c4c49d2012-09-17 20:16:31 +00002782 txq = netdev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002783 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002784
Linus Torvalds1da177e2005-04-16 15:20:36 -07002785#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002786 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002787#endif
Koki Sanagicf66ba52010-08-23 18:45:02 +09002788 trace_net_dev_queue(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002789 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002790 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002791 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002792 }
2793
2794 /* The device has no queue. Common case for software devices:
2795 loopback, all the sorts of tunnels...
2796
Herbert Xu932ff272006-06-09 12:20:56 -07002797 Really, it is unlikely that netif_tx_lock protection is necessary
2798 here. (f.e. loopback and IP tunnels are clean ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002799 counters.)
 2800	   However, it is possible that they rely on the protection
 2801	   made by us here.
 2802
 2803	   Check this and shoot the lock. It is not prone to deadlocks.
 2804	   Either shoot the noqueue qdisc, it is even simpler 8)
2805 */
2806 if (dev->flags & IFF_UP) {
2807 int cpu = smp_processor_id(); /* ok because BHs are off */
2808
David S. Millerc773e842008-07-08 23:13:53 -07002809 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810
Eric Dumazet745e20f2010-09-29 13:23:09 -07002811 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2812 goto recursion_alert;
2813
David S. Millerc773e842008-07-08 23:13:53 -07002814 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002815
Tom Herbert734664982011-11-28 16:32:44 +00002816 if (!netif_xmit_stopped(txq)) {
Eric Dumazet745e20f2010-09-29 13:23:09 -07002817 __this_cpu_inc(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002818 rc = dev_hard_start_xmit(skb, dev, txq);
Eric Dumazet745e20f2010-09-29 13:23:09 -07002819 __this_cpu_dec(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002820 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002821 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002822 goto out;
2823 }
2824 }
David S. Millerc773e842008-07-08 23:13:53 -07002825 HARD_TX_UNLOCK(dev, txq);
Joe Perchese87cc472012-05-13 21:56:26 +00002826 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2827 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002828 } else {
2829 /* Recursion is detected! It is possible,
Eric Dumazet745e20f2010-09-29 13:23:09 -07002830 * unfortunately
2831 */
2832recursion_alert:
Joe Perchese87cc472012-05-13 21:56:26 +00002833 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2834 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002835 }
2836 }
2837
2838 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002839 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002840
Linus Torvalds1da177e2005-04-16 15:20:36 -07002841 kfree_skb(skb);
2842 return rc;
2843out:
Herbert Xud4828d82006-06-22 02:28:18 -07002844 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002845 return rc;
2846}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002847EXPORT_SYMBOL(dev_queue_xmit);
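/*
 * Illustrative sketch (editor's addition, kept under #if 0 so it is never
 * built): a minimal caller of dev_queue_xmit() from a hypothetical module.
 * The example_* names and the ETH_P_IP payload type are assumptions, and
 * link-layer header construction (e.g. via dev_hard_header()) is omitted
 * for brevity.  As the kernel-doc above notes, the skb is consumed
 * whatever the return value is.
 */
#if 0
static int example_xmit(struct net_device *dev, const void *data,
			unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), data, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);	/* assumed payload type */

	/* May return positive NET_XMIT_* codes from the qdisc as well. */
	return dev_queue_xmit(skb);
}
#endif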
Linus Torvalds1da177e2005-04-16 15:20:36 -07002848
2849
2850/*=======================================================================
2851 Receiver routines
2852 =======================================================================*/
2853
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002854int netdev_max_backlog __read_mostly = 1000;
Eric Dumazetc9e6bc62012-09-27 19:29:05 +00002855EXPORT_SYMBOL(netdev_max_backlog);
2856
Eric Dumazet3b098e22010-05-15 23:57:10 -07002857int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002858int netdev_budget __read_mostly = 300;
2859int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002860
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002861/* Called with irq disabled */
2862static inline void ____napi_schedule(struct softnet_data *sd,
2863 struct napi_struct *napi)
2864{
2865 list_add_tail(&napi->poll_list, &sd->poll_list);
2866 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2867}
2868
Eric Dumazetdf334542010-03-24 19:13:54 +00002869#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002870
2871/* One global table that all flow-based protocols share. */
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002872struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002873EXPORT_SYMBOL(rps_sock_flow_table);
2874
Ingo Molnarc5905af2012-02-24 08:31:31 +01002875struct static_key rps_needed __read_mostly;
Eric Dumazetadc93002011-11-17 03:13:26 +00002876
Ben Hutchingsc4454772011-01-19 11:03:53 +00002877static struct rps_dev_flow *
2878set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2879 struct rps_dev_flow *rflow, u16 next_cpu)
2880{
Ben Hutchings09994d12011-10-03 04:42:46 +00002881 if (next_cpu != RPS_NO_CPU) {
Ben Hutchingsc4454772011-01-19 11:03:53 +00002882#ifdef CONFIG_RFS_ACCEL
2883 struct netdev_rx_queue *rxqueue;
2884 struct rps_dev_flow_table *flow_table;
2885 struct rps_dev_flow *old_rflow;
2886 u32 flow_id;
2887 u16 rxq_index;
2888 int rc;
2889
2890 /* Should we steer this flow to a different hardware queue? */
Ben Hutchings69a19ee2011-02-15 20:32:04 +00002891 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2892 !(dev->features & NETIF_F_NTUPLE))
Ben Hutchingsc4454772011-01-19 11:03:53 +00002893 goto out;
2894 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2895 if (rxq_index == skb_get_rx_queue(skb))
2896 goto out;
2897
2898 rxqueue = dev->_rx + rxq_index;
2899 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2900 if (!flow_table)
2901 goto out;
2902 flow_id = skb->rxhash & flow_table->mask;
2903 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2904 rxq_index, flow_id);
2905 if (rc < 0)
2906 goto out;
2907 old_rflow = rflow;
2908 rflow = &flow_table->flows[flow_id];
Ben Hutchingsc4454772011-01-19 11:03:53 +00002909 rflow->filter = rc;
2910 if (old_rflow->filter == rflow->filter)
2911 old_rflow->filter = RPS_NO_FILTER;
2912 out:
2913#endif
2914 rflow->last_qtail =
Ben Hutchings09994d12011-10-03 04:42:46 +00002915 per_cpu(softnet_data, next_cpu).input_queue_head;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002916 }
2917
Ben Hutchings09994d12011-10-03 04:42:46 +00002918 rflow->cpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002919 return rflow;
2920}
2921
Tom Herbert0a9627f2010-03-16 08:03:29 +00002922/*
2923 * get_rps_cpu is called from netif_receive_skb and returns the target
2924 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002925 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002926 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002927static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2928 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002929{
Tom Herbert0a9627f2010-03-16 08:03:29 +00002930 struct netdev_rx_queue *rxqueue;
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002931 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002932 struct rps_dev_flow_table *flow_table;
2933 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002934 int cpu = -1;
Tom Herbertfec5e652010-04-16 16:01:27 -07002935 u16 tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002936
Tom Herbert0a9627f2010-03-16 08:03:29 +00002937 if (skb_rx_queue_recorded(skb)) {
2938 u16 index = skb_get_rx_queue(skb);
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002939 if (unlikely(index >= dev->real_num_rx_queues)) {
2940 WARN_ONCE(dev->real_num_rx_queues > 1,
2941 "%s received packet on queue %u, but number "
2942 "of RX queues is %u\n",
2943 dev->name, index, dev->real_num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002944 goto done;
2945 }
2946 rxqueue = dev->_rx + index;
2947 } else
2948 rxqueue = dev->_rx;
2949
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002950 map = rcu_dereference(rxqueue->rps_map);
2951 if (map) {
Tom Herbert85875232011-01-31 16:23:42 -08002952 if (map->len == 1 &&
Eric Dumazet33d480c2011-08-11 19:30:52 +00002953 !rcu_access_pointer(rxqueue->rps_flow_table)) {
Changli Gao6febfca2010-09-03 23:12:37 +00002954 tcpu = map->cpus[0];
2955 if (cpu_online(tcpu))
2956 cpu = tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002957 goto done;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002958 }
Eric Dumazet33d480c2011-08-11 19:30:52 +00002959 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002960 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002961 }
2962
Changli Gao2d47b452010-08-17 19:00:56 +00002963 skb_reset_network_header(skb);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002964 if (!skb_get_rxhash(skb))
Tom Herbert0a9627f2010-03-16 08:03:29 +00002965 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002966
Tom Herbertfec5e652010-04-16 16:01:27 -07002967 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2968 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2969 if (flow_table && sock_flow_table) {
2970 u16 next_cpu;
2971 struct rps_dev_flow *rflow;
2972
2973 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2974 tcpu = rflow->cpu;
2975
2976 next_cpu = sock_flow_table->ents[skb->rxhash &
2977 sock_flow_table->mask];
2978
2979 /*
2980 * If the desired CPU (where last recvmsg was done) is
2981 * different from current CPU (one in the rx-queue flow
2982 * table entry), switch if one of the following holds:
2983 * - Current CPU is unset (equal to RPS_NO_CPU).
2984 * - Current CPU is offline.
2985 * - The current CPU's queue tail has advanced beyond the
2986 * last packet that was enqueued using this table entry.
2987 * This guarantees that all previous packets for the flow
2988 * have been dequeued, thus preserving in order delivery.
2989 */
2990 if (unlikely(tcpu != next_cpu) &&
2991 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2992 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
Tom Herbertbaefa312012-11-16 09:04:15 +00002993 rflow->last_qtail)) >= 0)) {
2994 tcpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002995 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
Tom Herbertbaefa312012-11-16 09:04:15 +00002996 }
Ben Hutchingsc4454772011-01-19 11:03:53 +00002997
Tom Herbertfec5e652010-04-16 16:01:27 -07002998 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2999 *rflowp = rflow;
3000 cpu = tcpu;
3001 goto done;
3002 }
3003 }
3004
Tom Herbert0a9627f2010-03-16 08:03:29 +00003005 if (map) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003006 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00003007
3008 if (cpu_online(tcpu)) {
3009 cpu = tcpu;
3010 goto done;
3011 }
3012 }
3013
3014done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00003015 return cpu;
3016}
3017
Ben Hutchingsc4454772011-01-19 11:03:53 +00003018#ifdef CONFIG_RFS_ACCEL
3019
3020/**
3021 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3022 * @dev: Device on which the filter was set
3023 * @rxq_index: RX queue index
3024 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3025 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3026 *
3027 * Drivers that implement ndo_rx_flow_steer() should periodically call
3028 * this function for each installed filter and remove the filters for
3029 * which it returns %true.
3030 */
3031bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3032 u32 flow_id, u16 filter_id)
3033{
3034 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3035 struct rps_dev_flow_table *flow_table;
3036 struct rps_dev_flow *rflow;
3037 bool expire = true;
3038 int cpu;
3039
3040 rcu_read_lock();
3041 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3042 if (flow_table && flow_id <= flow_table->mask) {
3043 rflow = &flow_table->flows[flow_id];
3044 cpu = ACCESS_ONCE(rflow->cpu);
3045 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3046 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3047 rflow->last_qtail) <
3048 (int)(10 * flow_table->mask)))
3049 expire = false;
3050 }
3051 rcu_read_unlock();
3052 return expire;
3053}
3054EXPORT_SYMBOL(rps_may_expire_flow);
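/*
 * Illustrative sketch (editor's addition, kept under #if 0): the kind of
 * periodic scan the kernel-doc above describes for a driver implementing
 * ndo_rx_flow_steer().  All example_* names and the filter bookkeeping are
 * assumptions standing in for driver-private state.
 */
#if 0
static void example_expire_rx_filters(struct example_priv *priv)
{
	unsigned int i;

	for (i = 0; i < priv->n_filters; i++) {
		struct example_filter *f = &priv->filters[i];

		if (!f->installed)
			continue;
		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
					f->flow_id, f->filter_id)) {
			example_remove_hw_filter(priv, f);
			f->installed = false;
		}
	}
}
#endif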
3055
3056#endif /* CONFIG_RFS_ACCEL */
3057
Tom Herbert0a9627f2010-03-16 08:03:29 +00003058/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003059static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003060{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003061 struct softnet_data *sd = data;
3062
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003063 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00003064 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003065}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003066
Tom Herbertfec5e652010-04-16 16:01:27 -07003067#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00003068
3069/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003070 * Check if this softnet_data structure belongs to another CPU.
 3071 * If yes, queue it to our IPI list and return 1.
 3072 * If no, return 0.
3073 */
3074static int rps_ipi_queued(struct softnet_data *sd)
3075{
3076#ifdef CONFIG_RPS
3077 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3078
3079 if (sd != mysd) {
3080 sd->rps_ipi_next = mysd->rps_ipi_list;
3081 mysd->rps_ipi_list = sd;
3082
3083 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3084 return 1;
3085 }
3086#endif /* CONFIG_RPS */
3087 return 0;
3088}
3089
3090/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00003091 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3092 * queue (may be a remote CPU queue).
3093 */
Tom Herbertfec5e652010-04-16 16:01:27 -07003094static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3095 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003096{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003097 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003098 unsigned long flags;
3099
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003100 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003101
3102 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003103
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003104 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003105 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3106 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00003107enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003108 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003109 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003110 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00003111 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003112 return NET_RX_SUCCESS;
3113 }
3114
Eric Dumazetebda37c22010-05-06 23:51:21 +00003115		/* Schedule NAPI for the backlog device.
 3116		 * We can use a non-atomic operation since we own the queue lock.
3117 */
3118 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003119 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003120 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003121 }
3122 goto enqueue;
3123 }
3124
Changli Gaodee42872010-05-02 05:42:16 +00003125 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003126 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003127
Tom Herbert0a9627f2010-03-16 08:03:29 +00003128 local_irq_restore(flags);
3129
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003130 atomic_long_inc(&skb->dev->rx_dropped);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003131 kfree_skb(skb);
3132 return NET_RX_DROP;
3133}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003134
Linus Torvalds1da177e2005-04-16 15:20:36 -07003135/**
3136 * netif_rx - post buffer to the network code
3137 * @skb: buffer to post
3138 *
3139 * This function receives a packet from a device driver and queues it for
3140 * the upper (protocol) levels to process. It always succeeds. The buffer
3141 * may be dropped during processing for congestion control or by the
3142 * protocol layers.
3143 *
3144 * return values:
3145 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003146 * NET_RX_DROP (packet was dropped)
3147 *
3148 */
3149
3150int netif_rx(struct sk_buff *skb)
3151{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003152 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003153
3154 /* if netpoll wants it, pretend we never saw it */
3155 if (netpoll_rx(skb))
3156 return NET_RX_DROP;
3157
Eric Dumazet588f0332011-11-15 04:12:55 +00003158 net_timestamp_check(netdev_tstamp_prequeue, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003159
Koki Sanagicf66ba52010-08-23 18:45:02 +09003160 trace_netif_rx(skb);
Eric Dumazetdf334542010-03-24 19:13:54 +00003161#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003162 if (static_key_false(&rps_needed)) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003163 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003164 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003165
Changli Gaocece1942010-08-07 20:35:43 -07003166 preempt_disable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003167 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003168
3169 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003170 if (cpu < 0)
3171 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07003172
3173 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3174
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003175 rcu_read_unlock();
Changli Gaocece1942010-08-07 20:35:43 -07003176 preempt_enable();
Eric Dumazetadc93002011-11-17 03:13:26 +00003177 } else
3178#endif
Tom Herbertfec5e652010-04-16 16:01:27 -07003179 {
3180 unsigned int qtail;
3181 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3182 put_cpu();
3183 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003184 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003185}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003186EXPORT_SYMBOL(netif_rx);
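/*
 * Illustrative sketch (editor's addition, kept under #if 0): a minimal
 * non-NAPI receive path in a hypothetical driver, handing one freshly
 * copied frame to netif_rx() from its interrupt handler.  example_rx_one()
 * and its arguments are assumptions; only the netif_rx() call itself is
 * taken from the code above.
 */
#if 0
static void example_rx_one(struct net_device *dev, const void *buf,
			   unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), buf, len);
	skb->protocol = eth_type_trans(skb, dev);

	/* Always "succeeds"; the packet may still be dropped on congestion. */
	netif_rx(skb);
}
#endif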
Linus Torvalds1da177e2005-04-16 15:20:36 -07003187
3188int netif_rx_ni(struct sk_buff *skb)
3189{
3190 int err;
3191
3192 preempt_disable();
3193 err = netif_rx(skb);
3194 if (local_softirq_pending())
3195 do_softirq();
3196 preempt_enable();
3197
3198 return err;
3199}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003200EXPORT_SYMBOL(netif_rx_ni);
3201
Linus Torvalds1da177e2005-04-16 15:20:36 -07003202static void net_tx_action(struct softirq_action *h)
3203{
3204 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3205
3206 if (sd->completion_queue) {
3207 struct sk_buff *clist;
3208
3209 local_irq_disable();
3210 clist = sd->completion_queue;
3211 sd->completion_queue = NULL;
3212 local_irq_enable();
3213
3214 while (clist) {
3215 struct sk_buff *skb = clist;
3216 clist = clist->next;
3217
Ilpo Järvinen547b7922008-07-25 21:43:18 -07003218 WARN_ON(atomic_read(&skb->users));
Koki Sanagi07dc22e2010-08-23 18:46:12 +09003219 trace_kfree_skb(skb, net_tx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003220 __kfree_skb(skb);
3221 }
3222 }
3223
3224 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07003225 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003226
3227 local_irq_disable();
3228 head = sd->output_queue;
3229 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00003230 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003231 local_irq_enable();
3232
3233 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07003234 struct Qdisc *q = head;
3235 spinlock_t *root_lock;
3236
Linus Torvalds1da177e2005-04-16 15:20:36 -07003237 head = head->next_sched;
3238
David S. Miller5fb66222008-08-02 20:02:43 -07003239 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07003240 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07003241 smp_mb__before_clear_bit();
3242 clear_bit(__QDISC_STATE_SCHED,
3243 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07003244 qdisc_run(q);
3245 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003246 } else {
David S. Miller195648b2008-08-19 04:00:36 -07003247 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003248 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07003249 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003250 } else {
3251 smp_mb__before_clear_bit();
3252 clear_bit(__QDISC_STATE_SCHED,
3253 &q->state);
3254 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003255 }
3256 }
3257 }
3258}
3259
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003260#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3261 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
Michał Mirosławda678292009-06-05 05:35:28 +00003262/* This hook is defined here for ATM LANE */
3263int (*br_fdb_test_addr_hook)(struct net_device *dev,
3264 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07003265EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00003266#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003267
Linus Torvalds1da177e2005-04-16 15:20:36 -07003268#ifdef CONFIG_NET_CLS_ACT
3269/* TODO: Maybe we should just force sch_ingress to be compiled in
 3270 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 3271 * instructions (an extra compare and 2 stores) when we don't have it on
 3272 * but do have CONFIG_NET_CLS_ACT.
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003273 * NOTE: This doesn't stop any functionality; if you don't have
 3274 * the ingress scheduler, you just can't add policies on ingress.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003275 *
3276 */
Eric Dumazet24824a02010-10-02 06:11:55 +00003277static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003278{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003279 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003280 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07003281 int result = TC_ACT_OK;
3282 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003283
Stephen Hemmingerde384832010-08-01 00:33:23 -07003284 if (unlikely(MAX_RED_LOOP < ttl++)) {
Joe Perchese87cc472012-05-13 21:56:26 +00003285 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3286 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07003287 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003288 }
3289
Herbert Xuf697c3e2007-10-14 00:38:47 -07003290 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3291 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3292
David S. Miller83874002008-07-17 00:53:03 -07003293 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07003294 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07003295 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07003296 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3297 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07003298 spin_unlock(qdisc_lock(q));
3299 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07003300
Linus Torvalds1da177e2005-04-16 15:20:36 -07003301 return result;
3302}
Herbert Xuf697c3e2007-10-14 00:38:47 -07003303
3304static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3305 struct packet_type **pt_prev,
3306 int *ret, struct net_device *orig_dev)
3307{
Eric Dumazet24824a02010-10-02 06:11:55 +00003308 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3309
3310 if (!rxq || rxq->qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07003311 goto out;
3312
3313 if (*pt_prev) {
3314 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3315 *pt_prev = NULL;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003316 }
3317
Eric Dumazet24824a02010-10-02 06:11:55 +00003318 switch (ing_filter(skb, rxq)) {
Herbert Xuf697c3e2007-10-14 00:38:47 -07003319 case TC_ACT_SHOT:
3320 case TC_ACT_STOLEN:
3321 kfree_skb(skb);
3322 return NULL;
3323 }
3324
3325out:
3326 skb->tc_verd = 0;
3327 return skb;
3328}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003329#endif
3330
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003331/**
3332 * netdev_rx_handler_register - register receive handler
3333 * @dev: device to register a handler for
3334 * @rx_handler: receive handler to register
Jiri Pirko93e2c322010-06-10 03:34:59 +00003335 * @rx_handler_data: data pointer that is used by rx handler
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003336 *
 3337 * Register a receive handler for a device. This handler will then be
3338 * called from __netif_receive_skb. A negative errno code is returned
3339 * on a failure.
3340 *
3341 * The caller must hold the rtnl_mutex.
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003342 *
3343 * For a general description of rx_handler, see enum rx_handler_result.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003344 */
3345int netdev_rx_handler_register(struct net_device *dev,
Jiri Pirko93e2c322010-06-10 03:34:59 +00003346 rx_handler_func_t *rx_handler,
3347 void *rx_handler_data)
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003348{
3349 ASSERT_RTNL();
3350
3351 if (dev->rx_handler)
3352 return -EBUSY;
3353
Jiri Pirko93e2c322010-06-10 03:34:59 +00003354 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003355 rcu_assign_pointer(dev->rx_handler, rx_handler);
3356
3357 return 0;
3358}
3359EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
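/*
 * Illustrative sketch (editor's addition, kept under #if 0): registering a
 * hypothetical rx_handler under RTNL, loosely modelled on how bridge and
 * bonding use this API.  The example_* names are assumptions; the handler
 * signature and the RX_HANDLER_* return values are the real ones described
 * in enum rx_handler_result.
 */
#if 0
static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct example_port *port = rcu_dereference(skb->dev->rx_handler_data);

	if (!example_wants_frame(port, skb))
		return RX_HANDLER_PASS;		/* stack sees it unchanged */

	example_forward(port, skb);
	return RX_HANDLER_CONSUMED;		/* handler took over the skb */
}

static int example_attach(struct net_device *dev, struct example_port *port)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, example_handle_frame, port);
	rtnl_unlock();
	return err;
}
#endif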
3360
3361/**
3362 * netdev_rx_handler_unregister - unregister receive handler
3363 * @dev: device to unregister a handler from
3364 *
 3365 * Unregister a receive handler from a device.
3366 *
3367 * The caller must hold the rtnl_mutex.
3368 */
3369void netdev_rx_handler_unregister(struct net_device *dev)
3370{
3371
3372 ASSERT_RTNL();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003373 RCU_INIT_POINTER(dev->rx_handler, NULL);
3374 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003375}
3376EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3377
Mel Gormanb4b9e352012-07-31 16:44:26 -07003378/*
3379 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3380 * the special handling of PFMEMALLOC skbs.
3381 */
3382static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3383{
3384 switch (skb->protocol) {
3385 case __constant_htons(ETH_P_ARP):
3386 case __constant_htons(ETH_P_IP):
3387 case __constant_htons(ETH_P_IPV6):
3388 case __constant_htons(ETH_P_8021Q):
3389 return true;
3390 default:
3391 return false;
3392 }
3393}
3394
David S. Miller9754e292013-02-14 15:57:38 -05003395static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003396{
3397 struct packet_type *ptype, *pt_prev;
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003398 rx_handler_func_t *rx_handler;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003399 struct net_device *orig_dev;
David S. Miller63d8ea72011-02-28 10:48:59 -08003400 struct net_device *null_or_dev;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003401 bool deliver_exact = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003402 int ret = NET_RX_DROP;
Al Viro252e3342006-11-14 20:48:11 -08003403 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003404
Eric Dumazet588f0332011-11-15 04:12:55 +00003405 net_timestamp_check(!netdev_tstamp_prequeue, skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07003406
Koki Sanagicf66ba52010-08-23 18:45:02 +09003407 trace_netif_receive_skb(skb);
Patrick McHardy9b22ea52008-11-04 14:49:57 -08003408
Linus Torvalds1da177e2005-04-16 15:20:36 -07003409 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003410 if (netpoll_receive_skb(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003411 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003412
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07003413 orig_dev = skb->dev;
Jiri Pirko1765a572011-02-12 06:48:36 +00003414
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003415 skb_reset_network_header(skb);
Eric Dumazetfda55ec2013-01-07 09:28:21 +00003416 if (!skb_transport_header_was_set(skb))
3417 skb_reset_transport_header(skb);
Jiri Pirko0b5c9db2011-06-10 06:56:58 +00003418 skb_reset_mac_len(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003419
3420 pt_prev = NULL;
3421
3422 rcu_read_lock();
3423
David S. Miller63d8ea72011-02-28 10:48:59 -08003424another_round:
David S. Millerb6858172012-07-23 16:27:54 -07003425 skb->skb_iif = skb->dev->ifindex;
David S. Miller63d8ea72011-02-28 10:48:59 -08003426
3427 __this_cpu_inc(softnet_data.processed);
3428
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003429 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3430 skb = vlan_untag(skb);
3431 if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003432 goto unlock;
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003433 }
3434
Linus Torvalds1da177e2005-04-16 15:20:36 -07003435#ifdef CONFIG_NET_CLS_ACT
3436 if (skb->tc_verd & TC_NCLS) {
3437 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3438 goto ncls;
3439 }
3440#endif
3441
David S. Miller9754e292013-02-14 15:57:38 -05003442 if (pfmemalloc)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003443 goto skip_taps;
3444
Linus Torvalds1da177e2005-04-16 15:20:36 -07003445 list_for_each_entry_rcu(ptype, &ptype_all, list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003446 if (!ptype->dev || ptype->dev == skb->dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003447 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003448 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003449 pt_prev = ptype;
3450 }
3451 }
3452
Mel Gormanb4b9e352012-07-31 16:44:26 -07003453skip_taps:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003454#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07003455 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3456 if (!skb)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003457 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003458ncls:
3459#endif
3460
David S. Miller9754e292013-02-14 15:57:38 -05003461 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003462 goto drop;
3463
John Fastabend24257172011-10-10 09:16:41 +00003464 if (vlan_tx_tag_present(skb)) {
3465 if (pt_prev) {
3466 ret = deliver_skb(skb, pt_prev, orig_dev);
3467 pt_prev = NULL;
3468 }
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003469 if (vlan_do_receive(&skb))
John Fastabend24257172011-10-10 09:16:41 +00003470 goto another_round;
3471 else if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003472 goto unlock;
John Fastabend24257172011-10-10 09:16:41 +00003473 }
3474
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003475 rx_handler = rcu_dereference(skb->dev->rx_handler);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003476 if (rx_handler) {
3477 if (pt_prev) {
3478 ret = deliver_skb(skb, pt_prev, orig_dev);
3479 pt_prev = NULL;
3480 }
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003481 switch (rx_handler(&skb)) {
3482 case RX_HANDLER_CONSUMED:
Mel Gormanb4b9e352012-07-31 16:44:26 -07003483 goto unlock;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003484 case RX_HANDLER_ANOTHER:
David S. Miller63d8ea72011-02-28 10:48:59 -08003485 goto another_round;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003486 case RX_HANDLER_EXACT:
3487 deliver_exact = true;
3488 case RX_HANDLER_PASS:
3489 break;
3490 default:
3491 BUG();
3492 }
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003493 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003494
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003495 if (vlan_tx_nonzero_tag_present(skb))
3496 skb->pkt_type = PACKET_OTHERHOST;
3497
David S. Miller63d8ea72011-02-28 10:48:59 -08003498 /* deliver only exact match when indicated */
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003499 null_or_dev = deliver_exact ? skb->dev : NULL;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003500
Linus Torvalds1da177e2005-04-16 15:20:36 -07003501 type = skb->protocol;
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003502 list_for_each_entry_rcu(ptype,
3503 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003504 if (ptype->type == type &&
Jiri Pirkoe3f48d32011-02-28 20:26:31 +00003505 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3506 ptype->dev == orig_dev)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003507 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003508 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003509 pt_prev = ptype;
3510 }
3511 }
3512
3513 if (pt_prev) {
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003514 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
Michael S. Tsirkin0e698bf2012-09-15 22:44:16 +00003515 goto drop;
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003516 else
3517 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003518 } else {
Mel Gormanb4b9e352012-07-31 16:44:26 -07003519drop:
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003520 atomic_long_inc(&skb->dev->rx_dropped);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003521 kfree_skb(skb);
 3522		/* Jamal, now you will not be able to escape explaining
 3523		 * to me how you were going to use this. :-)
3524 */
3525 ret = NET_RX_DROP;
3526 }
3527
Mel Gormanb4b9e352012-07-31 16:44:26 -07003528unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003529 rcu_read_unlock();
Mel Gormanb4b9e352012-07-31 16:44:26 -07003530out:
David S. Miller9754e292013-02-14 15:57:38 -05003531 return ret;
3532}
3533
3534static int __netif_receive_skb(struct sk_buff *skb)
3535{
3536 int ret;
3537
3538 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3539 unsigned long pflags = current->flags;
3540
3541 /*
3542 * PFMEMALLOC skbs are special, they should
3543 * - be delivered to SOCK_MEMALLOC sockets only
3544 * - stay away from userspace
3545 * - have bounded memory usage
3546 *
3547 * Use PF_MEMALLOC as this saves us from propagating the allocation
3548 * context down to all allocation sites.
3549 */
3550 current->flags |= PF_MEMALLOC;
3551 ret = __netif_receive_skb_core(skb, true);
3552 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3553 } else
3554 ret = __netif_receive_skb_core(skb, false);
3555
Linus Torvalds1da177e2005-04-16 15:20:36 -07003556 return ret;
3557}
Tom Herbert0a9627f2010-03-16 08:03:29 +00003558
3559/**
3560 * netif_receive_skb - process receive buffer from network
3561 * @skb: buffer to process
3562 *
3563 * netif_receive_skb() is the main receive data processing function.
3564 * It always succeeds. The buffer may be dropped during processing
3565 * for congestion control or by the protocol layers.
3566 *
3567 * This function may only be called from softirq context and interrupts
3568 * should be enabled.
3569 *
3570 * Return values (usually ignored):
3571 * NET_RX_SUCCESS: no congestion
3572 * NET_RX_DROP: packet was dropped
3573 */
3574int netif_receive_skb(struct sk_buff *skb)
3575{
Eric Dumazet588f0332011-11-15 04:12:55 +00003576 net_timestamp_check(netdev_tstamp_prequeue, skb);
Eric Dumazet3b098e22010-05-15 23:57:10 -07003577
Richard Cochranc1f19b52010-07-17 08:49:36 +00003578 if (skb_defer_rx_timestamp(skb))
3579 return NET_RX_SUCCESS;
3580
Eric Dumazetdf334542010-03-24 19:13:54 +00003581#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003582 if (static_key_false(&rps_needed)) {
Eric Dumazet3b098e22010-05-15 23:57:10 -07003583 struct rps_dev_flow voidflow, *rflow = &voidflow;
3584 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003585
Eric Dumazet3b098e22010-05-15 23:57:10 -07003586 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00003587
Eric Dumazet3b098e22010-05-15 23:57:10 -07003588 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07003589
Eric Dumazet3b098e22010-05-15 23:57:10 -07003590 if (cpu >= 0) {
3591 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3592 rcu_read_unlock();
Eric Dumazetadc93002011-11-17 03:13:26 +00003593 return ret;
Eric Dumazet3b098e22010-05-15 23:57:10 -07003594 }
Eric Dumazetadc93002011-11-17 03:13:26 +00003595 rcu_read_unlock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003596 }
Tom Herbert1e94d722010-03-18 17:45:44 -07003597#endif
Eric Dumazetadc93002011-11-17 03:13:26 +00003598 return __netif_receive_skb(skb);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003599}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003600EXPORT_SYMBOL(netif_receive_skb);
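/*
 * Illustrative sketch (editor's addition, kept under #if 0): the softirq
 * context the kernel-doc above requires is normally a NAPI poll routine,
 * along these lines.  example_priv, example_rx_frame() and the budget
 * handling details are assumptions.
 */
#if 0
static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv,
						 napi);
	int work_done = 0;

	while (work_done < budget) {
		struct sk_buff *skb = example_rx_frame(priv);

		if (!skb)
			break;
		skb->protocol = eth_type_trans(skb, priv->netdev);
		netif_receive_skb(skb);
		work_done++;
	}

	if (work_done < budget)
		napi_complete(napi);
	return work_done;
}
#endif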
Linus Torvalds1da177e2005-04-16 15:20:36 -07003601
Eric Dumazet88751272010-04-19 05:07:33 +00003602/* Network device is going away, flush any packets still pending
3603 * Called with irqs disabled.
3604 */
Changli Gao152102c2010-03-30 20:16:22 +00003605static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003606{
Changli Gao152102c2010-03-30 20:16:22 +00003607 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003608 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003609 struct sk_buff *skb, *tmp;
3610
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003611 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003612 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003613 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003614 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003615 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003616 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003617 }
Changli Gao6e7676c2010-04-27 15:07:33 -07003618 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003619 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003620
3621 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3622 if (skb->dev == dev) {
3623 __skb_unlink(skb, &sd->process_queue);
3624 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003625 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003626 }
3627 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003628}
3629
Herbert Xud565b0a2008-12-15 23:38:52 -08003630static int napi_gro_complete(struct sk_buff *skb)
3631{
Vlad Yasevich22061d82012-11-15 08:49:11 +00003632 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003633 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003634 struct list_head *head = &offload_base;
Herbert Xud565b0a2008-12-15 23:38:52 -08003635 int err = -ENOENT;
3636
Eric Dumazetc3c7c252012-12-06 13:54:59 +00003637 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3638
Herbert Xufc59f9a2009-04-14 15:11:06 -07003639 if (NAPI_GRO_CB(skb)->count == 1) {
3640 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003641 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07003642 }
Herbert Xud565b0a2008-12-15 23:38:52 -08003643
3644 rcu_read_lock();
3645 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003646 if (ptype->type != type || !ptype->callbacks.gro_complete)
Herbert Xud565b0a2008-12-15 23:38:52 -08003647 continue;
3648
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003649 err = ptype->callbacks.gro_complete(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003650 break;
3651 }
3652 rcu_read_unlock();
3653
3654 if (err) {
3655 WARN_ON(&ptype->list == head);
3656 kfree_skb(skb);
3657 return NET_RX_SUCCESS;
3658 }
3659
3660out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003661 return netif_receive_skb(skb);
3662}
3663
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003664/* napi->gro_list contains packets ordered by age, with the
 3665 * youngest packets at the head of it.
 3666 * Complete skbs in reverse order to reduce latencies.
3667 */
3668void napi_gro_flush(struct napi_struct *napi, bool flush_old)
Herbert Xud565b0a2008-12-15 23:38:52 -08003669{
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003670 struct sk_buff *skb, *prev = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003671
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003672 /* scan list and build reverse chain */
3673 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3674 skb->prev = prev;
3675 prev = skb;
Herbert Xud565b0a2008-12-15 23:38:52 -08003676 }
3677
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003678 for (skb = prev; skb; skb = prev) {
3679 skb->next = NULL;
3680
3681 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3682 return;
3683
3684 prev = skb->prev;
3685 napi_gro_complete(skb);
3686 napi->gro_count--;
3687 }
3688
Herbert Xud565b0a2008-12-15 23:38:52 -08003689 napi->gro_list = NULL;
3690}
Eric Dumazet86cac582010-08-31 18:25:32 +00003691EXPORT_SYMBOL(napi_gro_flush);
Herbert Xud565b0a2008-12-15 23:38:52 -08003692
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003693static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3694{
3695 struct sk_buff *p;
3696 unsigned int maclen = skb->dev->hard_header_len;
3697
3698 for (p = napi->gro_list; p; p = p->next) {
3699 unsigned long diffs;
3700
3701 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3702 diffs |= p->vlan_tci ^ skb->vlan_tci;
3703 if (maclen == ETH_HLEN)
3704 diffs |= compare_ether_header(skb_mac_header(p),
3705 skb_gro_mac_header(skb));
3706 else if (!diffs)
3707 diffs = memcmp(skb_mac_header(p),
3708 skb_gro_mac_header(skb),
3709 maclen);
3710 NAPI_GRO_CB(p)->same_flow = !diffs;
3711 NAPI_GRO_CB(p)->flush = 0;
3712 }
3713}
3714
Rami Rosenbb728822012-11-28 21:55:25 +00003715static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003716{
3717 struct sk_buff **pp = NULL;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003718 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003719 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003720 struct list_head *head = &offload_base;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003721 int same_flow;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003722 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003723
Jarek Poplawskice9e76c2010-08-05 01:19:11 +00003724 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
Herbert Xud565b0a2008-12-15 23:38:52 -08003725 goto normal;
3726
David S. Miller21dc3302010-08-23 00:13:46 -07003727 if (skb_is_gso(skb) || skb_has_frag_list(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003728 goto normal;
3729
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003730 gro_list_prepare(napi, skb);
3731
Herbert Xud565b0a2008-12-15 23:38:52 -08003732 rcu_read_lock();
3733 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003734 if (ptype->type != type || !ptype->callbacks.gro_receive)
Herbert Xud565b0a2008-12-15 23:38:52 -08003735 continue;
3736
Herbert Xu86911732009-01-29 14:19:50 +00003737 skb_set_network_header(skb, skb_gro_offset(skb));
Eric Dumazetefd94502013-02-14 17:31:48 +00003738 skb_reset_mac_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003739 NAPI_GRO_CB(skb)->same_flow = 0;
3740 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003741 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003742
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003743 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003744 break;
3745 }
3746 rcu_read_unlock();
3747
3748 if (&ptype->list == head)
3749 goto normal;
3750
Herbert Xu0da2afd52008-12-26 14:57:42 -08003751 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003752 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003753
Herbert Xud565b0a2008-12-15 23:38:52 -08003754 if (pp) {
3755 struct sk_buff *nskb = *pp;
3756
3757 *pp = nskb->next;
3758 nskb->next = NULL;
3759 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003760 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003761 }
3762
Herbert Xu0da2afd52008-12-26 14:57:42 -08003763 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003764 goto ok;
3765
Herbert Xu4ae55442009-02-08 18:00:36 +00003766 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003767 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003768
Herbert Xu4ae55442009-02-08 18:00:36 +00003769 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003770 NAPI_GRO_CB(skb)->count = 1;
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003771 NAPI_GRO_CB(skb)->age = jiffies;
Herbert Xu86911732009-01-29 14:19:50 +00003772 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003773 skb->next = napi->gro_list;
3774 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003775 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003776
Herbert Xuad0f9902009-02-01 01:24:55 -08003777pull:
Herbert Xucb189782009-05-26 18:50:31 +00003778 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3779 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3780
3781 BUG_ON(skb->end - skb->tail < grow);
3782
3783 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3784
3785 skb->tail += grow;
3786 skb->data_len -= grow;
3787
3788 skb_shinfo(skb)->frags[0].page_offset += grow;
Eric Dumazet9e903e02011-10-18 21:00:24 +00003789 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
Herbert Xucb189782009-05-26 18:50:31 +00003790
Eric Dumazet9e903e02011-10-18 21:00:24 +00003791 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
Ian Campbellea2ab692011-08-22 23:44:58 +00003792 skb_frag_unref(skb, 0);
Herbert Xucb189782009-05-26 18:50:31 +00003793 memmove(skb_shinfo(skb)->frags,
3794 skb_shinfo(skb)->frags + 1,
Jarek Poplawskie5093ae2010-08-11 02:02:10 +00003795 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
Herbert Xucb189782009-05-26 18:50:31 +00003796 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003797 }
3798
Herbert Xud565b0a2008-12-15 23:38:52 -08003799ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003800 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003801
3802normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003803 ret = GRO_NORMAL;
3804 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003805}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003806
Herbert Xu96e93ea2009-01-06 10:49:34 -08003807
Rami Rosenbb728822012-11-28 21:55:25 +00003808static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003809{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003810 switch (ret) {
3811 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003812 if (netif_receive_skb(skb))
3813 ret = GRO_DROP;
3814 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003815
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003816 case GRO_DROP:
Herbert Xu5d38a072009-01-04 16:13:40 -08003817 kfree_skb(skb);
3818 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003819
Eric Dumazetdaa86542012-04-19 07:07:40 +00003820 case GRO_MERGED_FREE:
Eric Dumazetd7e88832012-04-30 08:10:34 +00003821 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3822 kmem_cache_free(skbuff_head_cache, skb);
3823 else
3824 __kfree_skb(skb);
Eric Dumazetdaa86542012-04-19 07:07:40 +00003825 break;
3826
Ben Hutchings5b252f02009-10-29 07:17:09 +00003827 case GRO_HELD:
3828 case GRO_MERGED:
3829 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003830 }
3831
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003832 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003833}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003834
Eric Dumazetca07e432012-10-06 22:28:06 +00003835static void skb_gro_reset_offset(struct sk_buff *skb)
Herbert Xu78a478d2009-05-26 18:50:21 +00003836{
Eric Dumazetca07e432012-10-06 22:28:06 +00003837 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3838 const skb_frag_t *frag0 = &pinfo->frags[0];
3839
Herbert Xu78a478d2009-05-26 18:50:21 +00003840 NAPI_GRO_CB(skb)->data_offset = 0;
3841 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003842 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003843
Herbert Xu78d3fd02009-05-26 18:50:23 +00003844 if (skb->mac_header == skb->tail &&
Eric Dumazetca07e432012-10-06 22:28:06 +00003845 pinfo->nr_frags &&
3846 !PageHighMem(skb_frag_page(frag0))) {
3847 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3848 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
Herbert Xu74895942009-05-26 18:50:27 +00003849 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003850}
Herbert Xu78a478d2009-05-26 18:50:21 +00003851
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003852gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003853{
Herbert Xu86911732009-01-29 14:19:50 +00003854 skb_gro_reset_offset(skb);
3855
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003856 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003857}
3858EXPORT_SYMBOL(napi_gro_receive);
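/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a NAPI driver typically hands each received buffer to napi_gro_receive()
 * from its poll routine instead of calling netif_receive_skb() directly, so
 * that dev_gro_receive() above gets a chance to merge consecutive segments.
 * The names my_priv, my_poll and my_fetch_rx_skb are made up for the
 * example only.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct my_priv *priv = container_of(napi, struct my_priv, napi);
 *		struct sk_buff *skb;
 *		int done = 0;
 *
 *		while (done < budget && (skb = my_fetch_rx_skb(priv)) != NULL) {
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			napi_gro_receive(napi, skb);
 *			done++;
 *		}
 *		return done;
 *	}
 */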
3859
stephen hemmingerd0c2b0d2010-10-19 07:12:10 +00003860static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003861{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003862 __skb_pull(skb, skb_headlen(skb));
Eric Dumazet2a2a4592012-03-21 06:58:03 +00003863 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3864 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
Jesse Gross3701e512010-10-20 13:56:06 +00003865 skb->vlan_tci = 0;
Herbert Xu66c46d72011-01-29 20:44:54 -08003866 skb->dev = napi->dev;
Andy Gospodarek6d152e22011-02-02 14:53:25 -08003867 skb->skb_iif = 0;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003868
3869 napi->skb = skb;
3870}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003871
Herbert Xu76620aa2009-04-16 02:02:07 -07003872struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003873{
Herbert Xu5d38a072009-01-04 16:13:40 -08003874 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003875
3876 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003877 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3878 if (skb)
3879 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003880 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003881 return skb;
3882}
Herbert Xu76620aa2009-04-16 02:02:07 -07003883EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003884
Rami Rosenbb728822012-11-28 21:55:25 +00003885static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003886 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003887{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003888 switch (ret) {
3889 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003890 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003891 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003892
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003893 if (ret == GRO_HELD)
3894 skb_gro_pull(skb, -ETH_HLEN);
3895 else if (netif_receive_skb(skb))
3896 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003897 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003898
3899 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003900 case GRO_MERGED_FREE:
3901 napi_reuse_skb(napi, skb);
3902 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003903
3904 case GRO_MERGED:
3905 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003906 }
3907
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003908 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003909}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003910
Eric Dumazet4adb9c42012-05-18 20:49:06 +00003911static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003912{
Herbert Xu76620aa2009-04-16 02:02:07 -07003913 struct sk_buff *skb = napi->skb;
3914 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00003915 unsigned int hlen;
3916 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07003917
3918 napi->skb = NULL;
3919
3920 skb_reset_mac_header(skb);
3921 skb_gro_reset_offset(skb);
3922
Herbert Xua5b1cf22009-05-26 18:50:28 +00003923 off = skb_gro_offset(skb);
3924 hlen = off + sizeof(*eth);
3925 eth = skb_gro_header_fast(skb, off);
3926 if (skb_gro_header_hard(skb, hlen)) {
3927 eth = skb_gro_header_slow(skb, hlen, off);
3928 if (unlikely(!eth)) {
3929 napi_reuse_skb(napi, skb);
3930 skb = NULL;
3931 goto out;
3932 }
Herbert Xu76620aa2009-04-16 02:02:07 -07003933 }
3934
3935 skb_gro_pull(skb, sizeof(*eth));
3936
3937 /*
3938 * This works because the only protocols we care about don't require
3939 * special handling. We'll fix it up properly at the end.
3940 */
3941 skb->protocol = eth->h_proto;
3942
3943out:
3944 return skb;
3945}
Herbert Xu76620aa2009-04-16 02:02:07 -07003946
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003947gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07003948{
3949 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003950
3951 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003952 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003953
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003954 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08003955}
3956EXPORT_SYMBOL(napi_gro_frags);
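/*
 * Illustrative sketch (editor's addition): drivers that receive directly
 * into pages rather than into a linear buffer can use the napi_get_frags()/
 * napi_gro_frags() pair; napi_frags_skb() above then pulls and parses the
 * Ethernet header, so the driver only attaches its pages.  my_rx_page and
 * my_rx_len are assumed names for the example.
 *
 *	struct sk_buff *skb = napi_get_frags(napi);
 *
 *	if (!skb)
 *		return;		// allocation failed, drop this descriptor
 *	skb_fill_page_desc(skb, 0, my_rx_page, 0, my_rx_len);
 *	skb->len += my_rx_len;
 *	skb->data_len += my_rx_len;
 *	skb->truesize += my_rx_len;
 *	napi_gro_frags(napi);
 */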
3957
Eric Dumazete326bed2010-04-22 00:22:45 -07003958/*
3959 * net_rps_action sends any pending IPIs for RPS.
3960 * Note: called with local irq disabled, but exits with local irq enabled.
3961 */
3962static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3963{
3964#ifdef CONFIG_RPS
3965 struct softnet_data *remsd = sd->rps_ipi_list;
3966
3967 if (remsd) {
3968 sd->rps_ipi_list = NULL;
3969
3970 local_irq_enable();
3971
3972		/* Send pending IPIs to kick RPS processing on remote cpus. */
3973 while (remsd) {
3974 struct softnet_data *next = remsd->rps_ipi_next;
3975
3976 if (cpu_online(remsd->cpu))
3977 __smp_call_function_single(remsd->cpu,
3978 &remsd->csd, 0);
3979 remsd = next;
3980 }
3981 } else
3982#endif
3983 local_irq_enable();
3984}
3985
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003986static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003987{
3988 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003989 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003990
Eric Dumazete326bed2010-04-22 00:22:45 -07003991#ifdef CONFIG_RPS
3992	/* Check if we have pending IPIs; it's better to send them now
3993	 * than to wait for net_rx_action() to end.
3994 */
3995 if (sd->rps_ipi_list) {
3996 local_irq_disable();
3997 net_rps_action_and_irq_enable(sd);
3998 }
3999#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004000 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07004001 local_irq_disable();
4002 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004003 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07004004 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004005
Changli Gao6e7676c2010-04-27 15:07:33 -07004006 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07004007 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07004008 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07004009 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00004010 input_queue_head_incr(sd);
4011 if (++work >= quota) {
4012 local_irq_enable();
4013 return work;
4014 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004015 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004016
Changli Gao6e7676c2010-04-27 15:07:33 -07004017 rps_lock(sd);
4018 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004019 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07004020 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4021 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004022
Changli Gao6e7676c2010-04-27 15:07:33 -07004023 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004024 /*
4025 * Inline a custom version of __napi_complete().
4026			 * Only the current cpu owns and manipulates this napi,
4027			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
4028			 * so we can use a plain write instead of clear_bit()
4029			 * and we don't need an smp_mb() memory barrier.
4030 */
4031 list_del(&napi->poll_list);
4032 napi->state = 0;
4033
Changli Gao6e7676c2010-04-27 15:07:33 -07004034 quota = work + qlen;
4035 }
4036 rps_unlock(sd);
4037 }
4038 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004039
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004040 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004041}
4042
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004043/**
4044 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004045 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004046 *
4047 * The entry's receive function will be scheduled to run
4048 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08004049void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004050{
4051 unsigned long flags;
4052
4053 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004054 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004055 local_irq_restore(flags);
4056}
4057EXPORT_SYMBOL(__napi_schedule);
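/*
 * Illustrative sketch (editor's addition): __napi_schedule() is normally
 * reached through the napi_schedule() wrapper, which only queues the
 * instance after napi_schedule_prep() wins the NAPI_STATE_SCHED bit.  A
 * hypothetical interrupt handler (my_irq_handler, my_priv and
 * my_disable_rx_irq are made-up names) would look roughly like this:
 *
 *	static irqreturn_t my_irq_handler(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		my_disable_rx_irq(priv);	// assumed helper
 *		napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 */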
4058
Herbert Xud565b0a2008-12-15 23:38:52 -08004059void __napi_complete(struct napi_struct *n)
4060{
4061 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4062 BUG_ON(n->gro_list);
4063
4064 list_del(&n->poll_list);
4065 smp_mb__before_clear_bit();
4066 clear_bit(NAPI_STATE_SCHED, &n->state);
4067}
4068EXPORT_SYMBOL(__napi_complete);
4069
4070void napi_complete(struct napi_struct *n)
4071{
4072 unsigned long flags;
4073
4074 /*
4075 * don't let napi dequeue from the cpu poll list
4076 * just in case its running on a different cpu
4077 */
4078 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4079 return;
4080
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004081 napi_gro_flush(n, false);
Herbert Xud565b0a2008-12-15 23:38:52 -08004082 local_irq_save(flags);
4083 __napi_complete(n);
4084 local_irq_restore(flags);
4085}
4086EXPORT_SYMBOL(napi_complete);
4087
4088void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4089 int (*poll)(struct napi_struct *, int), int weight)
4090{
4091 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00004092 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004093 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08004094 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08004095 napi->poll = poll;
4096 napi->weight = weight;
4097 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08004098 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08004099#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08004100 spin_lock_init(&napi->poll_lock);
4101 napi->poll_owner = -1;
4102#endif
4103 set_bit(NAPI_STATE_SCHED, &napi->state);
4104}
4105EXPORT_SYMBOL(netif_napi_add);
4106
4107void netif_napi_del(struct napi_struct *napi)
4108{
4109 struct sk_buff *skb, *next;
4110
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08004111 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07004112 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08004113
4114 for (skb = napi->gro_list; skb; skb = next) {
4115 next = skb->next;
4116 skb->next = NULL;
4117 kfree_skb(skb);
4118 }
4119
4120 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00004121 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004122}
4123EXPORT_SYMBOL(netif_napi_del);
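/*
 * Illustrative sketch (editor's addition): the usual driver life cycle for
 * the API above is netif_napi_add() at probe time, napi_enable() in
 * ndo_open, napi_disable() in ndo_stop and netif_napi_del() before the
 * netdev is freed.  The poll routine calls napi_complete() once it has done
 * less work than its budget.  my_priv, my_poll, my_clean_rx and
 * my_enable_rx_irq are made-up names; a weight of 64 is the common default.
 *
 *	netif_napi_add(dev, &priv->napi, my_poll, 64);	// probe
 *	napi_enable(&priv->napi);			// ndo_open
 *	...
 *	napi_disable(&priv->napi);			// ndo_stop
 *	netif_napi_del(&priv->napi);			// remove
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int done = my_clean_rx(napi, budget);	// assumed helper
 *
 *		if (done < budget) {
 *			napi_complete(napi);
 *			my_enable_rx_irq(napi);		// assumed helper
 *		}
 *		return done;
 *	}
 */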
4124
Linus Torvalds1da177e2005-04-16 15:20:36 -07004125static void net_rx_action(struct softirq_action *h)
4126{
Eric Dumazete326bed2010-04-22 00:22:45 -07004127 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004128 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07004129 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07004130 void *have;
4131
Linus Torvalds1da177e2005-04-16 15:20:36 -07004132 local_irq_disable();
4133
Eric Dumazete326bed2010-04-22 00:22:45 -07004134 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004135 struct napi_struct *n;
4136 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004137
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004138		/* If the softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004139		 * Allow this to run for 2 jiffies, which allows
4140		 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004141 */
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004142 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004143 goto softnet_break;
4144
4145 local_irq_enable();
4146
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004147 /* Even though interrupts have been re-enabled, this
4148 * access is safe because interrupts can only add new
4149 * entries to the tail of this list, and only ->poll()
4150 * calls can remove this head entry from the list.
4151 */
Eric Dumazete326bed2010-04-22 00:22:45 -07004152 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004153
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004154 have = netpoll_poll_lock(n);
4155
4156 weight = n->weight;
4157
David S. Miller0a7606c2007-10-29 21:28:47 -07004158 /* This NAPI_STATE_SCHED test is for avoiding a race
4159 * with netpoll's poll_napi(). Only the entity which
4160 * obtains the lock and sees NAPI_STATE_SCHED set will
4161 * actually make the ->poll() call. Therefore we avoid
Lucas De Marchi25985ed2011-03-30 22:57:33 -03004162 * accidentally calling ->poll() when NAPI is not scheduled.
David S. Miller0a7606c2007-10-29 21:28:47 -07004163 */
4164 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00004165 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07004166 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00004167 trace_napi_poll(n);
4168 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004169
4170 WARN_ON_ONCE(work > weight);
4171
4172 budget -= work;
4173
4174 local_irq_disable();
4175
4176 /* Drivers must not modify the NAPI state if they
4177 * consume the entire weight. In such cases this code
4178 * still "owns" the NAPI instance and therefore can
4179 * move the instance around on the list at-will.
4180 */
David S. Millerfed17f32008-01-07 21:00:40 -08004181 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07004182 if (unlikely(napi_disable_pending(n))) {
4183 local_irq_enable();
4184 napi_complete(n);
4185 local_irq_disable();
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004186 } else {
4187 if (n->gro_list) {
4188				if (n->gro_list) {
4189					/* Flush packets that are too old.
4190					 * If HZ < 1000, flush all packets.
4190 */
4191 local_irq_enable();
4192 napi_gro_flush(n, HZ >= 1000);
4193 local_irq_disable();
4194 }
Eric Dumazete326bed2010-04-22 00:22:45 -07004195 list_move_tail(&n->poll_list, &sd->poll_list);
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004196 }
David S. Millerfed17f32008-01-07 21:00:40 -08004197 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004198
4199 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004200 }
4201out:
Eric Dumazete326bed2010-04-22 00:22:45 -07004202 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00004203
Chris Leechdb217332006-06-17 21:24:58 -07004204#ifdef CONFIG_NET_DMA
4205 /*
4206 * There may not be any more sk_buffs coming right now, so push
4207 * any pending DMA copies to hardware
4208 */
Dan Williams2ba05622009-01-06 11:38:14 -07004209 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07004210#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004211
Linus Torvalds1da177e2005-04-16 15:20:36 -07004212 return;
4213
4214softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00004215 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004216 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4217 goto out;
4218}
4219
Linus Torvalds1da177e2005-04-16 15:20:36 -07004220#ifdef CONFIG_PROC_FS
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004221
Eric Dumazet2def16a2012-04-02 22:33:02 +00004222#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004223
4224#define get_bucket(x) ((x) >> BUCKET_SPACE)
4225#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4226#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4227
Eric Dumazet2def16a2012-04-02 22:33:02 +00004228static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004229{
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004230 struct net *net = seq_file_net(seq);
4231 struct net_device *dev;
4232 struct hlist_node *p;
4233 struct hlist_head *h;
Eric Dumazet2def16a2012-04-02 22:33:02 +00004234 unsigned int count = 0, offset = get_offset(*pos);
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004235
Eric Dumazet2def16a2012-04-02 22:33:02 +00004236 h = &net->dev_name_head[get_bucket(*pos)];
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004237 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
Eric Dumazet2def16a2012-04-02 22:33:02 +00004238 if (++count == offset)
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004239 return dev;
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004240 }
4241
4242 return NULL;
4243}
4244
Eric Dumazet2def16a2012-04-02 22:33:02 +00004245static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004246{
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004247 struct net_device *dev;
4248 unsigned int bucket;
4249
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004250 do {
Eric Dumazet2def16a2012-04-02 22:33:02 +00004251 dev = dev_from_same_bucket(seq, pos);
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004252 if (dev)
4253 return dev;
4254
Eric Dumazet2def16a2012-04-02 22:33:02 +00004255 bucket = get_bucket(*pos) + 1;
4256 *pos = set_bucket_offset(bucket, 1);
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004257 } while (bucket < NETDEV_HASHENTRIES);
4258
4259 return NULL;
4260}
4261
Linus Torvalds1da177e2005-04-16 15:20:36 -07004262/*
4263 * This is invoked by the /proc filesystem handler to display a device
4264 * in detail.
4265 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004266void *dev_seq_start(struct seq_file *seq, loff_t *pos)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08004267 __acquires(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004268{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08004269 rcu_read_lock();
Pavel Emelianov7562f872007-05-03 15:13:45 -07004270 if (!*pos)
4271 return SEQ_START_TOKEN;
4272
Eric Dumazet2def16a2012-04-02 22:33:02 +00004273 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
Mihai Maruseacf04565d2011-10-20 20:45:10 +00004274 return NULL;
Pavel Emelianov7562f872007-05-03 15:13:45 -07004275
Eric Dumazet2def16a2012-04-02 22:33:02 +00004276 return dev_from_bucket(seq, pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004277}
4278
4279void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4280{
4281 ++*pos;
Eric Dumazet2def16a2012-04-02 22:33:02 +00004282 return dev_from_bucket(seq, pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004283}
4284
4285void dev_seq_stop(struct seq_file *seq, void *v)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08004286 __releases(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004287{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08004288 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004289}
4290
4291static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4292{
Eric Dumazet28172732010-07-07 14:58:56 -07004293 struct rtnl_link_stats64 temp;
4294 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004295
Ben Hutchingsbe1f3c22010-06-08 07:19:54 +00004296 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4297 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
Rusty Russell5a1b5892007-04-28 21:04:03 -07004298 dev->name, stats->rx_bytes, stats->rx_packets,
4299 stats->rx_errors,
4300 stats->rx_dropped + stats->rx_missed_errors,
4301 stats->rx_fifo_errors,
4302 stats->rx_length_errors + stats->rx_over_errors +
4303 stats->rx_crc_errors + stats->rx_frame_errors,
4304 stats->rx_compressed, stats->multicast,
4305 stats->tx_bytes, stats->tx_packets,
4306 stats->tx_errors, stats->tx_dropped,
4307 stats->tx_fifo_errors, stats->collisions,
4308 stats->tx_carrier_errors +
4309 stats->tx_aborted_errors +
4310 stats->tx_window_errors +
4311 stats->tx_heartbeat_errors,
4312 stats->tx_compressed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004313}
4314
4315/*
4316 * Called from the PROCfs module. This now uses the new arbitrary sized
4317 * /proc/net interface to create /proc/net/dev
4318 */
4319static int dev_seq_show(struct seq_file *seq, void *v)
4320{
4321 if (v == SEQ_START_TOKEN)
4322 seq_puts(seq, "Inter-| Receive "
4323 " | Transmit\n"
4324 " face |bytes packets errs drop fifo frame "
4325 "compressed multicast|bytes packets errs "
4326 "drop fifo colls carrier compressed\n");
4327 else
4328 dev_seq_printf_stats(seq, v);
4329 return 0;
4330}
4331
Changli Gaodee42872010-05-02 05:42:16 +00004332static struct softnet_data *softnet_get_online(loff_t *pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004333{
Changli Gaodee42872010-05-02 05:42:16 +00004334 struct softnet_data *sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004335
Mike Travis0c0b0ac2008-05-02 16:43:08 -07004336 while (*pos < nr_cpu_ids)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004337 if (cpu_online(*pos)) {
Changli Gaodee42872010-05-02 05:42:16 +00004338 sd = &per_cpu(softnet_data, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004339 break;
4340 } else
4341 ++*pos;
Changli Gaodee42872010-05-02 05:42:16 +00004342 return sd;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004343}
4344
4345static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4346{
4347 return softnet_get_online(pos);
4348}
4349
4350static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4351{
4352 ++*pos;
4353 return softnet_get_online(pos);
4354}
4355
4356static void softnet_seq_stop(struct seq_file *seq, void *v)
4357{
4358}
4359
4360static int softnet_seq_show(struct seq_file *seq, void *v)
4361{
Changli Gaodee42872010-05-02 05:42:16 +00004362 struct softnet_data *sd = v;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004363
Tom Herbert0a9627f2010-03-16 08:03:29 +00004364 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
Changli Gaodee42872010-05-02 05:42:16 +00004365 sd->processed, sd->dropped, sd->time_squeeze, 0,
Stephen Hemmingerc1ebcdb2005-06-23 20:08:59 -07004366 0, 0, 0, 0, /* was fastroute */
Changli Gaodee42872010-05-02 05:42:16 +00004367 sd->cpu_collision, sd->received_rps);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004368 return 0;
4369}
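/*
 * Editor's note: each line of /proc/net/softnet_stat corresponds to one
 * online cpu; the columns printed above are, in order, processed, dropped
 * and time_squeeze, then five zeroes kept for fields that no longer exist
 * (the old fastroute statistics), then cpu_collision and received_rps, all
 * in hexadecimal.
 */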
4370
Stephen Hemmingerf6908082007-03-12 14:34:29 -07004371static const struct seq_operations dev_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004372 .start = dev_seq_start,
4373 .next = dev_seq_next,
4374 .stop = dev_seq_stop,
4375 .show = dev_seq_show,
4376};
4377
4378static int dev_seq_open(struct inode *inode, struct file *file)
4379{
Denis V. Luneve372c412007-11-19 22:31:54 -08004380 return seq_open_net(inode, file, &dev_seq_ops,
Eric Dumazet2def16a2012-04-02 22:33:02 +00004381 sizeof(struct seq_net_private));
Anton Blanchard5cac98d2011-11-27 21:14:46 +00004382}
4383
Arjan van de Ven9a321442007-02-12 00:55:35 -08004384static const struct file_operations dev_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004385 .owner = THIS_MODULE,
4386 .open = dev_seq_open,
4387 .read = seq_read,
4388 .llseek = seq_lseek,
Denis V. Luneve372c412007-11-19 22:31:54 -08004389 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07004390};
4391
Stephen Hemmingerf6908082007-03-12 14:34:29 -07004392static const struct seq_operations softnet_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004393 .start = softnet_seq_start,
4394 .next = softnet_seq_next,
4395 .stop = softnet_seq_stop,
4396 .show = softnet_seq_show,
4397};
4398
4399static int softnet_seq_open(struct inode *inode, struct file *file)
4400{
4401 return seq_open(file, &softnet_seq_ops);
4402}
4403
Arjan van de Ven9a321442007-02-12 00:55:35 -08004404static const struct file_operations softnet_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004405 .owner = THIS_MODULE,
4406 .open = softnet_seq_open,
4407 .read = seq_read,
4408 .llseek = seq_lseek,
4409 .release = seq_release,
4410};
4411
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004412static void *ptype_get_idx(loff_t pos)
4413{
4414 struct packet_type *pt = NULL;
4415 loff_t i = 0;
4416 int t;
4417
4418 list_for_each_entry_rcu(pt, &ptype_all, list) {
4419 if (i == pos)
4420 return pt;
4421 ++i;
4422 }
4423
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08004424 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004425 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4426 if (i == pos)
4427 return pt;
4428 ++i;
4429 }
4430 }
4431 return NULL;
4432}
4433
4434static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
Stephen Hemminger72348a42008-01-21 02:27:29 -08004435 __acquires(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004436{
4437 rcu_read_lock();
4438 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4439}
4440
4441static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4442{
4443 struct packet_type *pt;
4444 struct list_head *nxt;
4445 int hash;
4446
4447 ++*pos;
4448 if (v == SEQ_START_TOKEN)
4449 return ptype_get_idx(0);
4450
4451 pt = v;
4452 nxt = pt->list.next;
4453 if (pt->type == htons(ETH_P_ALL)) {
4454 if (nxt != &ptype_all)
4455 goto found;
4456 hash = 0;
4457 nxt = ptype_base[0].next;
4458 } else
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08004459 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004460
4461 while (nxt == &ptype_base[hash]) {
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08004462 if (++hash >= PTYPE_HASH_SIZE)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004463 return NULL;
4464 nxt = ptype_base[hash].next;
4465 }
4466found:
4467 return list_entry(nxt, struct packet_type, list);
4468}
4469
4470static void ptype_seq_stop(struct seq_file *seq, void *v)
Stephen Hemminger72348a42008-01-21 02:27:29 -08004471 __releases(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004472{
4473 rcu_read_unlock();
4474}
4475
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004476static int ptype_seq_show(struct seq_file *seq, void *v)
4477{
4478 struct packet_type *pt = v;
4479
4480 if (v == SEQ_START_TOKEN)
4481 seq_puts(seq, "Type Device Function\n");
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09004482 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004483 if (pt->type == htons(ETH_P_ALL))
4484 seq_puts(seq, "ALL ");
4485 else
4486 seq_printf(seq, "%04x", ntohs(pt->type));
4487
Alexey Dobriyan908cd2d2008-11-16 19:50:35 -08004488 seq_printf(seq, " %-8s %pF\n",
4489 pt->dev ? pt->dev->name : "", pt->func);
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004490 }
4491
4492 return 0;
4493}
4494
4495static const struct seq_operations ptype_seq_ops = {
4496 .start = ptype_seq_start,
4497 .next = ptype_seq_next,
4498 .stop = ptype_seq_stop,
4499 .show = ptype_seq_show,
4500};
4501
4502static int ptype_seq_open(struct inode *inode, struct file *file)
4503{
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07004504 return seq_open_net(inode, file, &ptype_seq_ops,
4505 sizeof(struct seq_net_private));
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004506}
4507
4508static const struct file_operations ptype_seq_fops = {
4509 .owner = THIS_MODULE,
4510 .open = ptype_seq_open,
4511 .read = seq_read,
4512 .llseek = seq_lseek,
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07004513 .release = seq_release_net,
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004514};
4515
4516
Pavel Emelyanov46650792007-10-08 20:38:39 -07004517static int __net_init dev_proc_net_init(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004518{
4519 int rc = -ENOMEM;
4520
Gao fengd4beaa62013-02-18 01:34:54 +00004521 if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004522 goto out;
Gao fengd4beaa62013-02-18 01:34:54 +00004523 if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
4524 &softnet_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004525 goto out_dev;
Gao fengd4beaa62013-02-18 01:34:54 +00004526 if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02004527 goto out_softnet;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004528
Eric W. Biederman881d9662007-09-17 11:56:21 -07004529 if (wext_proc_init(net))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02004530 goto out_ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004531 rc = 0;
4532out:
4533 return rc;
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02004534out_ptype:
Gao fengece31ff2013-02-18 01:34:56 +00004535 remove_proc_entry("ptype", net->proc_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004536out_softnet:
Gao fengece31ff2013-02-18 01:34:56 +00004537 remove_proc_entry("softnet_stat", net->proc_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004538out_dev:
Gao fengece31ff2013-02-18 01:34:56 +00004539 remove_proc_entry("dev", net->proc_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004540 goto out;
4541}
Eric W. Biederman881d9662007-09-17 11:56:21 -07004542
Pavel Emelyanov46650792007-10-08 20:38:39 -07004543static void __net_exit dev_proc_net_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07004544{
4545 wext_proc_exit(net);
4546
Gao fengece31ff2013-02-18 01:34:56 +00004547 remove_proc_entry("ptype", net->proc_net);
4548 remove_proc_entry("softnet_stat", net->proc_net);
4549 remove_proc_entry("dev", net->proc_net);
Eric W. Biederman881d9662007-09-17 11:56:21 -07004550}
4551
Denis V. Lunev022cbae2007-11-13 03:23:50 -08004552static struct pernet_operations __net_initdata dev_proc_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07004553 .init = dev_proc_net_init,
4554 .exit = dev_proc_net_exit,
4555};
4556
4557static int __init dev_proc_init(void)
4558{
4559 return register_pernet_subsys(&dev_proc_ops);
4560}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004561#else
4562#define dev_proc_init() 0
4563#endif /* CONFIG_PROC_FS */
4564
4565
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004566struct netdev_upper {
4567 struct net_device *dev;
4568 bool master;
4569 struct list_head list;
4570 struct rcu_head rcu;
4571 struct list_head search_list;
4572};
4573
4574static void __append_search_uppers(struct list_head *search_list,
4575 struct net_device *dev)
4576{
4577 struct netdev_upper *upper;
4578
4579 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4580 /* check if this upper is not already in search list */
4581 if (list_empty(&upper->search_list))
4582 list_add_tail(&upper->search_list, search_list);
4583 }
4584}
4585
4586static bool __netdev_search_upper_dev(struct net_device *dev,
4587 struct net_device *upper_dev)
4588{
4589 LIST_HEAD(search_list);
4590 struct netdev_upper *upper;
4591 struct netdev_upper *tmp;
4592 bool ret = false;
4593
4594 __append_search_uppers(&search_list, dev);
4595 list_for_each_entry(upper, &search_list, search_list) {
4596 if (upper->dev == upper_dev) {
4597 ret = true;
4598 break;
4599 }
4600 __append_search_uppers(&search_list, upper->dev);
4601 }
4602 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4603 INIT_LIST_HEAD(&upper->search_list);
4604 return ret;
4605}
4606
4607static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4608 struct net_device *upper_dev)
4609{
4610 struct netdev_upper *upper;
4611
4612 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4613 if (upper->dev == upper_dev)
4614 return upper;
4615 }
4616 return NULL;
4617}
4618
4619/**
4620 * netdev_has_upper_dev - Check if device is linked to an upper device
4621 * @dev: device
4622 * @upper_dev: upper device to check
4623 *
4624 * Find out if a device is linked to the specified upper device and return true
4625 * if it is. Note that this checks only the immediate upper device,
4626 * not the complete stack of devices. The caller must hold the RTNL lock.
4627 */
4628bool netdev_has_upper_dev(struct net_device *dev,
4629 struct net_device *upper_dev)
4630{
4631 ASSERT_RTNL();
4632
4633 return __netdev_find_upper(dev, upper_dev);
4634}
4635EXPORT_SYMBOL(netdev_has_upper_dev);
4636
4637/**
4638 * netdev_has_any_upper_dev - Check if device is linked to some device
4639 * @dev: device
4640 *
4641 * Find out if a device is linked to an upper device and return true in case
4642 * it is. The caller must hold the RTNL lock.
4643 */
4644bool netdev_has_any_upper_dev(struct net_device *dev)
4645{
4646 ASSERT_RTNL();
4647
4648 return !list_empty(&dev->upper_dev_list);
4649}
4650EXPORT_SYMBOL(netdev_has_any_upper_dev);
4651
4652/**
4653 * netdev_master_upper_dev_get - Get master upper device
4654 * @dev: device
4655 *
4656 * Find a master upper device and return a pointer to it, or NULL if
4657 * there is none. The caller must hold the RTNL lock.
4658 */
4659struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4660{
4661 struct netdev_upper *upper;
4662
4663 ASSERT_RTNL();
4664
4665 if (list_empty(&dev->upper_dev_list))
4666 return NULL;
4667
4668 upper = list_first_entry(&dev->upper_dev_list,
4669 struct netdev_upper, list);
4670 if (likely(upper->master))
4671 return upper->dev;
4672 return NULL;
4673}
4674EXPORT_SYMBOL(netdev_master_upper_dev_get);
4675
4676/**
4677 * netdev_master_upper_dev_get_rcu - Get master upper device
4678 * @dev: device
4679 *
4680 * Find a master upper device and return a pointer to it, or NULL if
4681 * there is none. The caller must hold the RCU read lock.
4682 */
4683struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4684{
4685 struct netdev_upper *upper;
4686
4687 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4688 struct netdev_upper, list);
4689 if (upper && likely(upper->master))
4690 return upper->dev;
4691 return NULL;
4692}
4693EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4694
4695static int __netdev_upper_dev_link(struct net_device *dev,
4696 struct net_device *upper_dev, bool master)
4697{
4698 struct netdev_upper *upper;
4699
4700 ASSERT_RTNL();
4701
4702 if (dev == upper_dev)
4703 return -EBUSY;
4704
4705	/* To prevent loops, check that dev is not already an upper device of upper_dev. */
4706 if (__netdev_search_upper_dev(upper_dev, dev))
4707 return -EBUSY;
4708
4709 if (__netdev_find_upper(dev, upper_dev))
4710 return -EEXIST;
4711
4712 if (master && netdev_master_upper_dev_get(dev))
4713 return -EBUSY;
4714
4715 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4716 if (!upper)
4717 return -ENOMEM;
4718
4719 upper->dev = upper_dev;
4720 upper->master = master;
4721 INIT_LIST_HEAD(&upper->search_list);
4722
4723 /* Ensure that master upper link is always the first item in list. */
4724 if (master)
4725 list_add_rcu(&upper->list, &dev->upper_dev_list);
4726 else
4727 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4728 dev_hold(upper_dev);
4729
4730 return 0;
4731}
4732
4733/**
4734 * netdev_upper_dev_link - Add a link to the upper device
4735 * @dev: device
4736 * @upper_dev: new upper device
4737 *
4738 * Adds a link to a device which is upper to this one. The caller must hold
4739 * the RTNL lock. On a failure a negative errno code is returned.
4740 * On success the reference counts are adjusted and the function
4741 * returns zero.
4742 */
4743int netdev_upper_dev_link(struct net_device *dev,
4744 struct net_device *upper_dev)
4745{
4746 return __netdev_upper_dev_link(dev, upper_dev, false);
4747}
4748EXPORT_SYMBOL(netdev_upper_dev_link);
4749
4750/**
4751 * netdev_master_upper_dev_link - Add a master link to the upper device
4752 * @dev: device
4753 * @upper_dev: new upper device
4754 *
4755 * Adds a link to a device which is upper to this one. In this case, only
4756 * one master upper device can be linked, although other non-master devices
4757 * might be linked as well. The caller must hold the RTNL lock.
4758 * On a failure a negative errno code is returned. On success the reference
4759 * counts are adjusted and the function returns zero.
4760 */
4761int netdev_master_upper_dev_link(struct net_device *dev,
4762 struct net_device *upper_dev)
4763{
4764 return __netdev_upper_dev_link(dev, upper_dev, true);
4765}
4766EXPORT_SYMBOL(netdev_master_upper_dev_link);
4767
4768/**
4769 * netdev_upper_dev_unlink - Removes a link to upper device
4770 * @dev: device
4771 * @upper_dev: upper device to unlink
4772 *
4773 * Removes a link to a device which is upper to this one. The caller must hold
4774 * the RTNL lock.
4775 */
4776void netdev_upper_dev_unlink(struct net_device *dev,
4777 struct net_device *upper_dev)
4778{
4779 struct netdev_upper *upper;
4780
4781 ASSERT_RTNL();
4782
4783 upper = __netdev_find_upper(dev, upper_dev);
4784 if (!upper)
4785 return;
4786 list_del_rcu(&upper->list);
4787 dev_put(upper_dev);
4788 kfree_rcu(upper, rcu);
4789}
4790EXPORT_SYMBOL(netdev_upper_dev_unlink);
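/*
 * Illustrative sketch (editor's addition): a stacking driver such as a
 * bonding- or bridge-like device would typically tie a port to its master
 * with the calls above while holding RTNL; my_setup_port() is an assumed
 * helper and the error handling is shortened for the example.
 *
 *	rtnl_lock();
 *	err = netdev_master_upper_dev_link(port_dev, master_dev);
 *	if (!err) {
 *		err = my_setup_port(master_dev, port_dev);	// assumed helper
 *		if (err)
 *			netdev_upper_dev_unlink(port_dev, master_dev);
 *	}
 *	rtnl_unlock();
 */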
4791
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004792static void dev_change_rx_flags(struct net_device *dev, int flags)
4793{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004794 const struct net_device_ops *ops = dev->netdev_ops;
4795
4796 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4797 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004798}
4799
Wang Chendad9b332008-06-18 01:48:28 -07004800static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07004801{
Eric Dumazetb536db92011-11-30 21:42:26 +00004802 unsigned int old_flags = dev->flags;
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06004803 kuid_t uid;
4804 kgid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07004805
Patrick McHardy24023452007-07-14 18:51:31 -07004806 ASSERT_RTNL();
4807
Wang Chendad9b332008-06-18 01:48:28 -07004808 dev->flags |= IFF_PROMISC;
4809 dev->promiscuity += inc;
4810 if (dev->promiscuity == 0) {
4811 /*
4812 * Avoid overflow.
4813		 * If inc causes an overflow, leave promiscuity untouched and return an error.
4814 */
4815 if (inc < 0)
4816 dev->flags &= ~IFF_PROMISC;
4817 else {
4818 dev->promiscuity -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004819 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4820 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07004821 return -EOVERFLOW;
4822 }
4823 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004824 if (dev->flags != old_flags) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004825 pr_info("device %s %s promiscuous mode\n",
4826 dev->name,
4827 dev->flags & IFF_PROMISC ? "entered" : "left");
David Howells8192b0c2008-11-14 10:39:10 +11004828 if (audit_enabled) {
4829 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004830 audit_log(current->audit_context, GFP_ATOMIC,
4831 AUDIT_ANOM_PROMISCUOUS,
4832 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4833 dev->name, (dev->flags & IFF_PROMISC),
4834 (old_flags & IFF_PROMISC),
Eric W. Biedermane1760bd2012-09-10 22:39:43 -07004835 from_kuid(&init_user_ns, audit_get_loginuid(current)),
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06004836 from_kuid(&init_user_ns, uid),
4837 from_kgid(&init_user_ns, gid),
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004838 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11004839 }
Patrick McHardy24023452007-07-14 18:51:31 -07004840
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004841 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07004842 }
Wang Chendad9b332008-06-18 01:48:28 -07004843 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004844}
4845
Linus Torvalds1da177e2005-04-16 15:20:36 -07004846/**
4847 * dev_set_promiscuity - update promiscuity count on a device
4848 * @dev: device
4849 * @inc: modifier
4850 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07004851 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004852 * remains above zero the interface remains promiscuous. Once it hits zero
4853 * the device reverts back to normal filtering operation. A negative inc
4854 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07004855 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004856 */
Wang Chendad9b332008-06-18 01:48:28 -07004857int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004858{
Eric Dumazetb536db92011-11-30 21:42:26 +00004859 unsigned int old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07004860 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004861
Wang Chendad9b332008-06-18 01:48:28 -07004862 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07004863 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07004864 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07004865 if (dev->flags != old_flags)
4866 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07004867 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004868}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004869EXPORT_SYMBOL(dev_set_promiscuity);
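/*
 * Illustrative sketch (editor's addition): packet-capture style users bump
 * the promiscuity count while they need to see all traffic and drop it
 * again when done, always under RTNL.  Error handling is shortened for the
 * example.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// enter promiscuous mode
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		// leave it again
 *	rtnl_unlock();
 */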
Linus Torvalds1da177e2005-04-16 15:20:36 -07004870
4871/**
4872 * dev_set_allmulti - update allmulti count on a device
4873 * @dev: device
4874 * @inc: modifier
4875 *
4876 * Add or remove reception of all multicast frames on a device. While the
4877 * count in the device remains above zero the interface keeps listening
4878 * to all multicast addresses. Once it hits zero the device reverts to normal
4879 * filtering operation. A negative @inc value is used to drop the counter
4880 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07004881 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004882 */
4883
Wang Chendad9b332008-06-18 01:48:28 -07004884int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004885{
Eric Dumazetb536db92011-11-30 21:42:26 +00004886 unsigned int old_flags = dev->flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004887
Patrick McHardy24023452007-07-14 18:51:31 -07004888 ASSERT_RTNL();
4889
Linus Torvalds1da177e2005-04-16 15:20:36 -07004890 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07004891 dev->allmulti += inc;
4892 if (dev->allmulti == 0) {
4893 /*
4894 * Avoid overflow.
4895		 * If inc causes an overflow, leave allmulti untouched and return an error.
4896 */
4897 if (inc < 0)
4898 dev->flags &= ~IFF_ALLMULTI;
4899 else {
4900 dev->allmulti -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004901 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4902 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07004903 return -EOVERFLOW;
4904 }
4905 }
Patrick McHardy24023452007-07-14 18:51:31 -07004906 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004907 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07004908 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07004909 }
Wang Chendad9b332008-06-18 01:48:28 -07004910 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004911}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004912EXPORT_SYMBOL(dev_set_allmulti);
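/*
 * Illustrative sketch (editor's addition): callers that want every
 * multicast frame (multicast routing, for instance) use the same counter
 * pattern as promiscuity above, under RTNL.
 *
 *	if (dev_set_allmulti(dev, 1))		// start receiving all multicast
 *		goto fail;
 *	...
 *	dev_set_allmulti(dev, -1);		// release the reference
 */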
Patrick McHardy4417da62007-06-27 01:28:10 -07004913
4914/*
4915 * Upload unicast and multicast address lists to device and
4916 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08004917 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07004918 * are present.
4919 */
4920void __dev_set_rx_mode(struct net_device *dev)
4921{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004922 const struct net_device_ops *ops = dev->netdev_ops;
4923
Patrick McHardy4417da62007-06-27 01:28:10 -07004924 /* dev_open will call this function so the list will stay sane. */
4925 if (!(dev->flags&IFF_UP))
4926 return;
4927
4928 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09004929 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07004930
Jiri Pirko01789342011-08-16 06:29:00 +00004931 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004932		/* Unicast address changes may only happen under the rtnl,
4933 * therefore calling __dev_set_promiscuity here is safe.
4934 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004935 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004936 __dev_set_promiscuity(dev, 1);
Joe Perches2d348d12011-07-25 16:17:35 -07004937 dev->uc_promisc = true;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004938 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004939 __dev_set_promiscuity(dev, -1);
Joe Perches2d348d12011-07-25 16:17:35 -07004940 dev->uc_promisc = false;
Patrick McHardy4417da62007-06-27 01:28:10 -07004941 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004942 }
Jiri Pirko01789342011-08-16 06:29:00 +00004943
4944 if (ops->ndo_set_rx_mode)
4945 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004946}
4947
4948void dev_set_rx_mode(struct net_device *dev)
4949{
David S. Millerb9e40852008-07-15 00:15:08 -07004950 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004951 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07004952 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004953}
4954
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004955/**
4956 * dev_get_flags - get flags reported to userspace
4957 * @dev: device
4958 *
4959 * Get the combination of flag bits exported through APIs to userspace.
4960 */
Eric Dumazet95c96172012-04-15 05:58:06 +00004961unsigned int dev_get_flags(const struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004962{
Eric Dumazet95c96172012-04-15 05:58:06 +00004963 unsigned int flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004964
4965 flags = (dev->flags & ~(IFF_PROMISC |
4966 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08004967 IFF_RUNNING |
4968 IFF_LOWER_UP |
4969 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07004970 (dev->gflags & (IFF_PROMISC |
4971 IFF_ALLMULTI));
4972
Stefan Rompfb00055a2006-03-20 17:09:11 -08004973 if (netif_running(dev)) {
4974 if (netif_oper_up(dev))
4975 flags |= IFF_RUNNING;
4976 if (netif_carrier_ok(dev))
4977 flags |= IFF_LOWER_UP;
4978 if (netif_dormant(dev))
4979 flags |= IFF_DORMANT;
4980 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004981
4982 return flags;
4983}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004984EXPORT_SYMBOL(dev_get_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004985
Patrick McHardybd380812010-02-26 06:34:53 +00004986int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004987{
Eric Dumazetb536db92011-11-30 21:42:26 +00004988 unsigned int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004989 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004990
Patrick McHardy24023452007-07-14 18:51:31 -07004991 ASSERT_RTNL();
4992
Linus Torvalds1da177e2005-04-16 15:20:36 -07004993 /*
4994 * Set the flags on our device.
4995 */
4996
4997 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4998 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4999 IFF_AUTOMEDIA)) |
5000 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5001 IFF_ALLMULTI));
5002
5003 /*
5004	 * Load in the correct multicast list now that the flags have changed.
5005 */
5006
Patrick McHardyb6c40d62008-10-07 15:26:48 -07005007 if ((old_flags ^ flags) & IFF_MULTICAST)
5008 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07005009
Patrick McHardy4417da62007-06-27 01:28:10 -07005010 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005011
5012 /*
5013	 * Have we downed the interface? We handle IFF_UP ourselves
5014 * according to user attempts to set it, rather than blindly
5015 * setting it.
5016 */
5017
5018 ret = 0;
5019 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00005020 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005021
5022 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07005023 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005024 }
5025
Linus Torvalds1da177e2005-04-16 15:20:36 -07005026 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005027 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5028
Linus Torvalds1da177e2005-04-16 15:20:36 -07005029 dev->gflags ^= IFF_PROMISC;
5030 dev_set_promiscuity(dev, inc);
5031 }
5032
5033 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5034	   is important. Some (broken) drivers set IFF_PROMISC when
5035	   IFF_ALLMULTI is requested, without asking us and without reporting it.
5036 */
5037 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005038 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5039
Linus Torvalds1da177e2005-04-16 15:20:36 -07005040 dev->gflags ^= IFF_ALLMULTI;
5041 dev_set_allmulti(dev, inc);
5042 }
5043
Patrick McHardybd380812010-02-26 06:34:53 +00005044 return ret;
5045}
5046
5047void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
5048{
5049 unsigned int changes = dev->flags ^ old_flags;
5050
5051 if (changes & IFF_UP) {
5052 if (dev->flags & IFF_UP)
5053 call_netdevice_notifiers(NETDEV_UP, dev);
5054 else
5055 call_netdevice_notifiers(NETDEV_DOWN, dev);
5056 }
5057
5058 if (dev->flags & IFF_UP &&
5059 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
5060 call_netdevice_notifiers(NETDEV_CHANGE, dev);
5061}
5062
5063/**
5064 * dev_change_flags - change device settings
5065 * @dev: device
5066 * @flags: device state flags
5067 *
5068 *	Change settings on a device based on the state flags. The flags are
5069 * in the userspace exported format.
5070 */
Eric Dumazetb536db92011-11-30 21:42:26 +00005071int dev_change_flags(struct net_device *dev, unsigned int flags)
Patrick McHardybd380812010-02-26 06:34:53 +00005072{
Eric Dumazetb536db92011-11-30 21:42:26 +00005073 int ret;
5074 unsigned int changes, old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00005075
5076 ret = __dev_change_flags(dev, flags);
5077 if (ret < 0)
5078 return ret;
5079
5080 changes = old_flags ^ dev->flags;
Thomas Graf7c355f52007-06-05 16:03:03 -07005081 if (changes)
5082 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005083
Patrick McHardybd380812010-02-26 06:34:53 +00005084 __dev_notify_flags(dev, old_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005085 return ret;
5086}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005087EXPORT_SYMBOL(dev_change_flags);
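/*
 * Illustrative sketch (editor's addition): an in-kernel caller that wants
 * to force an interface administratively up can set IFF_UP in the
 * userspace-visible flags, which reaches __dev_open() through
 * __dev_change_flags() above.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */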
Linus Torvalds1da177e2005-04-16 15:20:36 -07005088
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005089/**
5090 * dev_set_mtu - Change maximum transfer unit
5091 * @dev: device
5092 * @new_mtu: new transfer unit
5093 *
5094 * Change the maximum transfer size of the network device.
5095 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005096int dev_set_mtu(struct net_device *dev, int new_mtu)
5097{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005098 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005099 int err;
5100
5101 if (new_mtu == dev->mtu)
5102 return 0;
5103
5104	/* MTU must not be negative. */
5105 if (new_mtu < 0)
5106 return -EINVAL;
5107
5108 if (!netif_device_present(dev))
5109 return -ENODEV;
5110
5111 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005112 if (ops->ndo_change_mtu)
5113 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005114 else
5115 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005116
Jiri Pirkoe3d8fab2012-12-03 01:16:32 +00005117 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005118 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005119 return err;
5120}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005121EXPORT_SYMBOL(dev_set_mtu);
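/*
 * Illustrative sketch (editor's addition): callers change the MTU under
 * RTNL; a tunnel driver, for example, might derive its MTU from the lower
 * device's MTU minus its own encapsulation overhead.  The 50-byte overhead
 * here is only an example figure.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, lower_dev->mtu - 50);
 *	rtnl_unlock();
 */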
Linus Torvalds1da177e2005-04-16 15:20:36 -07005122
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005123/**
Vlad Dogarucbda10f2011-01-13 23:38:30 +00005124 * dev_set_group - Change group this device belongs to
5125 * @dev: device
5126 * @new_group: group this device should belong to
5127 */
5128void dev_set_group(struct net_device *dev, int new_group)
5129{
5130 dev->group = new_group;
5131}
5132EXPORT_SYMBOL(dev_set_group);
5133
5134/**
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005135 * dev_set_mac_address - Change Media Access Control Address
5136 * @dev: device
5137 * @sa: new address
5138 *
5139 * Change the hardware (MAC) address of the device
5140 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005141int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5142{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005143 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005144 int err;
5145
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005146 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005147 return -EOPNOTSUPP;
5148 if (sa->sa_family != dev->type)
5149 return -EINVAL;
5150 if (!netif_device_present(dev))
5151 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005152 err = ops->ndo_set_mac_address(dev, sa);
Jiri Pirkof6521512013-01-01 03:30:14 +00005153 if (err)
5154 return err;
Jiri Pirkofbdeca22013-01-01 03:30:16 +00005155 dev->addr_assign_type = NET_ADDR_SET;
Jiri Pirkof6521512013-01-01 03:30:14 +00005156 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04005157 add_device_randomness(dev->dev_addr, dev->addr_len);
Jiri Pirkof6521512013-01-01 03:30:14 +00005158 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005159}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005160EXPORT_SYMBOL(dev_set_mac_address);
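/*
 * Illustrative sketch (editor's addition): in-kernel callers fill a struct
 * sockaddr whose sa_family matches dev->type (ARPHRD_ETHER for Ethernet)
 * and pass it in under RTNL; new_mac is an assumed buffer of dev->addr_len
 * bytes.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */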
Linus Torvalds1da177e2005-04-16 15:20:36 -07005161
Jiri Pirko4bf84c32012-12-27 23:49:37 +00005162/**
5163 * dev_change_carrier - Change device carrier
5164 * @dev: device
5165 *	@new_carrier: new value
5166 *
5167 * Change device carrier
5168 */
5169int dev_change_carrier(struct net_device *dev, bool new_carrier)
5170{
5171 const struct net_device_ops *ops = dev->netdev_ops;
5172
5173 if (!ops->ndo_change_carrier)
5174 return -EOPNOTSUPP;
5175 if (!netif_device_present(dev))
5176 return -ENODEV;
5177 return ops->ndo_change_carrier(dev, new_carrier);
5178}
5179EXPORT_SYMBOL(dev_change_carrier);
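/*
 * Illustrative sketch, not part of this file: a tunnel-like driver that
 * learns link state from its control channel could propagate it with
 * dev_change_carrier() under RTNL.  "peer_is_up" is an assumption.
 */
static void example_sync_carrier(struct net_device *dev, bool peer_is_up)
{
	int err;

	rtnl_lock();
	err = dev_change_carrier(dev, peer_is_up);
	if (err)
		netdev_dbg(dev, "carrier update failed: %d\n", err);
	rtnl_unlock();
}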
5180
Linus Torvalds1da177e2005-04-16 15:20:36 -07005181/**
5182 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07005183 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07005184 *
5185 * Returns a suitable unique value for a new device interface
5186 * number. The caller must hold the rtnl semaphore or the
5187 * dev_base_lock to be sure it remains unique.
5188 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07005189static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005190{
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00005191 int ifindex = net->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005192 for (;;) {
5193 if (++ifindex <= 0)
5194 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005195 if (!__dev_get_by_index(net, ifindex))
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00005196 return net->ifindex = ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005197 }
5198}
5199
Linus Torvalds1da177e2005-04-16 15:20:36 -07005200/* Delayed registration/unregisteration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08005201static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005202
Stephen Hemminger6f05f622007-03-08 20:46:03 -08005203static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005204{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005205 list_add_tail(&dev->todo_list, &net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005206}
5207
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005208static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005209{
Krishna Kumare93737b2009-12-08 22:26:02 +00005210 struct net_device *dev, *tmp;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005211
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005212 BUG_ON(dev_boot_phase);
5213 ASSERT_RTNL();
5214
Krishna Kumare93737b2009-12-08 22:26:02 +00005215 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005216		/* Some devices call unregister without ever having
Krishna Kumare93737b2009-12-08 22:26:02 +00005217		 * registered, to unwind a failed initialization. Remove
 5218		 * those devices and proceed with the remaining.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005219 */
5220 if (dev->reg_state == NETREG_UNINITIALIZED) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005221 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5222 dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005223
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005224 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00005225 list_del(&dev->unreg_list);
5226 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005227 }
Eric Dumazet449f4542011-05-19 12:24:16 +00005228 dev->dismantle = true;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005229 BUG_ON(dev->reg_state != NETREG_REGISTERED);
Octavian Purdila44345722010-12-13 12:44:07 +00005230 }
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005231
Octavian Purdila44345722010-12-13 12:44:07 +00005232 /* If device is running, close it first. */
5233 dev_close_many(head);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005234
Octavian Purdila44345722010-12-13 12:44:07 +00005235 list_for_each_entry(dev, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005236 /* And unlink it from device chain. */
5237 unlist_netdevice(dev);
5238
5239 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005240 }
5241
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005242 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005243
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005244 list_for_each_entry(dev, head, unreg_list) {
5245 /* Shutdown queueing discipline. */
5246 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005247
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005248
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005249		/* Notify protocols that we are about to destroy
 5250		   this device. They should clean up all their state.
5251 */
5252 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5253
Patrick McHardya2835762010-02-26 06:34:51 +00005254 if (!dev->rtnl_link_ops ||
5255 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5256 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5257
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005258 /*
5259 * Flush the unicast and multicast chains
5260 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005261 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005262 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005263
5264 if (dev->netdev_ops->ndo_uninit)
5265 dev->netdev_ops->ndo_uninit(dev);
5266
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005267		/* Notifier chain MUST detach us from all upper devices. */
5268 WARN_ON(netdev_has_any_upper_dev(dev));
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005269
5270 /* Remove entries from kobject tree */
5271 netdev_unregister_kobject(dev);
Alexander Duyck024e9672013-01-10 08:57:46 +00005272#ifdef CONFIG_XPS
5273 /* Remove XPS queueing entries */
5274 netif_reset_xps_queues_gt(dev, 0);
5275#endif
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005276 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005277
Eric W. Biederman850a5452011-10-13 22:25:23 +00005278 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005279
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005280 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005281 dev_put(dev);
5282}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005283
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005284static void rollback_registered(struct net_device *dev)
5285{
5286 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005287
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005288 list_add(&dev->unreg_list, &single);
5289 rollback_registered_many(&single);
Eric Dumazetceaaec92011-02-17 22:59:19 +00005290 list_del(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005291}
5292
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005293static netdev_features_t netdev_fix_features(struct net_device *dev,
5294 netdev_features_t features)
Herbert Xub63365a2008-10-23 01:11:29 -07005295{
Michał Mirosław57422dc2011-01-22 12:14:12 +00005296 /* Fix illegal checksum combinations */
5297 if ((features & NETIF_F_HW_CSUM) &&
5298 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005299 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
Michał Mirosław57422dc2011-01-22 12:14:12 +00005300 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5301 }
5302
Herbert Xub63365a2008-10-23 01:11:29 -07005303 /* Fix illegal SG+CSUM combinations. */
5304 if ((features & NETIF_F_SG) &&
5305 !(features & NETIF_F_ALL_CSUM)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005306 netdev_dbg(dev,
5307 "Dropping NETIF_F_SG since no checksum feature.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07005308 features &= ~NETIF_F_SG;
5309 }
5310
5311 /* TSO requires that SG is present as well. */
Ben Hutchingsea2d3682011-04-12 14:38:37 +00005312 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005313 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
Ben Hutchingsea2d3682011-04-12 14:38:37 +00005314 features &= ~NETIF_F_ALL_TSO;
Herbert Xub63365a2008-10-23 01:11:29 -07005315 }
5316
Ben Hutchings31d8b9e2011-04-12 14:47:15 +00005317 /* TSO ECN requires that TSO is present as well. */
5318 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5319 features &= ~NETIF_F_TSO_ECN;
5320
Michał Mirosław212b5732011-02-15 16:59:16 +00005321 /* Software GSO depends on SG. */
5322 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005323 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
Michał Mirosław212b5732011-02-15 16:59:16 +00005324 features &= ~NETIF_F_GSO;
5325 }
5326
Michał Mirosławacd11302011-01-24 15:45:15 -08005327 /* UFO needs SG and checksumming */
Herbert Xub63365a2008-10-23 01:11:29 -07005328 if (features & NETIF_F_UFO) {
Michał Mirosław79032642010-11-30 06:38:00 +00005329 /* maybe split UFO into V4 and V6? */
5330 if (!((features & NETIF_F_GEN_CSUM) ||
5331 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5332 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005333 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08005334 "Dropping NETIF_F_UFO since no checksum offload features.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07005335 features &= ~NETIF_F_UFO;
5336 }
5337
5338 if (!(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005339 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08005340 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07005341 features &= ~NETIF_F_UFO;
5342 }
5343 }
5344
5345 return features;
5346}
Herbert Xub63365a2008-10-23 01:11:29 -07005347
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005348int __netdev_update_features(struct net_device *dev)
Michał Mirosław5455c692011-02-15 16:59:17 +00005349{
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005350 netdev_features_t features;
Michał Mirosław5455c692011-02-15 16:59:17 +00005351 int err = 0;
5352
Michał Mirosław87267482011-04-12 09:56:38 +00005353 ASSERT_RTNL();
5354
Michał Mirosław5455c692011-02-15 16:59:17 +00005355 features = netdev_get_wanted_features(dev);
5356
5357 if (dev->netdev_ops->ndo_fix_features)
5358 features = dev->netdev_ops->ndo_fix_features(dev, features);
5359
5360 /* driver might be less strict about feature dependencies */
5361 features = netdev_fix_features(dev, features);
5362
5363 if (dev->features == features)
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005364 return 0;
Michał Mirosław5455c692011-02-15 16:59:17 +00005365
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005366 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5367 &dev->features, &features);
Michał Mirosław5455c692011-02-15 16:59:17 +00005368
5369 if (dev->netdev_ops->ndo_set_features)
5370 err = dev->netdev_ops->ndo_set_features(dev, features);
5371
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005372 if (unlikely(err < 0)) {
Michał Mirosław5455c692011-02-15 16:59:17 +00005373 netdev_err(dev,
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005374 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5375 err, &features, &dev->features);
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005376 return -1;
5377 }
5378
5379 if (!err)
5380 dev->features = features;
5381
5382 return 1;
5383}
5384
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005385/**
5386 * netdev_update_features - recalculate device features
5387 * @dev: the device to check
5388 *
5389 * Recalculate dev->features set and send notifications if it
5390 * has changed. Should be called after driver or hardware dependent
5391 * conditions might have changed that influence the features.
5392 */
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005393void netdev_update_features(struct net_device *dev)
5394{
5395 if (__netdev_update_features(dev))
5396 netdev_features_change(dev);
Michał Mirosław5455c692011-02-15 16:59:17 +00005397}
5398EXPORT_SYMBOL(netdev_update_features);
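/*
 * Illustrative sketch, not part of this file: a driver that discovers at
 * runtime that checksum offload is no longer usable can clear the bits
 * from hw_features and let the core recompute dev->features.  Must be
 * called under RTNL; the trigger condition is an assumption.
 */
static void example_disable_csum_offload(struct net_device *dev)
{
	rtnl_lock();
	dev->hw_features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
	netdev_update_features(dev);	/* re-runs ndo_fix_features/ndo_set_features */
	rtnl_unlock();
}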
5399
Linus Torvalds1da177e2005-04-16 15:20:36 -07005400/**
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005401 * netdev_change_features - recalculate device features
5402 * @dev: the device to check
5403 *
5404 * Recalculate dev->features set and send notifications even
5405 * if they have not changed. Should be called instead of
5406 * netdev_update_features() if also dev->vlan_features might
5407 * have changed to allow the changes to be propagated to stacked
5408 * VLAN devices.
5409 */
5410void netdev_change_features(struct net_device *dev)
5411{
5412 __netdev_update_features(dev);
5413 netdev_features_change(dev);
5414}
5415EXPORT_SYMBOL(netdev_change_features);
5416
5417/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005418 * netif_stacked_transfer_operstate - transfer operstate
5419 * @rootdev: the root or lower level device to transfer state from
5420 * @dev: the device to transfer operstate to
5421 *
5422 * Transfer operational state from root to device. This is normally
5423 * called when a stacking relationship exists between the root
5424 * device and the device (a leaf device).
5425 */
5426void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5427 struct net_device *dev)
5428{
5429 if (rootdev->operstate == IF_OPER_DORMANT)
5430 netif_dormant_on(dev);
5431 else
5432 netif_dormant_off(dev);
5433
5434 if (netif_carrier_ok(rootdev)) {
5435 if (!netif_carrier_ok(dev))
5436 netif_carrier_on(dev);
5437 } else {
5438 if (netif_carrier_ok(dev))
5439 netif_carrier_off(dev);
5440 }
5441}
5442EXPORT_SYMBOL(netif_stacked_transfer_operstate);
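/*
 * Illustrative sketch, not part of this file: an upper device (VLAN-like)
 * mirroring the operstate of its lower device, typically from a
 * NETDEV_CHANGE notifier.  The device names are assumptions.
 */
static void example_mirror_operstate(struct net_device *lower_dev,
				     struct net_device *upper_dev)
{
	/* copies dormant state and carrier from lower_dev to upper_dev */
	netif_stacked_transfer_operstate(lower_dev, upper_dev);
}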
5443
Tom Herbertbf264142010-11-26 08:36:09 +00005444#ifdef CONFIG_RPS
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005445static int netif_alloc_rx_queues(struct net_device *dev)
5446{
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005447 unsigned int i, count = dev->num_rx_queues;
Tom Herbertbd25fa72010-10-18 18:00:16 +00005448 struct netdev_rx_queue *rx;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005449
Tom Herbertbd25fa72010-10-18 18:00:16 +00005450 BUG_ON(count < 1);
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005451
Tom Herbertbd25fa72010-10-18 18:00:16 +00005452 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005453 if (!rx)
Tom Herbertbd25fa72010-10-18 18:00:16 +00005454 return -ENOMEM;
Joe Perches62b59422013-02-04 16:48:16 +00005455
Tom Herbertbd25fa72010-10-18 18:00:16 +00005456 dev->_rx = rx;
5457
Tom Herbertbd25fa72010-10-18 18:00:16 +00005458 for (i = 0; i < count; i++)
Tom Herbertfe822242010-11-09 10:47:38 +00005459 rx[i].dev = dev;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005460 return 0;
5461}
Tom Herbertbf264142010-11-26 08:36:09 +00005462#endif
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005463
Changli Gaoaa942102010-12-04 02:31:41 +00005464static void netdev_init_one_queue(struct net_device *dev,
5465 struct netdev_queue *queue, void *_unused)
5466{
5467 /* Initialize queue lock */
5468 spin_lock_init(&queue->_xmit_lock);
5469 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5470 queue->xmit_lock_owner = -1;
Changli Gaob236da62010-12-14 03:09:15 +00005471 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
Changli Gaoaa942102010-12-04 02:31:41 +00005472 queue->dev = dev;
Tom Herbert114cf582011-11-28 16:33:09 +00005473#ifdef CONFIG_BQL
5474 dql_init(&queue->dql, HZ);
5475#endif
Changli Gaoaa942102010-12-04 02:31:41 +00005476}
5477
Tom Herberte6484932010-10-18 18:04:39 +00005478static int netif_alloc_netdev_queues(struct net_device *dev)
5479{
5480 unsigned int count = dev->num_tx_queues;
5481 struct netdev_queue *tx;
5482
5483 BUG_ON(count < 1);
5484
5485 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005486 if (!tx)
Tom Herberte6484932010-10-18 18:04:39 +00005487 return -ENOMEM;
Joe Perches62b59422013-02-04 16:48:16 +00005488
Tom Herberte6484932010-10-18 18:04:39 +00005489 dev->_tx = tx;
Tom Herbert1d24eb42010-11-21 13:17:27 +00005490
Tom Herberte6484932010-10-18 18:04:39 +00005491 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5492 spin_lock_init(&dev->tx_global_lock);
Changli Gaoaa942102010-12-04 02:31:41 +00005493
5494 return 0;
Tom Herberte6484932010-10-18 18:04:39 +00005495}
5496
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005497/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005498 * register_netdevice - register a network device
5499 * @dev: device to register
5500 *
5501 * Take a completed network device structure and add it to the kernel
5502 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5503 * chain. 0 is returned on success. A negative errno code is returned
5504 * on a failure to set up the device, or if the name is a duplicate.
5505 *
5506 * Callers must hold the rtnl semaphore. You may want
5507 * register_netdev() instead of this.
5508 *
5509 * BUGS:
5510 * The locking appears insufficient to guarantee two parallel registers
5511 * will not get the same name.
5512 */
5513
5514int register_netdevice(struct net_device *dev)
5515{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005516 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005517 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005518
5519 BUG_ON(dev_boot_phase);
5520 ASSERT_RTNL();
5521
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005522 might_sleep();
5523
Linus Torvalds1da177e2005-04-16 15:20:36 -07005524 /* When net_device's are persistent, this will be fatal. */
5525 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005526 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005527
David S. Millerf1f28aa2008-07-15 00:08:33 -07005528 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07005529 netdev_set_addr_lockdep_class(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005530
Linus Torvalds1da177e2005-04-16 15:20:36 -07005531 dev->iflink = -1;
5532
Gao feng828de4f2012-09-13 20:58:27 +00005533 ret = dev_get_valid_name(net, dev, dev->name);
Peter Pan(潘卫平)0696c3a2011-05-12 15:46:56 +00005534 if (ret < 0)
5535 goto out;
5536
Linus Torvalds1da177e2005-04-16 15:20:36 -07005537 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005538 if (dev->netdev_ops->ndo_init) {
5539 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005540 if (ret) {
5541 if (ret > 0)
5542 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08005543 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005544 }
5545 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005546
Michał Mirosławd2ed2732013-01-29 15:14:16 +00005547 if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
5548 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5549 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5550 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5551 ret = -EINVAL;
5552 goto err_uninit;
5553 }
5554
Pavel Emelyanov9c7dafb2012-08-08 21:52:46 +00005555 ret = -EBUSY;
5556 if (!dev->ifindex)
5557 dev->ifindex = dev_new_index(net);
5558 else if (__dev_get_by_index(net, dev->ifindex))
5559 goto err_uninit;
5560
Linus Torvalds1da177e2005-04-16 15:20:36 -07005561 if (dev->iflink == -1)
5562 dev->iflink = dev->ifindex;
5563
Michał Mirosław5455c692011-02-15 16:59:17 +00005564 /* Transfer changeable features to wanted_features and enable
5565 * software offloads (GSO and GRO).
5566 */
5567 dev->hw_features |= NETIF_F_SOFT_FEATURES;
Michał Mirosław14d12322011-02-22 16:52:28 +00005568 dev->features |= NETIF_F_SOFT_FEATURES;
5569 dev->wanted_features = dev->features & dev->hw_features;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005570
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005571 /* Turn on no cache copy if HW is doing checksum */
Michał Mirosław34324dc2011-11-15 15:29:55 +00005572 if (!(dev->flags & IFF_LOOPBACK)) {
5573 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5574 if (dev->features & NETIF_F_ALL_CSUM) {
5575 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5576 dev->features |= NETIF_F_NOCACHE_COPY;
5577 }
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005578 }
5579
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005580 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
Brandon Philips16c3ea72010-09-15 09:24:24 +00005581 */
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005582 dev->vlan_features |= NETIF_F_HIGHDMA;
Brandon Philips16c3ea72010-09-15 09:24:24 +00005583
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005584 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5585 ret = notifier_to_errno(ret);
5586 if (ret)
5587 goto err_uninit;
5588
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005589 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005590 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005591 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005592 dev->reg_state = NETREG_REGISTERED;
5593
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005594 __netdev_update_features(dev);
Michał Mirosław8e9b59b2011-02-22 16:52:28 +00005595
Linus Torvalds1da177e2005-04-16 15:20:36 -07005596 /*
5597 * Default initial state at registration is that the
5598 * device is present.
5599 */
5600
5601 set_bit(__LINK_STATE_PRESENT, &dev->state);
5602
Ben Hutchings8f4cccb2012-08-20 22:16:51 +01005603 linkwatch_init_dev(dev);
5604
Linus Torvalds1da177e2005-04-16 15:20:36 -07005605 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005606 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005607 list_netdevice(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04005608 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005609
Jiri Pirko948b3372013-01-08 01:38:25 +00005610	/* If the device has a permanent device address, the driver should
 5611	 * set dev_addr and addr_assign_type should also be set to
 5612	 * NET_ADDR_PERM (the default value).
5613 */
5614 if (dev->addr_assign_type == NET_ADDR_PERM)
5615 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5616
Linus Torvalds1da177e2005-04-16 15:20:36 -07005617	/* Notify protocols that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005618 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005619 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005620 if (ret) {
5621 rollback_registered(dev);
5622 dev->reg_state = NETREG_UNREGISTERED;
5623 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005624 /*
5625 * Prevent userspace races by waiting until the network
5626 * device is fully set up before sending notifications.
5627 */
Patrick McHardya2835762010-02-26 06:34:51 +00005628 if (!dev->rtnl_link_ops ||
5629 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5630 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005631
5632out:
5633 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005634
5635err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005636 if (dev->netdev_ops->ndo_uninit)
5637 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005638 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005639}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005640EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005641
5642/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005643 * init_dummy_netdev - init a dummy network device for NAPI
5644 * @dev: device to init
5645 *
5646 * This takes a network device structure and initializes the minimum
5647 * number of fields so it can be used to schedule NAPI polls without
5648 * registering a full-blown interface. This is to be used by drivers
5649 * that need to tie several hardware interfaces to a single NAPI
5650 * poll scheduler due to HW limitations.
5651 */
5652int init_dummy_netdev(struct net_device *dev)
5653{
5654 /* Clear everything. Note we don't initialize spinlocks
5655 * as they aren't supposed to be taken by any of the
5656 * NAPI code and this dummy netdev is supposed to be
5657 * only ever used for NAPI polls
5658 */
5659 memset(dev, 0, sizeof(struct net_device));
5660
5661 /* make sure we BUG if trying to hit standard
5662 * register/unregister code path
5663 */
5664 dev->reg_state = NETREG_DUMMY;
5665
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005666 /* NAPI wants this */
5667 INIT_LIST_HEAD(&dev->napi_list);
5668
5669 /* a dummy interface is started by default */
5670 set_bit(__LINK_STATE_PRESENT, &dev->state);
5671 set_bit(__LINK_STATE_START, &dev->state);
5672
Eric Dumazet29b44332010-10-11 10:22:12 +00005673	/* Note: We don't allocate pcpu_refcnt for dummy devices,
 5674	 * because users of this 'device' don't need to change
5675 * its refcount.
5676 */
5677
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005678 return 0;
5679}
5680EXPORT_SYMBOL_GPL(init_dummy_netdev);
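/*
 * Illustrative sketch, not part of this file: a driver whose single DMA
 * engine serves several interfaces can hang its NAPI context off a dummy
 * netdev.  "example_hw", the poll routine and the weight of 64 are
 * assumptions.
 */
struct example_hw {
	struct net_device napi_dev;	/* never registered */
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	/* a real driver would harvest hardware completions here */
	napi_complete(napi);
	return 0;
}

static void example_setup_shared_napi(struct example_hw *hw)
{
	init_dummy_netdev(&hw->napi_dev);
	netif_napi_add(&hw->napi_dev, &hw->napi, example_poll, 64);
	napi_enable(&hw->napi);
}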
5681
5682
5683/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005684 * register_netdev - register a network device
5685 * @dev: device to register
5686 *
5687 * Take a completed network device structure and add it to the kernel
5688 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5689 * chain. 0 is returned on success. A negative errno code is returned
5690 * on a failure to set up the device, or if the name is a duplicate.
5691 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005692 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005693 * and expands the device name if you passed a format string to
5694 * alloc_netdev.
5695 */
5696int register_netdev(struct net_device *dev)
5697{
5698 int err;
5699
5700 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005701 err = register_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005702 rtnl_unlock();
5703 return err;
5704}
5705EXPORT_SYMBOL(register_netdev);
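/*
 * Illustrative sketch, not part of this file: the usual allocate/register/
 * unregister/free life cycle for an Ethernet-like device.  The private
 * struct and random MAC are assumptions; error handling is abbreviated.
 */
struct example_priv {
	void __iomem *regs;
};

static struct net_device *example_probe(void)
{
	struct net_device *dev;

	dev = alloc_etherdev(sizeof(struct example_priv));
	if (!dev)
		return NULL;

	eth_hw_addr_random(dev);	/* addr_assign_type becomes NET_ADDR_RANDOM */

	if (register_netdev(dev)) {	/* takes rtnl_lock() internally */
		free_netdev(dev);
		return NULL;
	}
	return dev;
}

static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* netdev_run_todo() waits for references */
	free_netdev(dev);
}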
5706
Eric Dumazet29b44332010-10-11 10:22:12 +00005707int netdev_refcnt_read(const struct net_device *dev)
5708{
5709 int i, refcnt = 0;
5710
5711 for_each_possible_cpu(i)
5712 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5713 return refcnt;
5714}
5715EXPORT_SYMBOL(netdev_refcnt_read);
5716
Ben Hutchings2c530402012-07-10 10:55:09 +00005717/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005718 * netdev_wait_allrefs - wait until all references are gone.
Randy Dunlap3de7a372012-08-18 14:36:44 +00005719 * @dev: target net_device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005720 *
5721 * This is called when unregistering network devices.
5722 *
5723 * Any protocol or device that holds a reference should register
5724 * for netdevice notification, and cleanup and put back the
5725 * reference if they receive an UNREGISTER event.
5726 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005727 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005728 */
5729static void netdev_wait_allrefs(struct net_device *dev)
5730{
5731 unsigned long rebroadcast_time, warning_time;
Eric Dumazet29b44332010-10-11 10:22:12 +00005732 int refcnt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005733
Eric Dumazete014deb2009-11-17 05:59:21 +00005734 linkwatch_forget_dev(dev);
5735
Linus Torvalds1da177e2005-04-16 15:20:36 -07005736 rebroadcast_time = warning_time = jiffies;
Eric Dumazet29b44332010-10-11 10:22:12 +00005737 refcnt = netdev_refcnt_read(dev);
5738
5739 while (refcnt != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005740 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005741 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005742
5743 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005744 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005745
Eric Dumazet748e2d92012-08-22 21:50:59 +00005746 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005747 rcu_barrier();
Eric Dumazet748e2d92012-08-22 21:50:59 +00005748 rtnl_lock();
5749
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005750 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005751 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5752 &dev->state)) {
5753 /* We must not have linkwatch events
5754 * pending on unregister. If this
5755 * happens, we simply run the queue
5756 * unscheduled, resulting in a noop
5757 * for this device.
5758 */
5759 linkwatch_run_queue();
5760 }
5761
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005762 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005763
5764 rebroadcast_time = jiffies;
5765 }
5766
5767 msleep(250);
5768
Eric Dumazet29b44332010-10-11 10:22:12 +00005769 refcnt = netdev_refcnt_read(dev);
5770
Linus Torvalds1da177e2005-04-16 15:20:36 -07005771 if (time_after(jiffies, warning_time + 10 * HZ)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005772 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5773 dev->name, refcnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005774 warning_time = jiffies;
5775 }
5776 }
5777}
5778
5779/* The sequence is:
5780 *
5781 * rtnl_lock();
5782 * ...
5783 * register_netdevice(x1);
5784 * register_netdevice(x2);
5785 * ...
5786 * unregister_netdevice(y1);
5787 * unregister_netdevice(y2);
5788 * ...
5789 * rtnl_unlock();
5790 * free_netdev(y1);
5791 * free_netdev(y2);
5792 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07005793 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07005794 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005795 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07005796 * without deadlocking with linkwatch via keventd.
5797 * 2) Since we run with the RTNL semaphore not held, we can sleep
5798 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07005799 *
5800 * We must not return until all unregister events added during
5801 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005802 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005803void netdev_run_todo(void)
5804{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005805 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005806
Linus Torvalds1da177e2005-04-16 15:20:36 -07005807 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005808 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07005809
5810 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005811
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005812
5813 /* Wait for rcu callbacks to finish before next phase */
Eric W. Biederman850a5452011-10-13 22:25:23 +00005814 if (!list_empty(&list))
5815 rcu_barrier();
5816
Linus Torvalds1da177e2005-04-16 15:20:36 -07005817 while (!list_empty(&list)) {
5818 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00005819 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005820 list_del(&dev->todo_list);
5821
Eric Dumazet748e2d92012-08-22 21:50:59 +00005822 rtnl_lock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005823 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric Dumazet748e2d92012-08-22 21:50:59 +00005824 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005825
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005826 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005827 pr_err("network todo '%s' but state %d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07005828 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005829 dump_stack();
5830 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005831 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005832
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005833 dev->reg_state = NETREG_UNREGISTERED;
5834
Changli Gao152102c2010-03-30 20:16:22 +00005835 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07005836
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005837 netdev_wait_allrefs(dev);
5838
5839 /* paranoia */
Eric Dumazet29b44332010-10-11 10:22:12 +00005840 BUG_ON(netdev_refcnt_read(dev));
Eric Dumazet33d480c2011-08-11 19:30:52 +00005841 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5842 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07005843 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005844
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005845 if (dev->destructor)
5846 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07005847
5848 /* Free network device */
5849 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005850 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005851}
5852
Ben Hutchings3cfde792010-07-09 09:11:52 +00005853/* Convert net_device_stats to rtnl_link_stats64. They have the same
5854 * fields in the same order, with only the type differing.
5855 */
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005856void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5857 const struct net_device_stats *netdev_stats)
Ben Hutchings3cfde792010-07-09 09:11:52 +00005858{
5859#if BITS_PER_LONG == 64
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005860 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5861 memcpy(stats64, netdev_stats, sizeof(*stats64));
Ben Hutchings3cfde792010-07-09 09:11:52 +00005862#else
5863 size_t i, n = sizeof(*stats64) / sizeof(u64);
5864 const unsigned long *src = (const unsigned long *)netdev_stats;
5865 u64 *dst = (u64 *)stats64;
5866
5867 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5868 sizeof(*stats64) / sizeof(u64));
5869 for (i = 0; i < n; i++)
5870 dst[i] = src[i];
5871#endif
5872}
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005873EXPORT_SYMBOL(netdev_stats_to_stats64);
Ben Hutchings3cfde792010-07-09 09:11:52 +00005874
Eric Dumazetd83345a2009-11-16 03:36:51 +00005875/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005876 * dev_get_stats - get network device statistics
5877 * @dev: device to get statistics from
Eric Dumazet28172732010-07-07 14:58:56 -07005878 * @storage: place to store stats
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005879 *
Ben Hutchingsd7753512010-07-09 09:12:41 +00005880 * Get network statistics from device. Return @storage.
5881 * The device driver may provide its own method by setting
5882 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5883 * otherwise the internal statistics structure is used.
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005884 */
Ben Hutchingsd7753512010-07-09 09:12:41 +00005885struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5886 struct rtnl_link_stats64 *storage)
Eric Dumazet7004bf22009-05-18 00:34:33 +00005887{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005888 const struct net_device_ops *ops = dev->netdev_ops;
5889
Eric Dumazet28172732010-07-07 14:58:56 -07005890 if (ops->ndo_get_stats64) {
5891 memset(storage, 0, sizeof(*storage));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005892 ops->ndo_get_stats64(dev, storage);
5893 } else if (ops->ndo_get_stats) {
Ben Hutchings3cfde792010-07-09 09:11:52 +00005894 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005895 } else {
5896 netdev_stats_to_stats64(storage, &dev->stats);
Eric Dumazet28172732010-07-07 14:58:56 -07005897 }
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005898 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
Eric Dumazet28172732010-07-07 14:58:56 -07005899 return storage;
Rusty Russellc45d2862007-03-28 14:29:08 -07005900}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005901EXPORT_SYMBOL(dev_get_stats);
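/*
 * Illustrative sketch, not part of this file: reading 64-bit statistics
 * for a device, e.g. from debug code.  @storage lives on the caller's
 * stack and dev_get_stats() fills and returns it.
 */
static void example_dump_rx_stats(struct net_device *dev)
{
	struct rtnl_link_stats64 storage;
	const struct rtnl_link_stats64 *stats;

	stats = dev_get_stats(dev, &storage);
	netdev_info(dev, "rx_bytes=%llu rx_dropped=%llu\n",
		    (unsigned long long)stats->rx_bytes,
		    (unsigned long long)stats->rx_dropped);
}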
Rusty Russellc45d2862007-03-28 14:29:08 -07005902
Eric Dumazet24824a02010-10-02 06:11:55 +00005903struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
David S. Millerdc2b4842008-07-08 17:18:23 -07005904{
Eric Dumazet24824a02010-10-02 06:11:55 +00005905 struct netdev_queue *queue = dev_ingress_queue(dev);
David S. Millerdc2b4842008-07-08 17:18:23 -07005906
Eric Dumazet24824a02010-10-02 06:11:55 +00005907#ifdef CONFIG_NET_CLS_ACT
5908 if (queue)
5909 return queue;
5910 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5911 if (!queue)
5912 return NULL;
5913 netdev_init_one_queue(dev, queue, NULL);
Eric Dumazet24824a02010-10-02 06:11:55 +00005914 queue->qdisc = &noop_qdisc;
5915 queue->qdisc_sleeping = &noop_qdisc;
5916 rcu_assign_pointer(dev->ingress_queue, queue);
5917#endif
5918 return queue;
David S. Millerbb949fb2008-07-08 16:55:56 -07005919}
5920
Eric Dumazet2c60db02012-09-16 09:17:26 +00005921static const struct ethtool_ops default_ethtool_ops;
5922
Stanislaw Gruszkad07d7502013-01-10 23:19:10 +00005923void netdev_set_default_ethtool_ops(struct net_device *dev,
5924 const struct ethtool_ops *ops)
5925{
5926 if (dev->ethtool_ops == &default_ethtool_ops)
5927 dev->ethtool_ops = ops;
5928}
5929EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
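/*
 * Illustrative sketch, not part of this file: a shared MAC/bus layer
 * providing fallback ethtool ops that individual minidrivers may override
 * before register_netdev().  The ops table contents are assumptions.
 */
static const struct ethtool_ops example_fallback_ethtool_ops = {
	.get_link	= ethtool_op_get_link,
};

static void example_attach(struct net_device *dev)
{
	/* only takes effect if the minidriver left dev->ethtool_ops unset */
	netdev_set_default_ethtool_ops(dev, &example_fallback_ethtool_ops);
}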
5930
Linus Torvalds1da177e2005-04-16 15:20:36 -07005931/**
Tom Herbert36909ea2011-01-09 19:36:31 +00005932 * alloc_netdev_mqs - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005933 * @sizeof_priv: size of private data to allocate space for
5934 * @name: device name format string
5935 * @setup: callback to initialize device
Tom Herbert36909ea2011-01-09 19:36:31 +00005936 * @txqs: the number of TX subqueues to allocate
5937 * @rxqs: the number of RX subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005938 *
5939 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005940 * and performs basic initialization. Also allocates subqueue structs
Tom Herbert36909ea2011-01-09 19:36:31 +00005941 * for each queue on the device.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005942 */
Tom Herbert36909ea2011-01-09 19:36:31 +00005943struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5944 void (*setup)(struct net_device *),
5945 unsigned int txqs, unsigned int rxqs)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005946{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005947 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07005948 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005949 struct net_device *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005950
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005951 BUG_ON(strlen(name) >= sizeof(dev->name));
5952
Tom Herbert36909ea2011-01-09 19:36:31 +00005953 if (txqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005954 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
Tom Herbert55513fb2010-10-18 17:55:58 +00005955 return NULL;
5956 }
5957
Tom Herbert36909ea2011-01-09 19:36:31 +00005958#ifdef CONFIG_RPS
5959 if (rxqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005960 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
Tom Herbert36909ea2011-01-09 19:36:31 +00005961 return NULL;
5962 }
5963#endif
5964
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005965 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005966 if (sizeof_priv) {
5967 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005968 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005969 alloc_size += sizeof_priv;
5970 }
5971 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005972 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005973
Paolo 'Blaisorblade' Giarrusso31380de2006-04-06 22:38:28 -07005974 p = kzalloc(alloc_size, GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005975 if (!p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005976 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005977
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005978 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005979 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005980
Eric Dumazet29b44332010-10-11 10:22:12 +00005981 dev->pcpu_refcnt = alloc_percpu(int);
5982 if (!dev->pcpu_refcnt)
Tom Herberte6484932010-10-18 18:04:39 +00005983 goto free_p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005984
Linus Torvalds1da177e2005-04-16 15:20:36 -07005985 if (dev_addr_init(dev))
Eric Dumazet29b44332010-10-11 10:22:12 +00005986 goto free_pcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005987
Jiri Pirko22bedad32010-04-01 21:22:57 +00005988 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005989 dev_uc_init(dev);
Jiri Pirkoccffad252009-05-22 23:22:17 +00005990
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005991 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005992
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07005993 dev->gso_max_size = GSO_MAX_SIZE;
Ben Hutchings30b678d2012-07-30 15:57:00 +00005994 dev->gso_max_segs = GSO_MAX_SEGS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005995
Herbert Xud565b0a2008-12-15 23:38:52 -08005996 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005997 INIT_LIST_HEAD(&dev->unreg_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00005998 INIT_LIST_HEAD(&dev->link_watch_list);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005999 INIT_LIST_HEAD(&dev->upper_dev_list);
Eric Dumazet93f154b2009-05-18 22:19:19 -07006000 dev->priv_flags = IFF_XMIT_DST_RELEASE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006001 setup(dev);
David S. Miller8d3bdbd2011-02-08 15:02:50 -08006002
6003 dev->num_tx_queues = txqs;
6004 dev->real_num_tx_queues = txqs;
6005 if (netif_alloc_netdev_queues(dev))
6006 goto free_all;
6007
6008#ifdef CONFIG_RPS
6009 dev->num_rx_queues = rxqs;
6010 dev->real_num_rx_queues = rxqs;
6011 if (netif_alloc_rx_queues(dev))
6012 goto free_all;
6013#endif
6014
Linus Torvalds1da177e2005-04-16 15:20:36 -07006015 strcpy(dev->name, name);
Vlad Dogarucbda10f2011-01-13 23:38:30 +00006016 dev->group = INIT_NETDEV_GROUP;
Eric Dumazet2c60db02012-09-16 09:17:26 +00006017 if (!dev->ethtool_ops)
6018 dev->ethtool_ops = &default_ethtool_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006019 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00006020
David S. Miller8d3bdbd2011-02-08 15:02:50 -08006021free_all:
6022 free_netdev(dev);
6023 return NULL;
6024
Eric Dumazet29b44332010-10-11 10:22:12 +00006025free_pcpu:
6026 free_percpu(dev->pcpu_refcnt);
Tom Herberted9af2e2010-11-09 10:47:30 +00006027 kfree(dev->_tx);
Tom Herbertfe822242010-11-09 10:47:38 +00006028#ifdef CONFIG_RPS
6029 kfree(dev->_rx);
6030#endif
6031
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00006032free_p:
6033 kfree(p);
6034 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006035}
Tom Herbert36909ea2011-01-09 19:36:31 +00006036EXPORT_SYMBOL(alloc_netdev_mqs);
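/*
 * Illustrative sketch, not part of this file: allocating an Ethernet-style
 * device with 8 TX and 8 RX queues and a "%d" name that register_netdev()
 * will expand.  The setup callback and queue counts are assumptions.
 */
static void example_mq_setup(struct net_device *dev)
{
	ether_setup(dev);
}

static struct net_device *example_alloc_mq(void)
{
	/* sizeof_priv of 0: no driver private area in this sketch */
	return alloc_netdev_mqs(0, "examp%d", example_mq_setup, 8, 8);
}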
Linus Torvalds1da177e2005-04-16 15:20:36 -07006037
6038/**
6039 * free_netdev - free network device
6040 * @dev: device
6041 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09006042 * This function does the last stage of destroying an allocated device
6043 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006044 * If this is the last reference then it will be freed.
6045 */
6046void free_netdev(struct net_device *dev)
6047{
Herbert Xud565b0a2008-12-15 23:38:52 -08006048 struct napi_struct *p, *n;
6049
Denis V. Lunevf3005d72008-04-16 02:02:18 -07006050 release_net(dev_net(dev));
6051
David S. Millere8a04642008-07-17 00:34:19 -07006052 kfree(dev->_tx);
Tom Herbertfe822242010-11-09 10:47:38 +00006053#ifdef CONFIG_RPS
6054 kfree(dev->_rx);
6055#endif
David S. Millere8a04642008-07-17 00:34:19 -07006056
Eric Dumazet33d480c2011-08-11 19:30:52 +00006057 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
Eric Dumazet24824a02010-10-02 06:11:55 +00006058
Jiri Pirkof001fde2009-05-05 02:48:28 +00006059 /* Flush device addresses */
6060 dev_addr_flush(dev);
6061
Herbert Xud565b0a2008-12-15 23:38:52 -08006062 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6063 netif_napi_del(p);
6064
Eric Dumazet29b44332010-10-11 10:22:12 +00006065 free_percpu(dev->pcpu_refcnt);
6066 dev->pcpu_refcnt = NULL;
6067
Stephen Hemminger3041a062006-05-26 13:25:24 -07006068 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07006069 if (dev->reg_state == NETREG_UNINITIALIZED) {
6070 kfree((char *)dev - dev->padded);
6071 return;
6072 }
6073
6074 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6075 dev->reg_state = NETREG_RELEASED;
6076
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07006077 /* will free via device release */
6078 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006079}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006080EXPORT_SYMBOL(free_netdev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09006081
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006082/**
6083 * synchronize_net - Synchronize with packet receive processing
6084 *
6085 * Wait for packets currently being received to be done.
6086 * Does not block later packets from starting.
6087 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09006088void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006089{
6090 might_sleep();
Eric Dumazetbe3fc412011-05-23 23:07:32 +00006091 if (rtnl_is_locked())
6092 synchronize_rcu_expedited();
6093 else
6094 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006095}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006096EXPORT_SYMBOL(synchronize_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006097
6098/**
Eric Dumazet44a08732009-10-27 07:03:04 +00006099 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07006100 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00006101 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08006102 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07006103 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08006104 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00006105 * If @head is not NULL, the device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006106 *
6107 * Callers must hold the rtnl semaphore. You may want
6108 * unregister_netdev() instead of this.
6109 */
6110
Eric Dumazet44a08732009-10-27 07:03:04 +00006111void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006112{
Herbert Xua6620712007-12-12 19:21:56 -08006113 ASSERT_RTNL();
6114
Eric Dumazet44a08732009-10-27 07:03:04 +00006115 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00006116 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00006117 } else {
6118 rollback_registered(dev);
6119 /* Finish processing unregister after unlock */
6120 net_set_todo(dev);
6121 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006122}
Eric Dumazet44a08732009-10-27 07:03:04 +00006123EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006124
6125/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006126 * unregister_netdevice_many - unregister many devices
6127 * @head: list of devices
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006128 */
6129void unregister_netdevice_many(struct list_head *head)
6130{
6131 struct net_device *dev;
6132
6133 if (!list_empty(head)) {
6134 rollback_registered_many(head);
6135 list_for_each_entry(dev, head, unreg_list)
6136 net_set_todo(dev);
6137 }
6138}
Eric Dumazet63c80992009-10-27 07:06:49 +00006139EXPORT_SYMBOL(unregister_netdevice_many);
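/*
 * Illustrative sketch, not part of this file: tearing down a group of
 * devices in one batch so that rollback_registered_many() amortizes the
 * synchronize_net()/rcu_barrier() cost.  The driver-private port list is
 * an assumption.
 */
struct example_port {
	struct net_device *dev;
	struct list_head list;		/* on the driver's own port list */
};

static void example_destroy_all_ports(struct list_head *port_list)
{
	struct example_port *port;
	LIST_HEAD(kill_list);

	rtnl_lock();
	list_for_each_entry(port, port_list, list)
		unregister_netdevice_queue(port->dev, &kill_list);
	unregister_netdevice_many(&kill_list);	/* single rollback pass */
	rtnl_unlock();
}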
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006140
6141/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07006142 * unregister_netdev - remove device from the kernel
6143 * @dev: device
6144 *
6145 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08006146 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006147 *
6148 * This is just a wrapper for unregister_netdevice that takes
6149 * the rtnl semaphore. In general you want to use this and not
6150 * unregister_netdevice.
6151 */
6152void unregister_netdev(struct net_device *dev)
6153{
6154 rtnl_lock();
6155 unregister_netdevice(dev);
6156 rtnl_unlock();
6157}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006158EXPORT_SYMBOL(unregister_netdev);
6159
Eric W. Biedermance286d32007-09-12 13:53:49 +02006160/**
6161 * dev_change_net_namespace - move device to a different network namespace
6162 * @dev: device
6163 * @net: network namespace
6164 * @pat: if not NULL, name pattern to try if the current device name
6165 * is already taken in the destination network namespace.
6166 *
6167 * This function shuts down a device interface and moves it
6168 * to a new network namespace. On success 0 is returned, on
6169 * a failure a negative errno code is returned.
6170 *
6171 * Callers must hold the rtnl semaphore.
6172 */
6173
6174int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6175{
Eric W. Biedermance286d32007-09-12 13:53:49 +02006176 int err;
6177
6178 ASSERT_RTNL();
6179
6180 /* Don't allow namespace local devices to be moved. */
6181 err = -EINVAL;
6182 if (dev->features & NETIF_F_NETNS_LOCAL)
6183 goto out;
6184
6185 /* Ensure the device has been registered */
Eric W. Biedermance286d32007-09-12 13:53:49 +02006186 if (dev->reg_state != NETREG_REGISTERED)
6187 goto out;
6188
6189 /* Get out if there is nothing to do */
6190 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09006191 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02006192 goto out;
6193
6194 /* Pick the destination device name, and ensure
6195 * we can use it in the destination network namespace.
6196 */
6197 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00006198 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006199 /* We get here if we can't use the current device name */
6200 if (!pat)
6201 goto out;
Gao feng828de4f2012-09-13 20:58:27 +00006202 if (dev_get_valid_name(net, dev, pat) < 0)
Eric W. Biedermance286d32007-09-12 13:53:49 +02006203 goto out;
6204 }
6205
6206 /*
6207 * And now a mini version of register_netdevice and unregister_netdevice.
6208 */
6209
6210 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07006211 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006212
6213 /* And unlink it from device chain */
6214 err = -ENODEV;
6215 unlist_netdevice(dev);
6216
6217 synchronize_net();
6218
6219 /* Shutdown queueing discipline. */
6220 dev_shutdown(dev);
6221
6222 /* Notify protocols that we are about to destroy
6223 this device. They should clean up all their state.
David Lamparter3b27e102010-09-17 03:22:19 +00006224
6225 Note that dev->reg_state stays at NETREG_REGISTERED.
6226 This is intentional, so that 8021q and macvlan know
6227 the device is just moving and can keep their slaves up.
Eric W. Biedermance286d32007-09-12 13:53:49 +02006228 */
6229 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Gao feng6549dd42012-08-23 15:36:55 +00006230 rcu_barrier();
6231 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric W. Biedermand2237d32011-10-21 06:24:20 +00006232 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006233
6234 /*
6235 * Flush the unicast and multicast chains
6236 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00006237 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00006238 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006239
Serge Hallyn4e66ae22012-12-03 16:17:12 +00006240 /* Send a netdev-removed uevent to the old namespace */
6241 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6242
Eric W. Biedermance286d32007-09-12 13:53:49 +02006243 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09006244 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006245
Eric W. Biedermance286d32007-09-12 13:53:49 +02006246 /* If there is an ifindex conflict assign a new one */
6247 if (__dev_get_by_index(net, dev->ifindex)) {
6248 int iflink = (dev->iflink == dev->ifindex);
6249 dev->ifindex = dev_new_index(net);
6250 if (iflink)
6251 dev->iflink = dev->ifindex;
6252 }
6253
Serge Hallyn4e66ae22012-12-03 16:17:12 +00006254 /* Send a netdev-add uevent to the new namespace */
6255 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6256
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006257 /* Fixup kobjects */
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07006258 err = device_rename(&dev->dev, dev->name);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006259 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006260
6261 /* Add the device back in the hashes */
6262 list_netdevice(dev);
6263
6264 /* Notify protocols, that a new device appeared. */
6265 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6266
Eric W. Biedermand90a9092009-12-12 22:11:15 +00006267 /*
6268 * Prevent userspace races by waiting until the network
6269 * device is fully set up before sending notifications.
6270 */
6271 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6272
Eric W. Biedermance286d32007-09-12 13:53:49 +02006273 synchronize_net();
6274 err = 0;
6275out:
6276 return err;
6277}
Johannes Berg463d0182009-07-14 00:33:35 +02006278EXPORT_SYMBOL_GPL(dev_change_net_namespace);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006279
Linus Torvalds1da177e2005-04-16 15:20:36 -07006280static int dev_cpu_callback(struct notifier_block *nfb,
6281 unsigned long action,
6282 void *ocpu)
6283{
6284 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006285 struct sk_buff *skb;
6286 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6287 struct softnet_data *sd, *oldsd;
6288
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006289 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006290 return NOTIFY_OK;
6291
6292 local_irq_disable();
6293 cpu = smp_processor_id();
6294 sd = &per_cpu(softnet_data, cpu);
6295 oldsd = &per_cpu(softnet_data, oldcpu);
6296
6297 /* Find end of our completion_queue. */
6298 list_skb = &sd->completion_queue;
6299 while (*list_skb)
6300 list_skb = &(*list_skb)->next;
6301 /* Append completion queue from offline CPU. */
6302 *list_skb = oldsd->completion_queue;
6303 oldsd->completion_queue = NULL;
6304
Linus Torvalds1da177e2005-04-16 15:20:36 -07006305 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00006306 if (oldsd->output_queue) {
6307 *sd->output_queue_tailp = oldsd->output_queue;
6308 sd->output_queue_tailp = oldsd->output_queue_tailp;
6309 oldsd->output_queue = NULL;
6310 oldsd->output_queue_tailp = &oldsd->output_queue;
6311 }
Heiko Carstens264524d2011-06-06 20:50:03 +00006312 /* Append NAPI poll list from offline CPU. */
6313 if (!list_empty(&oldsd->poll_list)) {
6314 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6315 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6316 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006317
6318 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6319 local_irq_enable();
6320
6321 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00006322 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6323 netif_rx(skb);
6324 input_queue_head_incr(oldsd);
6325 }
Tom Herbertfec5e652010-04-16 16:01:27 -07006326 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006327 netif_rx(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00006328 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07006329 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006330
6331 return NOTIFY_OK;
6332}


/**
 * netdev_increment_features - increment feature set by one
 * @all: current feature set
 * @one: new feature set
 * @mask: mask feature set
 *
 * Computes a new feature set after adding a device with feature set
 * @one to the master device with current feature set @all.  Will not
 * enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
        netdev_features_t one, netdev_features_t mask)
{
        if (mask & NETIF_F_GEN_CSUM)
                mask |= NETIF_F_ALL_CSUM;
        mask |= NETIF_F_VLAN_CHALLENGED;

        all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
        all &= one | ~NETIF_F_ALL_FOR_ALL;

        /* If one device supports hw checksumming, set for all. */
        if (all & NETIF_F_GEN_CSUM)
                all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

        return all;
}
EXPORT_SYMBOL(netdev_increment_features);

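/*
 * Illustrative usage (a sketch, not part of the original file): a master
 * driver that aggregates several lower devices can fold their feature
 * sets into one like this.  The "master"/"lower" structures and their
 * field names are hypothetical:
 *
 *	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
 *
 *	list_for_each_entry(lower, &master->lower_list, list)
 *		features = netdev_increment_features(features,
 *						     lower->dev->features,
 *						     master->dev->features);
 *	master->dev->features = features;
 */
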
static struct hlist_head *netdev_create_hash(void)
{
        int i;
        struct hlist_head *hash;

        hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
        if (hash != NULL)
                for (i = 0; i < NETDEV_HASHENTRIES; i++)
                        INIT_HLIST_HEAD(&hash[i]);

        return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
        if (net != &init_net)
                INIT_LIST_HEAD(&net->dev_base_head);

        net->dev_name_head = netdev_create_hash();
        if (net->dev_name_head == NULL)
                goto err_name;

        net->dev_index_head = netdev_create_hash();
        if (net->dev_index_head == NULL)
                goto err_idx;

        return 0;

err_idx:
        kfree(net->dev_name_head);
err_name:
        return -ENOMEM;
}
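
/*
 * Note added for clarity (not in the original source): the two hash tables
 * allocated above back the per-namespace name and ifindex lookups, so
 * calls such as dev_get_by_name(net, "eth0") and
 * dev_get_by_index(net, ifindex) resolve through a short hash chain
 * instead of walking the whole device list.
 */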

/**
 * netdev_drivername - network driver for the device
 * @dev: network device
 *
 * Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
        const struct device_driver *driver;
        const struct device *parent;
        const char *empty = "";

        parent = dev->dev.parent;
        if (!parent)
                return empty;

        driver = parent->driver;
        if (driver && driver->name)
                return driver->name;
        return empty;
}
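/*
 * Illustrative usage (a sketch, not part of the original file): callers
 * such as the transmit watchdog use this helper to attribute a message to
 * the underlying driver, along the lines of:
 *
 *	pr_warn("NETDEV WATCHDOG: %s (%s): transmit queue timed out\n",
 *		dev->name, netdev_drivername(dev));
 */
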
static int __netdev_printk(const char *level, const struct net_device *dev,
                           struct va_format *vaf)
{
        int r;

        if (dev && dev->dev.parent) {
                r = dev_printk_emit(level[1] - '0',
                                    dev->dev.parent,
                                    "%s %s %s: %pV",
                                    dev_driver_string(dev->dev.parent),
                                    dev_name(dev->dev.parent),
                                    netdev_name(dev), vaf);
        } else if (dev) {
                r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
        } else {
                r = printk("%s(NULL net_device): %pV", level, vaf);
        }

        return r;
}

int netdev_printk(const char *level, const struct net_device *dev,
                  const char *format, ...)
{
        struct va_format vaf;
        va_list args;
        int r;

        va_start(args, format);

        vaf.fmt = format;
        vaf.va = &args;

        r = __netdev_printk(level, dev, &vaf);

        va_end(args);

        return r;
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)                         \
int func(const struct net_device *dev, const char *fmt, ...)           \
{                                                                       \
        int r;                                                          \
        struct va_format vaf;                                           \
        va_list args;                                                   \
                                                                        \
        va_start(args, fmt);                                            \
                                                                        \
        vaf.fmt = fmt;                                                  \
        vaf.va = &args;                                                 \
                                                                        \
        r = __netdev_printk(level, dev, &vaf);                          \
                                                                        \
        va_end(args);                                                   \
                                                                        \
        return r;                                                       \
}                                                                       \
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);

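/*
 * Illustrative usage (not part of the original file): drivers normally log
 * through these wrappers so the driver, bus and interface names are
 * prefixed automatically, e.g.:
 *
 *	netdev_err(dev, "failed to allocate RX ring: %d\n", err);
 *	netdev_info(dev, "link up, %u Mbps full duplex\n", speed);
 *
 * where "err" and "speed" stand in for a driver's own local variables.
 */
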
static void __net_exit netdev_exit(struct net *net)
{
        kfree(net->dev_name_head);
        kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
        struct net_device *dev, *aux;
        /*
         * Push all migratable network devices back to the
         * initial network namespace
         */
        rtnl_lock();
        for_each_netdev_safe(net, dev, aux) {
                int err;
                char fb_name[IFNAMSIZ];

                /* Ignore unmoveable devices (i.e. loopback) */
                if (dev->features & NETIF_F_NETNS_LOCAL)
                        continue;

                /* Leave virtual devices for the generic cleanup */
                if (dev->rtnl_link_ops)
                        continue;

                /* Push remaining network devices to init_net */
                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
                err = dev_change_net_namespace(dev, &init_net, fb_name);
                if (err) {
                        pr_emerg("%s: failed to move %s to init_net: %d\n",
                                 __func__, dev->name, err);
                        BUG();
                }
        }
        rtnl_unlock();
}
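
/*
 * Note added for clarity (not in the original source): a device opts out
 * of namespace moves by advertising NETIF_F_NETNS_LOCAL in its feature
 * flags; the loopback device does this in its setup routine, which is why
 * it is skipped above and instead torn down together with the namespace.
 */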

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
        /* At exit all network devices must be removed from a network
         * namespace.  Do this in the reverse order of registration.
         * Do this across as many network namespaces as possible to
         * improve batching efficiency.
         */
        struct net_device *dev;
        struct net *net;
        LIST_HEAD(dev_kill_list);

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                for_each_netdev_reverse(net, dev) {
                        if (dev->rtnl_link_ops)
                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
                        else
                                unregister_netdevice_queue(dev, &dev_kill_list);
                }
        }
        unregister_netdevice_many(&dev_kill_list);
        list_del(&dev_kill_list);
        rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
        .exit = default_device_exit,
        .exit_batch = default_device_exit_batch,
};

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
        int i, rc = -ENOMEM;

        BUG_ON(!dev_boot_phase);

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        INIT_LIST_HEAD(&ptype_all);
        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        INIT_LIST_HEAD(&offload_base);

        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         *	Initialise the packet receive queues.
         */

        for_each_possible_cpu(i) {
                struct softnet_data *sd = &per_cpu(softnet_data, i);

                memset(sd, 0, sizeof(*sd));
                skb_queue_head_init(&sd->input_pkt_queue);
                skb_queue_head_init(&sd->process_queue);
                sd->completion_queue = NULL;
                INIT_LIST_HEAD(&sd->poll_list);
                sd->output_queue = NULL;
                sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
                sd->csd.func = rps_trigger_softirq;
                sd->csd.info = sd;
                sd->csd.flags = 0;
                sd->cpu = i;
#endif

                sd->backlog.poll = process_backlog;
                sd->backlog.weight = weight_p;
                sd->backlog.gro_list = NULL;
                sd->backlog.gro_count = 0;
        }

        dev_boot_phase = 0;

        /* The loopback device is special: if any other network device
         * is present in a network namespace, the loopback device must be
         * present too.  Since we now dynamically allocate and free the
         * loopback device, ensure this invariant is maintained by
         * keeping the loopback device as the first device on the
         * list of network devices, so that it is the first device to
         * appear and the last network device to disappear.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        hotcpu_notifier(dev_cpu_callback, 0);
        dst_init();
        dev_mcast_init();
        rc = 0;
out:
        return rc;
}

subsys_initcall(net_dev_init);