/*
 *      NET3    Protocol independent device support routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 *      Derived from the non IP parts of dev.c 1.0.19
 *              Authors:        Ross Biro
 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *      Additional Authors:
 *              Florian la Roche <rzsfl@rz.uni-sb.de>
 *              Alan Cox <gw4pts@gw4pts.ampr.org>
 *              David Hinds <dahinds@users.sourceforge.net>
 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *              Adam Sulmicki <adam@cfar.umd.edu>
 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *      Changes:
 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 *                                      to 2 if register_netdev gets called
 *                                      before net_dev_init & also removed a
 *                                      few lines of code in the process.
 *              Alan Cox        :       device private ioctl copies fields back.
 *              Alan Cox        :       Transmit queue code does relevant
 *                                      stunts to keep the queue safe.
 *              Alan Cox        :       Fixed double lock.
 *              Alan Cox        :       Fixed promisc NULL pointer trap
 *              ????????        :       Support the full private ioctl range
 *              Alan Cox        :       Moved ioctl permission check into
 *                                      drivers
 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
 *              Alan Cox        :       100 backlog just doesn't cut it when
 *                                      you start doing multicast video 8)
 *              Alan Cox        :       Rewrote net_bh and list manager.
 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 *              Alan Cox        :       Took out transmit every packet pass
 *                                      Saved a few bytes in the ioctl handler
 *              Alan Cox        :       Network driver sets packet type before
 *                                      calling netif_rx. Saves a function
 *                                      call a packet.
 *              Alan Cox        :       Hashed net_bh()
 *              Richard Kooijman:       Timestamp fixes.
 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
 *              Alan Cox        :       Device lock protection.
 *              Alan Cox        :       Fixed nasty side effect of device close
 *                                      changes.
 *              Rudi Cilibrasi  :       Pass the right thing to
 *                                      set_mac_address()
 *              Dave Miller     :       32bit quantity for the device lock to
 *                                      make it work out on a Sparc.
 *              Bjorn Ekwall    :       Added KERNELD hack.
 *              Alan Cox        :       Cleaned up the backlog initialise.
 *              Craig Metz      :       SIOCGIFCONF fix if space for under
 *                                      1 device.
 *              Thomas Bogendoerfer :   Return ENODEV for dev_open, if there
 *                                      is no device open function.
 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
 *              Michael Chastain:       Fix signed/unsigned for SIOCGIFCONF
 *              Cyrus Durgin    :       Cleaned for KMOD
 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
 *                                      A network device unload needs to purge
 *                                      the backlog queue.
 *              Paul Rusty Russell :    SIOCSIFNAME
 *              Pekka Riikonen  :       Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *                                      indefinitely on dev->refcnt
 *              J Hadi Salim    :       - Backlog queue sampling
 *                                      - netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;       /* Taps */
static struct list_head offload_base __read_mostly;

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
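
/*
 * Illustrative sketch (not part of this file): the two reader modes the
 * locking comment above describes. A reader can either take dev_base_lock
 * for reading or rely on rcu_read_lock(); writers always hold the RTNL and
 * take dev_base_lock for writing. The function name below is made up.
 *
 *      static void example_walk_netdevs(struct net *net)
 *      {
 *              struct net_device *dev;
 *
 *              rcu_read_lock();
 *              for_each_netdev_rcu(net, dev)
 *                      pr_info("saw %s\n", dev->name);
 *              rcu_read_unlock();
 *      }
 */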

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
        while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
        unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
        struct net *net = dev_net(dev);

        ASSERT_RTNL();

        write_lock_bh(&dev_base_lock);
        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        hlist_add_head_rcu(&dev->index_hlist,
                           dev_index_hash(net, dev->ifindex));
        write_unlock_bh(&dev_base_lock);

        dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
        ASSERT_RTNL();

        /* Unlink dev from the device chain */
        write_lock_bh(&dev_base_lock);
        list_del_rcu(&dev->dev_list);
        hlist_del_rcu(&dev->name_hlist);
        hlist_del_rcu(&dev->index_hlist);
        write_unlock_bh(&dev_base_lock);

        dev_base_seq_inc(dev_net(dev));
}

/*
 *      Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
                if (netdev_lock_type[i] == dev_type)
                        return i;
        /* the last key is used by default */
        return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
        int i;

        i = netdev_lock_pos(dev_type);
        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
                                   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
        int i;

        i = netdev_lock_pos(dev->type);
        lockdep_set_class_and_name(&dev->addr_list_lock,
                                   &netdev_addr_lock_key[i],
                                   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

                Protocol management and registration routines

*******************************************************************************/

/*
 *      Add a protocol ID to the list. Now that the input handler is
 *      smarter we can dispense with all the messy stuff that used to be
 *      here.
 *
 *      BEWARE!!! Protocol handlers, mangling input packets,
 *      MUST BE last in hash buckets and checking protocol handlers
 *      MUST start from promiscuous ptype_all chain in net_bh.
 *      It is true now, do not change it.
 *      Explanation follows: if protocol handler, mangling packet, will
 *      be the first on list, it is not able to sense, that packet
 *      is cloned and should be copied-on-write, so that it will
 *      change it and subsequent readers will get broken packet.
 *                                                      --ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
        if (pt->type == htons(ETH_P_ALL))
                return &ptype_all;
        else
                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *      dev_add_pack - add packet handler
 *      @pt: packet type declaration
 *
 *      Add a protocol handler to the networking stack. The passed &packet_type
 *      is linked into kernel lists and may not be freed until it has been
 *      removed from the kernel lists.
 *
 *      This call does not sleep therefore it can not
 *      guarantee all CPU's that are in middle of receiving packets
 *      will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);

        spin_lock(&ptype_lock);
        list_add_rcu(&pt->list, head);
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *      __dev_remove_pack - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      The packet type might still be in use by receivers
 *      and must not be freed until after all the CPU's have gone
 *      through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);
        struct packet_type *pt1;

        spin_lock(&ptype_lock);

        list_for_each_entry(pt1, head, list) {
                if (pt == pt1) {
                        list_del_rcu(&pt->list);
                        goto out;
                }
        }

        pr_warn("dev_remove_pack: %p not found\n", pt);
out:
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *      dev_remove_pack - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the packet
 *      type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        __dev_remove_pack(pt);

        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
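
/*
 * Illustrative sketch (not part of this file): how a hypothetical module
 * would use dev_add_pack()/dev_remove_pack(). The handler, structure and
 * ethertype below are made up for illustration only.
 *
 *      static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *                             struct packet_type *pt, struct net_device *orig_dev)
 *      {
 *              kfree_skb(skb);         // consume everything we are handed
 *              return NET_RX_SUCCESS;
 *      }
 *
 *      static struct packet_type example_pt __read_mostly = {
 *              .type = cpu_to_be16(0x88b5),    // IEEE local experimental ethertype
 *              .func = example_rcv,
 *      };
 *
 *      // module init: dev_add_pack(&example_pt);
 *      // module exit: dev_remove_pack(&example_pt);
 */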


/**
 *      dev_add_offload - register offload handlers
 *      @po: protocol offload declaration
 *
 *      Add protocol offload handlers to the networking stack. The passed
 *      &proto_offload is linked into kernel lists and may not be freed until
 *      it has been removed from the kernel lists.
 *
 *      This call does not sleep therefore it can not
 *      guarantee all CPU's that are in middle of receiving packets
 *      will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
        struct list_head *head = &offload_base;

        spin_lock(&offload_lock);
        list_add_rcu(&po->list, head);
        spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *      __dev_remove_offload - remove offload handler
 *      @po: packet offload declaration
 *
 *      Remove a protocol offload handler that was previously added to the
 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 *      is removed from the kernel lists and can be freed or reused once this
 *      function returns.
 *
 *      The packet type might still be in use by receivers
 *      and must not be freed until after all the CPU's have gone
 *      through a quiescent state.
 */
void __dev_remove_offload(struct packet_offload *po)
{
        struct list_head *head = &offload_base;
        struct packet_offload *po1;

        spin_lock(&offload_lock);

        list_for_each_entry(po1, head, list) {
                if (po == po1) {
                        list_del_rcu(&po->list);
                        goto out;
                }
        }

        pr_warn("dev_remove_offload: %p not found\n", po);
out:
        spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(__dev_remove_offload);

/**
 *      dev_remove_offload - remove packet offload handler
 *      @po: packet offload declaration
 *
 *      Remove a packet offload handler that was previously added to the kernel
 *      offload handlers by dev_add_offload(). The passed &offload_type is
 *      removed from the kernel lists and can be freed or reused once this
 *      function returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the packet
 *      type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
        __dev_remove_offload(po);

        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
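
/*
 * Illustrative sketch (not part of this file): registering GRO callbacks
 * for a protocol with dev_add_offload(). The structure name and the
 * callbacks (example_gro_receive, example_gro_complete) are hypothetical.
 *
 *      static struct packet_offload example_offload __read_mostly = {
 *              .type = cpu_to_be16(ETH_P_IP),
 *              .callbacks = {
 *                      .gro_receive    = example_gro_receive,
 *                      .gro_complete   = example_gro_complete,
 *              },
 *      };
 *
 *      // registration:   dev_add_offload(&example_offload);
 *      // unregistration: dev_remove_offload(&example_offload);
 */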

/******************************************************************************

                      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *      netdev_boot_setup_add - add new setup entry
 *      @name: name of the device
 *      @map: configured settings for the device
 *
 *      Adds new setup entry to the dev_boot_setup list.  The function
 *      returns 0 on error and 1 on success.  This is a generic routine to
 *      all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
        struct netdev_boot_setup *s;
        int i;

        s = dev_boot_setup;
        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
                        memset(s[i].name, 0, sizeof(s[i].name));
                        strlcpy(s[i].name, name, IFNAMSIZ);
                        memcpy(&s[i].map, map, sizeof(s[i].map));
                        break;
                }
        }

        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *      netdev_boot_setup_check - check boot time settings
 *      @dev: the netdevice
 *
 *      Check boot time settings for the device.
 *      The found settings are set for the device to be used
 *      later in the device probing.
 *      Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
        struct netdev_boot_setup *s = dev_boot_setup;
        int i;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
                    !strcmp(dev->name, s[i].name)) {
                        dev->irq        = s[i].map.irq;
                        dev->base_addr  = s[i].map.base_addr;
                        dev->mem_start  = s[i].map.mem_start;
                        dev->mem_end    = s[i].map.mem_end;
                        return 1;
                }
        }
        return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *      netdev_boot_base - get address from boot time settings
 *      @prefix: prefix for network device
 *      @unit: id for network device
 *
 *      Check boot time settings for the base address of device.
 *      The found settings are set for the device to be used
 *      later in the device probing.
 *      Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
        const struct netdev_boot_setup *s = dev_boot_setup;
        char name[IFNAMSIZ];
        int i;

        sprintf(name, "%s%d", prefix, unit);

        /*
         * If device already registered then return base of 1
         * to indicate not to probe for this interface
         */
        if (__dev_get_by_name(&init_net, name))
                return 1;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
                if (!strcmp(name, s[i].name))
                        return s[i].map.base_addr;
        return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
        int ints[5];
        struct ifmap map;

        str = get_options(str, ARRAY_SIZE(ints), ints);
        if (!str || !*str)
                return 0;

        /* Save settings */
        memset(&map, 0, sizeof(map));
        if (ints[0] > 0)
                map.irq = ints[1];
        if (ints[0] > 1)
                map.base_addr = ints[2];
        if (ints[0] > 2)
                map.mem_start = ints[3];
        if (ints[0] > 3)
                map.mem_end = ints[4];

        /* Add new entry to the list */
        return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
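
/*
 * Illustrative example (not part of this file): the "netdev=" boot
 * parameter parsed above takes up to four integers followed by a name,
 * i.e. irq, base_addr, mem_start, mem_end, name. A command line such as
 *
 *      netdev=9,0x300,0,0,eth1
 *
 * would record IRQ 9 and I/O base 0x300 for the device later probed as
 * "eth1". The specific values here are made up for illustration.
 */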

/*******************************************************************************

                            Device Interface Subroutines

*******************************************************************************/

/**
 *      __dev_get_by_name - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name. Must be called under RTNL semaphore
 *      or @dev_base_lock. If the name is found a pointer to the device
 *      is returned. If the name is not found then %NULL is returned. The
 *      reference counters are not incremented so the caller must be
 *      careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry(dev, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *      dev_get_by_name_rcu - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name.
 *      If the name is found a pointer to the device is returned.
 *      If the name is not found then %NULL is returned.
 *      The reference counters are not incremented so the caller must be
 *      careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry_rcu(dev, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *      dev_get_by_name - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name. This can be called from any
 *      context and does its own locking. The returned handle has
 *      the usage count incremented and the caller must use dev_put() to
 *      release it when it is no longer needed. %NULL is returned if no
 *      matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
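
/*
 * Illustrative sketch (not part of this file): the two lookup styles above.
 * dev_get_by_name() takes a reference that must be dropped with dev_put();
 * dev_get_by_name_rcu() does not, but is only valid inside an RCU read-side
 * section. "eth0" and the function name below are illustrative only.
 *
 *      static bool example_is_up(struct net *net)
 *      {
 *              struct net_device *dev;
 *              bool up = false;
 *
 *              dev = dev_get_by_name(net, "eth0");
 *              if (dev) {
 *                      up = netif_running(dev);
 *                      dev_put(dev);
 *              }
 *              return up;
 *      }
 */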

/**
 *      __dev_get_by_index - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns %NULL if the device
 *      is not found or a pointer to the device. The device has not
 *      had its reference counter increased so the caller must be careful
 *      about locking. The caller must hold either the RTNL semaphore
 *      or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *      dev_get_by_index_rcu - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns %NULL if the device
 *      is not found or a pointer to the device. The device has not
 *      had its reference counter increased so the caller must be careful
 *      about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry_rcu(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *      dev_get_by_index - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns NULL if the device
 *      is not found or a pointer to the device. The device returned has
 *      had a reference added and the pointer is safe until the user calls
 *      dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 *      @net: the applicable net namespace
 *      @type: media type of device
 *      @ha: hardware address
 *
 *      Search for an interface by MAC address. Returns NULL if the device
 *      is not found or a pointer to the device.
 *      The caller must hold RCU or RTNL.
 *      The returned device has not had its ref count increased
 *      and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *ha)
{
        struct net_device *dev;

        for_each_netdev_rcu(net, dev)
                if (dev->type == type &&
                    !memcmp(dev->dev_addr, ha, dev->addr_len))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
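
/*
 * Illustrative sketch (not part of this file): dev_getbyhwaddr_rcu() must be
 * called with RCU (or the RTNL) held, and the returned pointer may only be
 * dereferenced inside that section since no reference is taken. The function
 * name below is made up; only existence is checked here.
 *
 *      static bool example_mac_exists(struct net *net, const char *mac)
 *      {
 *              struct net_device *dev;
 *
 *              rcu_read_lock();
 *              dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
 *              rcu_read_unlock();
 *              return dev != NULL;
 *      }
 */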

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev;

        ASSERT_RTNL();
        for_each_netdev(net, dev)
                if (dev->type == type)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev, *ret = NULL;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                if (dev->type == type) {
                        dev_hold(dev);
                        ret = dev;
                        break;
                }
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *      dev_get_by_flags_rcu - find any device with given flags
 *      @net: the applicable net namespace
 *      @if_flags: IFF_* values
 *      @mask: bitmask of bits in if_flags to check
 *
 *      Search for any interface with the given flags. Returns NULL if a device
 *      is not found or a pointer to the device. Must be called inside
 *      rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
                                        unsigned short mask)
{
        struct net_device *dev, *ret;

        ret = NULL;
        for_each_netdev_rcu(net, dev) {
                if (((dev->flags ^ if_flags) & mask) == 0) {
                        ret = dev;
                        break;
                }
        }
        return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *      dev_valid_name - check if name is okay for network device
 *      @name: name string
 *
 *      Network device names need to be valid file names to
 *      allow sysfs to work.  We also disallow any kind of
 *      whitespace.
 */
bool dev_valid_name(const char *name)
{
        if (*name == '\0')
                return false;
        if (strlen(name) >= IFNAMSIZ)
                return false;
        if (!strcmp(name, ".") || !strcmp(name, ".."))
                return false;

        while (*name) {
                if (*name == '/' || isspace(*name))
                        return false;
                name++;
        }
        return true;
}
EXPORT_SYMBOL(dev_valid_name);
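
/*
 * Illustrative examples (not part of this file): with the rules above,
 * dev_valid_name("eth0") and dev_valid_name("br-lan") are true, while
 * dev_valid_name(""), dev_valid_name("."), dev_valid_name("a b") and any
 * name of IFNAMSIZ characters or more are false.
 */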

/**
 *      __dev_alloc_name - allocate a name for a device
 *      @net: network namespace to allocate the device name in
 *      @name: name format string
 *      @buf:  scratch buffer and result name string
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be either one "%d" and no other "%"
                 * characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                for_each_netdev(net, d) {
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /*  avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        if (buf != name)
                snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}

/**
 *      dev_alloc_name - allocate a name for a device
 *      @dev: device
 *      @name: name format string
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot.  The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
        char buf[IFNAMSIZ];
        struct net *net;
        int ret;

        BUG_ON(!dev_net(dev));
        net = dev_net(dev);
        ret = __dev_alloc_name(net, name, buf);
        if (ret >= 0)
                strlcpy(dev->name, buf, IFNAMSIZ);
        return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
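
/*
 * Illustrative sketch (not part of this file): a driver that names its
 * devices from a template such as "foo%d" (a made-up template) would
 * typically let dev_alloc_name() pick the first free unit number before
 * registering the device; register_netdev() reaches the same helper for
 * "%d" names via dev_get_valid_name().
 *
 *      err = dev_alloc_name(dev, "foo%d");
 *      if (err < 0)
 *              goto out_free;
 *      // dev->name is now e.g. "foo0", and err holds the unit number
 */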
Eric W. Biedermanb267b172007-09-12 13:48:45 +02001006
Gao feng828de4f2012-09-13 20:58:27 +00001007static int dev_alloc_name_ns(struct net *net,
1008 struct net_device *dev,
1009 const char *name)
Octavian Purdilad9031022009-11-18 02:36:59 +00001010{
Gao feng828de4f2012-09-13 20:58:27 +00001011 char buf[IFNAMSIZ];
1012 int ret;
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00001013
Gao feng828de4f2012-09-13 20:58:27 +00001014 ret = __dev_alloc_name(net, name, buf);
1015 if (ret >= 0)
1016 strlcpy(dev->name, buf, IFNAMSIZ);
1017 return ret;
1018}
1019
1020static int dev_get_valid_name(struct net *net,
1021 struct net_device *dev,
1022 const char *name)
1023{
1024 BUG_ON(!net);
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00001025
Octavian Purdilad9031022009-11-18 02:36:59 +00001026 if (!dev_valid_name(name))
1027 return -EINVAL;
1028
Jiri Pirko1c5cae82011-04-30 01:21:32 +00001029 if (strchr(name, '%'))
Gao feng828de4f2012-09-13 20:58:27 +00001030 return dev_alloc_name_ns(net, dev, name);
Octavian Purdilad9031022009-11-18 02:36:59 +00001031 else if (__dev_get_by_name(net, name))
1032 return -EEXIST;
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00001033 else if (dev->name != name)
1034 strlcpy(dev->name, name, IFNAMSIZ);
Octavian Purdilad9031022009-11-18 02:36:59 +00001035
1036 return 0;
1037}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001038
1039/**
1040 * dev_change_name - change name of a device
1041 * @dev: device
1042 * @newname: name (or format string) must be at least IFNAMSIZ
1043 *
1044 * Change name of a device, can pass format strings "eth%d".
1045 * for wildcarding.
1046 */
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07001047int dev_change_name(struct net_device *dev, const char *newname)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001048{
Herbert Xufcc5a032007-07-30 17:03:38 -07001049 char oldname[IFNAMSIZ];
Linus Torvalds1da177e2005-04-16 15:20:36 -07001050 int err = 0;
Herbert Xufcc5a032007-07-30 17:03:38 -07001051 int ret;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001052 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001053
1054 ASSERT_RTNL();
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001055 BUG_ON(!dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001056
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001057 net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001058 if (dev->flags & IFF_UP)
1059 return -EBUSY;
1060
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001061 write_seqcount_begin(&devnet_rename_seq);
Brian Haleyc91f6df2012-11-26 05:21:08 +00001062
1063 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001064 write_seqcount_end(&devnet_rename_seq);
Stephen Hemmingerc8d90dc2007-10-26 03:53:42 -07001065 return 0;
Brian Haleyc91f6df2012-11-26 05:21:08 +00001066 }
Stephen Hemmingerc8d90dc2007-10-26 03:53:42 -07001067
Herbert Xufcc5a032007-07-30 17:03:38 -07001068 memcpy(oldname, dev->name, IFNAMSIZ);
1069
Gao feng828de4f2012-09-13 20:58:27 +00001070 err = dev_get_valid_name(net, dev, newname);
Brian Haleyc91f6df2012-11-26 05:21:08 +00001071 if (err < 0) {
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001072 write_seqcount_end(&devnet_rename_seq);
Octavian Purdilad9031022009-11-18 02:36:59 +00001073 return err;
Brian Haleyc91f6df2012-11-26 05:21:08 +00001074 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001075
Herbert Xufcc5a032007-07-30 17:03:38 -07001076rollback:
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07001077 ret = device_rename(&dev->dev, dev->name);
1078 if (ret) {
1079 memcpy(dev->name, oldname, IFNAMSIZ);
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001080 write_seqcount_end(&devnet_rename_seq);
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07001081 return ret;
Stephen Hemmingerdcc99772008-05-14 22:33:38 -07001082 }
Herbert Xu7f988ea2007-07-30 16:35:46 -07001083
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001084 write_seqcount_end(&devnet_rename_seq);
Brian Haleyc91f6df2012-11-26 05:21:08 +00001085
Herbert Xu7f988ea2007-07-30 16:35:46 -07001086 write_lock_bh(&dev_base_lock);
Eric Dumazet372b2312011-05-17 13:56:59 -04001087 hlist_del_rcu(&dev->name_hlist);
Eric Dumazet72c95282009-10-30 07:11:27 +00001088 write_unlock_bh(&dev_base_lock);
1089
1090 synchronize_rcu();
1091
1092 write_lock_bh(&dev_base_lock);
1093 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
Herbert Xu7f988ea2007-07-30 16:35:46 -07001094 write_unlock_bh(&dev_base_lock);
1095
Pavel Emelyanov056925a2007-09-16 15:42:43 -07001096 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001097 ret = notifier_to_errno(ret);
1098
1099 if (ret) {
Eric Dumazet91e9c07b2009-11-15 23:30:24 +00001100 /* err >= 0 after dev_alloc_name() or stores the first errno */
1101 if (err >= 0) {
Herbert Xufcc5a032007-07-30 17:03:38 -07001102 err = ret;
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001103 write_seqcount_begin(&devnet_rename_seq);
Herbert Xufcc5a032007-07-30 17:03:38 -07001104 memcpy(dev->name, oldname, IFNAMSIZ);
1105 goto rollback;
Eric Dumazet91e9c07b2009-11-15 23:30:24 +00001106 } else {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001107 pr_err("%s: name change rollback failed: %d\n",
Eric Dumazet91e9c07b2009-11-15 23:30:24 +00001108 dev->name, ret);
Herbert Xufcc5a032007-07-30 17:03:38 -07001109 }
1110 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001111
1112 return err;
1113}
1114
1115/**
Stephen Hemminger0b815a12008-09-22 21:28:11 -07001116 * dev_set_alias - change ifalias of a device
1117 * @dev: device
1118 * @alias: name up to IFALIASZ
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07001119 * @len: limit of bytes to copy from info
Stephen Hemminger0b815a12008-09-22 21:28:11 -07001120 *
1121 * Set ifalias for a device,
1122 */
1123int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1124{
Alexey Khoroshilov7364e442012-08-08 00:33:25 +00001125 char *new_ifalias;
1126
Stephen Hemminger0b815a12008-09-22 21:28:11 -07001127 ASSERT_RTNL();
1128
1129 if (len >= IFALIASZ)
1130 return -EINVAL;
1131
Oliver Hartkopp96ca4a22008-09-23 21:23:19 -07001132 if (!len) {
Sachin Kamat388dfc22012-11-20 00:57:04 +00001133 kfree(dev->ifalias);
1134 dev->ifalias = NULL;
Oliver Hartkopp96ca4a22008-09-23 21:23:19 -07001135 return 0;
1136 }
1137
Alexey Khoroshilov7364e442012-08-08 00:33:25 +00001138 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1139 if (!new_ifalias)
Stephen Hemminger0b815a12008-09-22 21:28:11 -07001140 return -ENOMEM;
Alexey Khoroshilov7364e442012-08-08 00:33:25 +00001141 dev->ifalias = new_ifalias;
Stephen Hemminger0b815a12008-09-22 21:28:11 -07001142
1143 strlcpy(dev->ifalias, alias, len+1);
1144 return len;
1145}
1146
1147
1148/**
Stephen Hemminger3041a062006-05-26 13:25:24 -07001149 * netdev_features_change - device changes features
Stephen Hemmingerd8a33ac2005-05-29 14:13:47 -07001150 * @dev: device to cause notification
1151 *
1152 * Called to indicate a device has changed features.
1153 */
1154void netdev_features_change(struct net_device *dev)
1155{
Pavel Emelyanov056925a2007-09-16 15:42:43 -07001156 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
Stephen Hemmingerd8a33ac2005-05-29 14:13:47 -07001157}
1158EXPORT_SYMBOL(netdev_features_change);
1159
1160/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001161 * netdev_state_change - device changes state
1162 * @dev: device to cause notification
1163 *
1164 * Called to indicate a device has changed state. This function calls
1165 * the notifier chains for netdev_chain and sends a NEWLINK message
1166 * to the routing socket.
1167 */
1168void netdev_state_change(struct net_device *dev)
1169{
1170 if (dev->flags & IFF_UP) {
Pavel Emelyanov056925a2007-09-16 15:42:43 -07001171 call_netdevice_notifiers(NETDEV_CHANGE, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001172 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1173 }
1174}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001175EXPORT_SYMBOL(netdev_state_change);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001176
Amerigo Wangee89bab2012-08-09 22:14:56 +00001177/**
1178 * netdev_notify_peers - notify network peers about existence of @dev
1179 * @dev: network device
1180 *
1181 * Generate traffic such that interested network peers are aware of
1182 * @dev, such as by generating a gratuitous ARP. This may be used when
1183 * a device wants to inform the rest of the network about some sort of
1184 * reconfiguration such as a failover event or virtual machine
1185 * migration.
1186 */
1187void netdev_notify_peers(struct net_device *dev)
Or Gerlitzc1da4ac2008-06-13 18:12:00 -07001188{
Amerigo Wangee89bab2012-08-09 22:14:56 +00001189 rtnl_lock();
1190 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1191 rtnl_unlock();
Or Gerlitzc1da4ac2008-06-13 18:12:00 -07001192}
Amerigo Wangee89bab2012-08-09 22:14:56 +00001193EXPORT_SYMBOL(netdev_notify_peers);
Or Gerlitzc1da4ac2008-06-13 18:12:00 -07001194
Patrick McHardybd380812010-02-26 06:34:53 +00001195static int __dev_open(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001196{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001197 const struct net_device_ops *ops = dev->netdev_ops;
Johannes Berg3b8bcfd2009-05-30 01:39:53 +02001198 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001199
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001200 ASSERT_RTNL();
1201
Linus Torvalds1da177e2005-04-16 15:20:36 -07001202 if (!netif_device_present(dev))
1203 return -ENODEV;
1204
Neil Hormanca99ca12013-02-05 08:05:43 +00001205 /* Block netpoll from trying to do any rx path servicing.
1206 * If we don't do this there is a chance ndo_poll_controller
1207 * or ndo_poll may be running while we open the device
1208 */
dingtianhongda6e3782013-05-27 19:53:31 +00001209 netpoll_rx_disable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001210
Johannes Berg3b8bcfd2009-05-30 01:39:53 +02001211 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1212 ret = notifier_to_errno(ret);
1213 if (ret)
1214 return ret;
1215
Linus Torvalds1da177e2005-04-16 15:20:36 -07001216 set_bit(__LINK_STATE_START, &dev->state);
Jeff Garzikbada3392007-10-23 20:19:37 -07001217
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001218 if (ops->ndo_validate_addr)
1219 ret = ops->ndo_validate_addr(dev);
Jeff Garzikbada3392007-10-23 20:19:37 -07001220
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001221 if (!ret && ops->ndo_open)
1222 ret = ops->ndo_open(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001223
Neil Hormanca99ca12013-02-05 08:05:43 +00001224 netpoll_rx_enable(dev);
1225
Jeff Garzikbada3392007-10-23 20:19:37 -07001226 if (ret)
1227 clear_bit(__LINK_STATE_START, &dev->state);
1228 else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001229 dev->flags |= IFF_UP;
David S. Millerb4bd07c2009-02-06 22:06:43 -08001230 net_dmaengine_get();
Patrick McHardy4417da62007-06-27 01:28:10 -07001231 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001232 dev_activate(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04001233 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001234 }
Jeff Garzikbada3392007-10-23 20:19:37 -07001235
Linus Torvalds1da177e2005-04-16 15:20:36 -07001236 return ret;
1237}
Patrick McHardybd380812010-02-26 06:34:53 +00001238
1239/**
1240 * dev_open - prepare an interface for use.
1241 * @dev: device to open
1242 *
1243 * Takes a device from down to up state. The device's private open
1244 * function is invoked and then the multicast lists are loaded. Finally
1245 * the device is moved into the up state and a %NETDEV_UP message is
1246 * sent to the netdev notifier chain.
1247 *
1248 * Calling this function on an active interface is a nop. On a failure
1249 * a negative errno code is returned.
1250 */
1251int dev_open(struct net_device *dev)
1252{
1253 int ret;
1254
Patrick McHardybd380812010-02-26 06:34:53 +00001255 if (dev->flags & IFF_UP)
1256 return 0;
1257
Patrick McHardybd380812010-02-26 06:34:53 +00001258 ret = __dev_open(dev);
1259 if (ret < 0)
1260 return ret;
1261
Patrick McHardybd380812010-02-26 06:34:53 +00001262 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1263 call_netdevice_notifiers(NETDEV_UP, dev);
1264
1265 return ret;
1266}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001267EXPORT_SYMBOL(dev_open);
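/*
 * Hedged example (hypothetical caller, not from this file): dev_open()
 * must run under the rtnl lock, so an in-kernel user that brings an
 * interface up by name might look roughly like this.
 */
#if 0
static int example_bring_up(struct net *net, const char *name)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, name);
	if (dev)
		err = dev_open(dev);	/* nop if the device is already up */
	rtnl_unlock();
	return err;
}
#endif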
Linus Torvalds1da177e2005-04-16 15:20:36 -07001268
Octavian Purdila44345722010-12-13 12:44:07 +00001269static int __dev_close_many(struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001270{
Octavian Purdila44345722010-12-13 12:44:07 +00001271 struct net_device *dev;
Patrick McHardybd380812010-02-26 06:34:53 +00001272
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001273 ASSERT_RTNL();
David S. Miller9d5010d2007-09-12 14:33:25 +02001274 might_sleep();
1275
Octavian Purdila44345722010-12-13 12:44:07 +00001276 list_for_each_entry(dev, head, unreg_list) {
Octavian Purdila44345722010-12-13 12:44:07 +00001277 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001278
Octavian Purdila44345722010-12-13 12:44:07 +00001279 clear_bit(__LINK_STATE_START, &dev->state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001280
Octavian Purdila44345722010-12-13 12:44:07 +00001281		/* Synchronize to scheduled poll. We cannot touch the poll list,
1282		 * it can even be on a different cpu. So just clear netif_running().
1283		 *
1284		 * dev->stop() will invoke napi_disable() on all of its
1285		 * napi_struct instances on this device.
1286 */
1287 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1288 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001289
Octavian Purdila44345722010-12-13 12:44:07 +00001290 dev_deactivate_many(head);
1291
1292 list_for_each_entry(dev, head, unreg_list) {
1293 const struct net_device_ops *ops = dev->netdev_ops;
1294
1295 /*
1296		 * Call the device-specific close. This cannot fail
1297		 * and is only done while the device is UP.
1298 *
1299 * We allow it to be called even after a DETACH hot-plug
1300 * event.
1301 */
1302 if (ops->ndo_stop)
1303 ops->ndo_stop(dev);
1304
Octavian Purdila44345722010-12-13 12:44:07 +00001305 dev->flags &= ~IFF_UP;
Octavian Purdila44345722010-12-13 12:44:07 +00001306 net_dmaengine_put();
1307 }
1308
1309 return 0;
1310}
1311
1312static int __dev_close(struct net_device *dev)
1313{
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001314 int retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001315 LIST_HEAD(single);
1316
Neil Hormanca99ca12013-02-05 08:05:43 +00001317 /* Temporarily disable netpoll until the interface is down */
dingtianhongda6e3782013-05-27 19:53:31 +00001318 netpoll_rx_disable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001319
Octavian Purdila44345722010-12-13 12:44:07 +00001320 list_add(&dev->unreg_list, &single);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001321 retval = __dev_close_many(&single);
1322 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001323
1324 netpoll_rx_enable(dev);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001325 return retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001326}
1327
Eric Dumazet3fbd8752011-01-19 21:23:22 +00001328static int dev_close_many(struct list_head *head)
Octavian Purdila44345722010-12-13 12:44:07 +00001329{
1330 struct net_device *dev, *tmp;
1331 LIST_HEAD(tmp_list);
1332
1333 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1334 if (!(dev->flags & IFF_UP))
1335 list_move(&dev->unreg_list, &tmp_list);
1336
1337 __dev_close_many(head);
Matti Linnanvuorid8b2a4d2008-02-12 23:10:11 -08001338
Octavian Purdila44345722010-12-13 12:44:07 +00001339 list_for_each_entry(dev, head, unreg_list) {
1340 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1341 call_netdevice_notifiers(NETDEV_DOWN, dev);
1342 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001343
Octavian Purdila44345722010-12-13 12:44:07 +00001344 /* rollback_registered_many needs the complete original list */
1345 list_splice(&tmp_list, head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001346 return 0;
1347}
Patrick McHardybd380812010-02-26 06:34:53 +00001348
1349/**
1350 * dev_close - shutdown an interface.
1351 * @dev: device to shutdown
1352 *
1353 * This function moves an active device into down state. A
1354 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1355 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1356 * chain.
1357 */
1358int dev_close(struct net_device *dev)
1359{
Eric Dumazete14a5992011-05-10 12:26:06 -07001360 if (dev->flags & IFF_UP) {
1361 LIST_HEAD(single);
Patrick McHardybd380812010-02-26 06:34:53 +00001362
Neil Hormanca99ca12013-02-05 08:05:43 +00001363 /* Block netpoll rx while the interface is going down */
dingtianhongda6e3782013-05-27 19:53:31 +00001364 netpoll_rx_disable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001365
Eric Dumazete14a5992011-05-10 12:26:06 -07001366 list_add(&dev->unreg_list, &single);
1367 dev_close_many(&single);
1368 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001369
1370 netpoll_rx_enable(dev);
Eric Dumazete14a5992011-05-10 12:26:06 -07001371 }
dingtianhongda6e3782013-05-27 19:53:31 +00001372 return 0;
Patrick McHardybd380812010-02-26 06:34:53 +00001373}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001374EXPORT_SYMBOL(dev_close);
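/*
 * Hedged example (hypothetical helper, not from this file): like
 * dev_open(), dev_close() expects the caller to hold the rtnl lock.
 */
#if 0
static void example_take_down(struct net_device *dev)
{
	rtnl_lock();
	dev_close(dev);		/* nop if the device is already down */
	rtnl_unlock();
}
#endif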
Linus Torvalds1da177e2005-04-16 15:20:36 -07001375
1376
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001377/**
1378 * dev_disable_lro - disable Large Receive Offload on a device
1379 * @dev: device
1380 *
1381 * Disable Large Receive Offload (LRO) on a net device. Must be
1382 * called under RTNL. This is needed if received packets may be
1383 * forwarded to another interface.
1384 */
1385void dev_disable_lro(struct net_device *dev)
1386{
Neil Hormanf11970e2011-05-24 08:31:09 +00001387 /*
1388 * If we're trying to disable lro on a vlan device
1389 * use the underlying physical device instead
1390 */
1391 if (is_vlan_dev(dev))
1392 dev = vlan_dev_real_dev(dev);
1393
Michał Mirosławbc5787c62011-11-15 15:29:55 +00001394 dev->wanted_features &= ~NETIF_F_LRO;
1395 netdev_update_features(dev);
Michał Mirosław27660512011-03-18 16:56:34 +00001396
Michał Mirosław22d59692011-04-21 12:42:15 +00001397 if (unlikely(dev->features & NETIF_F_LRO))
1398 netdev_WARN(dev, "failed to disable LRO!\n");
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001399}
1400EXPORT_SYMBOL(dev_disable_lro);
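/*
 * Illustrative sketch, not from this file: forwarding setups (bridging,
 * bonding, routing) are expected to turn LRO off on lower devices.  A
 * hypothetical enslave path, already running under rtnl, could do so
 * like this.
 */
#if 0
static int example_enslave(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();
	dev_disable_lro(slave);
	/* ... continue with master-specific setup ... */
	return 0;
}
#endif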
1401
Jiri Pirko351638e2013-05-28 01:30:21 +00001402static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1403 struct net_device *dev)
1404{
1405 struct netdev_notifier_info info;
1406
1407 netdev_notifier_info_init(&info, dev);
1408 return nb->notifier_call(nb, val, &info);
1409}
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001410
Eric W. Biederman881d9662007-09-17 11:56:21 -07001411static int dev_boot_phase = 1;
1412
Linus Torvalds1da177e2005-04-16 15:20:36 -07001413/**
1414 * register_netdevice_notifier - register a network notifier block
1415 * @nb: notifier
1416 *
1417 * Register a notifier to be called when network device events occur.
1418 * The notifier passed is linked into the kernel structures and must
1419 * not be reused until it has been unregistered. A negative errno code
1420 * is returned on a failure.
1421 *
1422	 * When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001423	 * to the new notifier to allow the device to have a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001424 * view of the network device list.
1425 */
1426
1427int register_netdevice_notifier(struct notifier_block *nb)
1428{
1429 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001430 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001431 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001432 int err;
1433
1434 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001435 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001436 if (err)
1437 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001438 if (dev_boot_phase)
1439 goto unlock;
1440 for_each_net(net) {
1441 for_each_netdev(net, dev) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001442 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001443 err = notifier_to_errno(err);
1444 if (err)
1445 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001446
Eric W. Biederman881d9662007-09-17 11:56:21 -07001447 if (!(dev->flags & IFF_UP))
1448 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001449
Jiri Pirko351638e2013-05-28 01:30:21 +00001450 call_netdevice_notifier(nb, NETDEV_UP, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001451 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001452 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001453
1454unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001455 rtnl_unlock();
1456 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001457
1458rollback:
1459 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001460 for_each_net(net) {
1461 for_each_netdev(net, dev) {
1462 if (dev == last)
RongQing.Li8f891482011-11-30 23:43:07 -05001463 goto outroll;
Herbert Xufcc5a032007-07-30 17:03:38 -07001464
Eric W. Biederman881d9662007-09-17 11:56:21 -07001465 if (dev->flags & IFF_UP) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001466 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1467 dev);
1468 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001469 }
Jiri Pirko351638e2013-05-28 01:30:21 +00001470 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001471 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001472 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001473
RongQing.Li8f891482011-11-30 23:43:07 -05001474outroll:
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001475 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001476 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001477}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001478EXPORT_SYMBOL(register_netdevice_notifier);
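/*
 * Hedged example (hypothetical module, not from this file): a typical
 * user embeds a notifier_block, handles the events it cares about and
 * returns NOTIFY_DONE for the rest.  The netdev_notifier_info_to_dev()
 * accessor is assumed to be the one introduced alongside the info
 * structure used above.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

static int __init example_init(void)
{
	return register_netdevice_notifier(&example_netdev_notifier);
}

static void __exit example_exit(void)
{
	unregister_netdevice_notifier(&example_netdev_notifier);
}
#endif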
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479
1480/**
1481 * unregister_netdevice_notifier - unregister a network notifier block
1482 * @nb: notifier
1483 *
1484 * Unregister a notifier previously registered by
1485	 * register_netdevice_notifier(). The notifier is unlinked from the
1486 * kernel structures and may then be reused. A negative errno code
1487 * is returned on a failure.
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001488 *
1489 * After unregistering unregister and down device events are synthesized
1490 * for all devices on the device list to the removed notifier to remove
1491 * the need for special case cleanup code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001492 */
1493
1494int unregister_netdevice_notifier(struct notifier_block *nb)
1495{
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001496 struct net_device *dev;
1497 struct net *net;
Herbert Xu9f514952006-03-25 01:24:25 -08001498 int err;
1499
1500 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001501 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001502 if (err)
1503 goto unlock;
1504
1505 for_each_net(net) {
1506 for_each_netdev(net, dev) {
1507 if (dev->flags & IFF_UP) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001508 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1509 dev);
1510 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001511 }
Jiri Pirko351638e2013-05-28 01:30:21 +00001512 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001513 }
1514 }
1515unlock:
Herbert Xu9f514952006-03-25 01:24:25 -08001516 rtnl_unlock();
1517 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001518}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001519EXPORT_SYMBOL(unregister_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001520
1521/**
Jiri Pirko351638e2013-05-28 01:30:21 +00001522 * call_netdevice_notifiers_info - call all network notifier blocks
1523 * @val: value passed unmodified to notifier function
1524 * @dev: net_device pointer passed unmodified to notifier function
1525 * @info: notifier information data
1526 *
1527 * Call all network notifier blocks. Parameters and return value
1528 * are as for raw_notifier_call_chain().
1529 */
1530
1531int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1532 struct netdev_notifier_info *info)
1533{
1534 ASSERT_RTNL();
1535 netdev_notifier_info_init(info, dev);
1536 return raw_notifier_call_chain(&netdev_chain, val, info);
1537}
1538EXPORT_SYMBOL(call_netdevice_notifiers_info);
1539
1540/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541 * call_netdevice_notifiers - call all network notifier blocks
1542 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001543 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001544 *
1545 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001546 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001547 */
1548
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001549int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001550{
Jiri Pirko351638e2013-05-28 01:30:21 +00001551 struct netdev_notifier_info info;
1552
1553 return call_netdevice_notifiers_info(val, dev, &info);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001554}
stephen hemmingeredf947f2011-03-24 13:24:01 +00001555EXPORT_SYMBOL(call_netdevice_notifiers);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001556
Ingo Molnarc5905af2012-02-24 08:31:31 +01001557static struct static_key netstamp_needed __read_mostly;
Eric Dumazetb90e5792011-11-28 11:16:50 +00001558#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01001559/* We are not allowed to call static_key_slow_dec() from irq context
Eric Dumazetb90e5792011-11-28 11:16:50 +00001560 * If net_disable_timestamp() is called from irq context, defer the
Ingo Molnarc5905af2012-02-24 08:31:31 +01001561 * static_key_slow_dec() calls.
Eric Dumazetb90e5792011-11-28 11:16:50 +00001562 */
1563static atomic_t netstamp_needed_deferred;
1564#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001565
1566void net_enable_timestamp(void)
1567{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001568#ifdef HAVE_JUMP_LABEL
1569 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1570
1571 if (deferred) {
1572 while (--deferred)
Ingo Molnarc5905af2012-02-24 08:31:31 +01001573 static_key_slow_dec(&netstamp_needed);
Eric Dumazetb90e5792011-11-28 11:16:50 +00001574 return;
1575 }
1576#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001577 static_key_slow_inc(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001578}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001579EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001580
1581void net_disable_timestamp(void)
1582{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001583#ifdef HAVE_JUMP_LABEL
1584 if (in_interrupt()) {
1585 atomic_inc(&netstamp_needed_deferred);
1586 return;
1587 }
1588#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001589 static_key_slow_dec(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001590}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001591EXPORT_SYMBOL(net_disable_timestamp);
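/*
 * Illustrative sketch, not from this file: a hypothetical tap that needs
 * software receive timestamps brackets its lifetime with these calls.
 * They drive a reference-counted static key, so enables and disables
 * must stay balanced.
 */
#if 0
static void example_tap_start(void)
{
	net_enable_timestamp();
}

static void example_tap_stop(void)
{
	net_disable_timestamp();
}
#endif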
Linus Torvalds1da177e2005-04-16 15:20:36 -07001592
Eric Dumazet3b098e22010-05-15 23:57:10 -07001593static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001594{
Eric Dumazet588f0332011-11-15 04:12:55 +00001595 skb->tstamp.tv64 = 0;
Ingo Molnarc5905af2012-02-24 08:31:31 +01001596 if (static_key_false(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001597 __net_timestamp(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598}
1599
Eric Dumazet588f0332011-11-15 04:12:55 +00001600#define net_timestamp_check(COND, SKB) \
Ingo Molnarc5905af2012-02-24 08:31:31 +01001601 if (static_key_false(&netstamp_needed)) { \
Eric Dumazet588f0332011-11-15 04:12:55 +00001602 if ((COND) && !(SKB)->tstamp.tv64) \
1603 __net_timestamp(SKB); \
1604 } \
Eric Dumazet3b098e22010-05-15 23:57:10 -07001605
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001606static inline bool is_skb_forwardable(struct net_device *dev,
1607 struct sk_buff *skb)
1608{
1609 unsigned int len;
1610
1611 if (!(dev->flags & IFF_UP))
1612 return false;
1613
1614 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1615 if (skb->len <= len)
1616 return true;
1617
1618 /* if TSO is enabled, we don't care about the length as the packet
1619	 * could be forwarded without being segmented first
1620 */
1621 if (skb_is_gso(skb))
1622 return true;
1623
1624 return false;
1625}
1626
Arnd Bergmann44540962009-11-26 06:07:08 +00001627/**
1628 * dev_forward_skb - loopback an skb to another netif
1629 *
1630 * @dev: destination network device
1631 * @skb: buffer to forward
1632 *
1633 * return values:
1634 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001635 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001636 *
1637 * dev_forward_skb can be used for injecting an skb from the
1638 * start_xmit function of one device into the receive queue
1639 * of another device.
1640 *
1641 * The receiving device may be in another namespace, so
1642 * we have to clear all information in the skb that could
1643 * impact namespace isolation.
1644 */
1645int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1646{
Michael S. Tsirkin48c83012011-08-31 08:03:29 +00001647 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1648 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1649 atomic_long_inc(&dev->rx_dropped);
1650 kfree_skb(skb);
1651 return NET_RX_DROP;
1652 }
1653 }
1654
Arnd Bergmann44540962009-11-26 06:07:08 +00001655 skb_orphan(skb);
1656
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001657 if (unlikely(!is_skb_forwardable(dev, skb))) {
Eric Dumazetcaf586e2010-09-30 21:06:55 +00001658 atomic_long_inc(&dev->rx_dropped);
Eric Dumazet6ec82562010-05-06 00:53:53 -07001659 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001660 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001661 }
Benjamin LaHaise3b9785c2012-03-27 15:55:44 +00001662 skb->skb_iif = 0;
David S. Miller59b99972012-05-10 23:03:34 -04001663 skb_dst_drop(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001664 skb->tstamp.tv64 = 0;
1665 skb->pkt_type = PACKET_HOST;
1666 skb->protocol = eth_type_trans(skb, dev);
David S. Miller59b99972012-05-10 23:03:34 -04001667 skb->mark = 0;
1668 secpath_reset(skb);
1669 nf_reset(skb);
Patrick McHardy124dff02013-04-05 20:42:05 +02001670 nf_reset_trace(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001671 return netif_rx(skb);
1672}
1673EXPORT_SYMBOL_GPL(dev_forward_skb);
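/*
 * Hedged example (hypothetical pair device, not from this file): a
 * veth-like driver can implement transmit by injecting the skb into its
 * peer's receive path.  example_get_peer() is an assumed helper.
 */
#if 0
static netdev_tx_t example_pair_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *peer = example_get_peer(dev);	/* hypothetical */

	if (likely(peer))
		dev_forward_skb(peer, skb);
	else
		kfree_skb(skb);
	return NETDEV_TX_OK;
}
#endif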
1674
Changli Gao71d9dec2010-12-15 19:57:25 +00001675static inline int deliver_skb(struct sk_buff *skb,
1676 struct packet_type *pt_prev,
1677 struct net_device *orig_dev)
1678{
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00001679 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1680 return -ENOMEM;
Changli Gao71d9dec2010-12-15 19:57:25 +00001681 atomic_inc(&skb->users);
1682 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1683}
1684
Eric Leblondc0de08d2012-08-16 22:02:58 +00001685static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1686{
Eric Leblonda3d744e2012-11-06 02:10:10 +00001687 if (!ptype->af_packet_priv || !skb->sk)
Eric Leblondc0de08d2012-08-16 22:02:58 +00001688 return false;
1689
1690 if (ptype->id_match)
1691 return ptype->id_match(ptype, skb->sk);
1692 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1693 return true;
1694
1695 return false;
1696}
1697
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698/*
1699 * Support routine. Sends outgoing frames to any network
1700 * taps currently in use.
1701 */
1702
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001703static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001704{
1705 struct packet_type *ptype;
Changli Gao71d9dec2010-12-15 19:57:25 +00001706 struct sk_buff *skb2 = NULL;
1707 struct packet_type *pt_prev = NULL;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001708
Linus Torvalds1da177e2005-04-16 15:20:36 -07001709 rcu_read_lock();
1710 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1711 /* Never send packets back to the socket
1712 * they originated from - MvS (miquels@drinkel.ow.org)
1713 */
1714 if ((ptype->dev == dev || !ptype->dev) &&
Eric Leblondc0de08d2012-08-16 22:02:58 +00001715 (!skb_loop_sk(ptype, skb))) {
Changli Gao71d9dec2010-12-15 19:57:25 +00001716 if (pt_prev) {
1717 deliver_skb(skb2, pt_prev, skb->dev);
1718 pt_prev = ptype;
1719 continue;
1720 }
1721
1722 skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001723 if (!skb2)
1724 break;
1725
Eric Dumazet70978182010-12-20 21:22:51 +00001726 net_timestamp_set(skb2);
1727
Linus Torvalds1da177e2005-04-16 15:20:36 -07001728			/* skb->nh should be correctly set by sender,
1729			 * so that the second statement is just protection
1730			 * against buggy protocols.
1731			 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001732 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001733
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001734 if (skb_network_header(skb2) < skb2->data ||
Simon Hormanced14f62013-05-28 20:34:25 +00001735 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
Joe Perchese87cc472012-05-13 21:56:26 +00001736 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1737 ntohs(skb2->protocol),
1738 dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001739 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001740 }
1741
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001742 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001743 skb2->pkt_type = PACKET_OUTGOING;
Changli Gao71d9dec2010-12-15 19:57:25 +00001744 pt_prev = ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001745 }
1746 }
Changli Gao71d9dec2010-12-15 19:57:25 +00001747 if (pt_prev)
1748 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001749 rcu_read_unlock();
1750}
1751
Ben Hutchings2c530402012-07-10 10:55:09 +00001752/**
1753 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
John Fastabend4f57c082011-01-17 08:06:04 +00001754 * @dev: Network device
1755 * @txq: number of queues available
1756 *
1757	 * If real_num_tx_queues is changed the tc mappings may no longer be
1758	 * valid. To resolve this verify that each tc mapping remains valid and,
1759	 * if it is not, zero that mapping. Once no priorities map to an
1760	 * offset/count pair it will no longer be used. In the worst case, if
1761	 * TC0 is invalid nothing can be done, so priority mappings are disabled.
1762	 * It is expected that drivers will fix this mapping if they can before
1763	 * calling netif_set_real_num_tx_queues.
1764 */
Eric Dumazetbb134d22011-01-20 19:18:08 +00001765static void netif_setup_tc(struct net_device *dev, unsigned int txq)
John Fastabend4f57c082011-01-17 08:06:04 +00001766{
1767 int i;
1768 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1769
1770 /* If TC0 is invalidated disable TC mapping */
1771 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001772 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
John Fastabend4f57c082011-01-17 08:06:04 +00001773 dev->num_tc = 0;
1774 return;
1775 }
1776
1777 /* Invalidated prio to tc mappings set to TC0 */
1778 for (i = 1; i < TC_BITMASK + 1; i++) {
1779 int q = netdev_get_prio_tc_map(dev, i);
1780
1781 tc = &dev->tc_to_txq[q];
1782 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001783 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1784 i, q);
John Fastabend4f57c082011-01-17 08:06:04 +00001785 netdev_set_prio_tc_map(dev, i, 0);
1786 }
1787 }
1788}
1789
Alexander Duyck537c00d2013-01-10 08:57:02 +00001790#ifdef CONFIG_XPS
1791static DEFINE_MUTEX(xps_map_mutex);
1792#define xmap_dereference(P) \
1793 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1794
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001795static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1796 int cpu, u16 index)
1797{
1798 struct xps_map *map = NULL;
1799 int pos;
1800
1801 if (dev_maps)
1802 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1803
1804 for (pos = 0; map && pos < map->len; pos++) {
1805 if (map->queues[pos] == index) {
1806 if (map->len > 1) {
1807 map->queues[pos] = map->queues[--map->len];
1808 } else {
1809 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1810 kfree_rcu(map, rcu);
1811 map = NULL;
1812 }
1813 break;
1814 }
1815 }
1816
1817 return map;
1818}
1819
Alexander Duyck024e9672013-01-10 08:57:46 +00001820static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
Alexander Duyck537c00d2013-01-10 08:57:02 +00001821{
1822 struct xps_dev_maps *dev_maps;
Alexander Duyck024e9672013-01-10 08:57:46 +00001823 int cpu, i;
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001824 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001825
1826 mutex_lock(&xps_map_mutex);
1827 dev_maps = xmap_dereference(dev->xps_maps);
1828
1829 if (!dev_maps)
1830 goto out_no_maps;
1831
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001832 for_each_possible_cpu(cpu) {
Alexander Duyck024e9672013-01-10 08:57:46 +00001833 for (i = index; i < dev->num_tx_queues; i++) {
1834 if (!remove_xps_queue(dev_maps, cpu, i))
1835 break;
1836 }
1837 if (i == dev->num_tx_queues)
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001838 active = true;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001839 }
1840
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001841 if (!active) {
Alexander Duyck537c00d2013-01-10 08:57:02 +00001842 RCU_INIT_POINTER(dev->xps_maps, NULL);
1843 kfree_rcu(dev_maps, rcu);
1844 }
1845
Alexander Duyck024e9672013-01-10 08:57:46 +00001846 for (i = index; i < dev->num_tx_queues; i++)
1847 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1848 NUMA_NO_NODE);
1849
Alexander Duyck537c00d2013-01-10 08:57:02 +00001850out_no_maps:
1851 mutex_unlock(&xps_map_mutex);
1852}
1853
Alexander Duyck01c5f862013-01-10 08:57:35 +00001854static struct xps_map *expand_xps_map(struct xps_map *map,
1855 int cpu, u16 index)
1856{
1857 struct xps_map *new_map;
1858 int alloc_len = XPS_MIN_MAP_ALLOC;
1859 int i, pos;
1860
1861 for (pos = 0; map && pos < map->len; pos++) {
1862 if (map->queues[pos] != index)
1863 continue;
1864 return map;
1865 }
1866
1867 /* Need to add queue to this CPU's existing map */
1868 if (map) {
1869 if (pos < map->alloc_len)
1870 return map;
1871
1872 alloc_len = map->alloc_len * 2;
1873 }
1874
1875 /* Need to allocate new map to store queue on this CPU's map */
1876 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1877 cpu_to_node(cpu));
1878 if (!new_map)
1879 return NULL;
1880
1881 for (i = 0; i < pos; i++)
1882 new_map->queues[i] = map->queues[i];
1883 new_map->alloc_len = alloc_len;
1884 new_map->len = pos;
1885
1886 return new_map;
1887}
1888
Alexander Duyck537c00d2013-01-10 08:57:02 +00001889int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1890{
Alexander Duyck01c5f862013-01-10 08:57:35 +00001891 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001892 struct xps_map *map, *new_map;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001893 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001894 int cpu, numa_node_id = -2;
1895 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001896
1897 mutex_lock(&xps_map_mutex);
1898
1899 dev_maps = xmap_dereference(dev->xps_maps);
1900
Alexander Duyck01c5f862013-01-10 08:57:35 +00001901 /* allocate memory for queue storage */
1902 for_each_online_cpu(cpu) {
1903 if (!cpumask_test_cpu(cpu, mask))
1904 continue;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001905
Alexander Duyck01c5f862013-01-10 08:57:35 +00001906 if (!new_dev_maps)
1907 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00001908 if (!new_dev_maps) {
1909 mutex_unlock(&xps_map_mutex);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001910 return -ENOMEM;
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00001911 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00001912
1913 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1914 NULL;
1915
1916 map = expand_xps_map(map, cpu, index);
1917 if (!map)
1918 goto error;
1919
1920 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1921 }
1922
1923 if (!new_dev_maps)
1924 goto out_no_new_maps;
1925
1926 for_each_possible_cpu(cpu) {
1927 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1928 /* add queue to CPU maps */
1929 int pos = 0;
1930
1931 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1932 while ((pos < map->len) && (map->queues[pos] != index))
1933 pos++;
1934
1935 if (pos == map->len)
1936 map->queues[map->len++] = index;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001937#ifdef CONFIG_NUMA
Alexander Duyck537c00d2013-01-10 08:57:02 +00001938 if (numa_node_id == -2)
1939 numa_node_id = cpu_to_node(cpu);
1940 else if (numa_node_id != cpu_to_node(cpu))
1941 numa_node_id = -1;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001942#endif
Alexander Duyck01c5f862013-01-10 08:57:35 +00001943 } else if (dev_maps) {
1944 /* fill in the new device map from the old device map */
1945 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1946 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
Alexander Duyck537c00d2013-01-10 08:57:02 +00001947 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00001948
Alexander Duyck537c00d2013-01-10 08:57:02 +00001949 }
1950
Alexander Duyck01c5f862013-01-10 08:57:35 +00001951 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1952
Alexander Duyck537c00d2013-01-10 08:57:02 +00001953 /* Cleanup old maps */
Alexander Duyck01c5f862013-01-10 08:57:35 +00001954 if (dev_maps) {
1955 for_each_possible_cpu(cpu) {
1956 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1957 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1958 if (map && map != new_map)
1959 kfree_rcu(map, rcu);
1960 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001961
Alexander Duyck537c00d2013-01-10 08:57:02 +00001962 kfree_rcu(dev_maps, rcu);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001963 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001964
Alexander Duyck01c5f862013-01-10 08:57:35 +00001965 dev_maps = new_dev_maps;
1966 active = true;
1967
1968out_no_new_maps:
1969 /* update Tx queue numa node */
Alexander Duyck537c00d2013-01-10 08:57:02 +00001970 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1971 (numa_node_id >= 0) ? numa_node_id :
1972 NUMA_NO_NODE);
1973
Alexander Duyck01c5f862013-01-10 08:57:35 +00001974 if (!dev_maps)
1975 goto out_no_maps;
1976
1977 /* removes queue from unused CPUs */
1978 for_each_possible_cpu(cpu) {
1979 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1980 continue;
1981
1982 if (remove_xps_queue(dev_maps, cpu, index))
1983 active = true;
1984 }
1985
1986 /* free map if not active */
1987 if (!active) {
1988 RCU_INIT_POINTER(dev->xps_maps, NULL);
1989 kfree_rcu(dev_maps, rcu);
1990 }
1991
1992out_no_maps:
Alexander Duyck537c00d2013-01-10 08:57:02 +00001993 mutex_unlock(&xps_map_mutex);
1994
1995 return 0;
1996error:
Alexander Duyck01c5f862013-01-10 08:57:35 +00001997 /* remove any maps that we added */
1998 for_each_possible_cpu(cpu) {
1999 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2000 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2001 NULL;
2002 if (new_map && new_map != map)
2003 kfree(new_map);
2004 }
2005
Alexander Duyck537c00d2013-01-10 08:57:02 +00002006 mutex_unlock(&xps_map_mutex);
2007
Alexander Duyck537c00d2013-01-10 08:57:02 +00002008 kfree(new_dev_maps);
2009 return -ENOMEM;
2010}
2011EXPORT_SYMBOL(netif_set_xps_queue);
2012
2013#endif
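/*
 * Illustrative sketch, not from this file: with CONFIG_XPS a multiqueue
 * driver might prime the transmit CPU maps so that queue i is preferred
 * by one online CPU.  Hypothetical helper; drivers typically do this from
 * their setup or ndo_open path.
 */
#if 0
static void example_setup_xps(struct net_device *dev)
{
	cpumask_var_t mask;
	int i;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		cpumask_clear(mask);
		cpumask_set_cpu(i % num_online_cpus(), mask);
		netif_set_xps_queue(dev, mask, i);
	}
	free_cpumask_var(mask);
}
#endif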
John Fastabendf0796d52010-07-01 13:21:57 +00002014/*
2015 * Routine to help set real_num_tx_queues. To avoid skbs being mapped to
2016 * queues greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2017 */
Tom Herberte6484932010-10-18 18:04:39 +00002018int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
John Fastabendf0796d52010-07-01 13:21:57 +00002019{
Tom Herbert1d24eb42010-11-21 13:17:27 +00002020 int rc;
2021
Tom Herberte6484932010-10-18 18:04:39 +00002022 if (txq < 1 || txq > dev->num_tx_queues)
2023 return -EINVAL;
John Fastabendf0796d52010-07-01 13:21:57 +00002024
Ben Hutchings5c565802011-02-15 19:39:21 +00002025 if (dev->reg_state == NETREG_REGISTERED ||
2026 dev->reg_state == NETREG_UNREGISTERING) {
Tom Herberte6484932010-10-18 18:04:39 +00002027 ASSERT_RTNL();
2028
Tom Herbert1d24eb42010-11-21 13:17:27 +00002029 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2030 txq);
Tom Herbertbf264142010-11-26 08:36:09 +00002031 if (rc)
2032 return rc;
2033
John Fastabend4f57c082011-01-17 08:06:04 +00002034 if (dev->num_tc)
2035 netif_setup_tc(dev, txq);
2036
Alexander Duyck024e9672013-01-10 08:57:46 +00002037 if (txq < dev->real_num_tx_queues) {
Tom Herberte6484932010-10-18 18:04:39 +00002038 qdisc_reset_all_tx_gt(dev, txq);
Alexander Duyck024e9672013-01-10 08:57:46 +00002039#ifdef CONFIG_XPS
2040 netif_reset_xps_queues_gt(dev, txq);
2041#endif
2042 }
John Fastabendf0796d52010-07-01 13:21:57 +00002043 }
Tom Herberte6484932010-10-18 18:04:39 +00002044
2045 dev->real_num_tx_queues = txq;
2046 return 0;
John Fastabendf0796d52010-07-01 13:21:57 +00002047}
2048EXPORT_SYMBOL(netif_set_real_num_tx_queues);
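/*
 * Hedged example (hypothetical driver path, not from this file): a driver
 * that registers with the maximum queue count can shrink the number in
 * use once it knows how many queues the hardware provides; after
 * registration the call must be made under rtnl.
 */
#if 0
static int example_resize_tx(struct net_device *dev, unsigned int hw_queues)
{
	unsigned int txq = min_t(unsigned int, hw_queues,
				 netif_get_num_default_rss_queues());
	int err;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(dev, txq);
	rtnl_unlock();
	return err;
}
#endif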
Denis Vlasenko56079432006-03-29 15:57:29 -08002049
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002050#ifdef CONFIG_RPS
2051/**
2052 * netif_set_real_num_rx_queues - set actual number of RX queues used
2053 * @dev: Network device
2054 * @rxq: Actual number of RX queues
2055 *
2056 * This must be called either with the rtnl_lock held or before
2057 * registration of the net device. Returns 0 on success, or a
Ben Hutchings4e7f7952010-10-08 10:33:39 -07002058 * negative error code. If called before registration, it always
2059 * succeeds.
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002060 */
2061int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2062{
2063 int rc;
2064
Tom Herbertbd25fa72010-10-18 18:00:16 +00002065 if (rxq < 1 || rxq > dev->num_rx_queues)
2066 return -EINVAL;
2067
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002068 if (dev->reg_state == NETREG_REGISTERED) {
2069 ASSERT_RTNL();
2070
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002071 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2072 rxq);
2073 if (rc)
2074 return rc;
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002075 }
2076
2077 dev->real_num_rx_queues = rxq;
2078 return 0;
2079}
2080EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2081#endif
2082
Ben Hutchings2c530402012-07-10 10:55:09 +00002083/**
2084 * netif_get_num_default_rss_queues - default number of RSS queues
Yuval Mintz16917b82012-07-01 03:18:50 +00002085 *
2086 * This routine should set an upper limit on the number of RSS queues
2087 * used by default by multiqueue devices.
2088 */
Ben Hutchingsa55b1382012-07-10 10:54:38 +00002089int netif_get_num_default_rss_queues(void)
Yuval Mintz16917b82012-07-01 03:18:50 +00002090{
2091 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2092}
2093EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2094
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002095static inline void __netif_reschedule(struct Qdisc *q)
2096{
2097 struct softnet_data *sd;
2098 unsigned long flags;
2099
2100 local_irq_save(flags);
2101 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00002102 q->next_sched = NULL;
2103 *sd->output_queue_tailp = q;
2104 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002105 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2106 local_irq_restore(flags);
2107}
2108
David S. Miller37437bb2008-07-16 02:15:04 -07002109void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08002110{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002111 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2112 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08002113}
2114EXPORT_SYMBOL(__netif_schedule);
2115
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002116void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08002117{
David S. Miller3578b0c2010-08-03 00:24:04 -07002118 if (atomic_dec_and_test(&skb->users)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002119 struct softnet_data *sd;
2120 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08002121
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002122 local_irq_save(flags);
2123 sd = &__get_cpu_var(softnet_data);
2124 skb->next = sd->completion_queue;
2125 sd->completion_queue = skb;
2126 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2127 local_irq_restore(flags);
2128 }
Denis Vlasenko56079432006-03-29 15:57:29 -08002129}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002130EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08002131
2132void dev_kfree_skb_any(struct sk_buff *skb)
2133{
2134 if (in_irq() || irqs_disabled())
2135 dev_kfree_skb_irq(skb);
2136 else
2137 dev_kfree_skb(skb);
2138}
2139EXPORT_SYMBOL(dev_kfree_skb_any);
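/*
 * Illustrative sketch, not from this file: dev_kfree_skb_any() is the
 * variant a TX completion handler would use, since it may run in hard-irq
 * context or with interrupts enabled.  The ring structure and the
 * completion iterator below are hypothetical.
 */
#if 0
static void example_clean_tx_ring(struct example_ring *ring)
{
	struct sk_buff *skb;

	while ((skb = example_next_completed_skb(ring)) != NULL)
		dev_kfree_skb_any(skb);
}
#endif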
2140
2141
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002142/**
2143 * netif_device_detach - mark device as removed
2144 * @dev: network device
2145 *
2146 * Mark the device as removed from the system and therefore no longer available.
2147 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002148void netif_device_detach(struct net_device *dev)
2149{
2150 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2151 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002152 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002153 }
2154}
2155EXPORT_SYMBOL(netif_device_detach);
2156
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002157/**
2158 * netif_device_attach - mark device as attached
2159 * @dev: network device
2160 *
2161 * Mark the device as attached to the system and restart it if needed.
2162 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002163void netif_device_attach(struct net_device *dev)
2164{
2165 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2166 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002167 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002168 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002169 }
2170}
2171EXPORT_SYMBOL(netif_device_attach);
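/*
 * Hedged example (hypothetical PCI driver, not from this file): drivers
 * commonly pair detach/attach in their suspend and resume callbacks so
 * the stack stops queueing packets while the hardware is powered down.
 */
#if 0
static int example_suspend(struct pci_dev *pdev, pm_message_t state)
{
	struct net_device *dev = pci_get_drvdata(pdev);

	netif_device_detach(dev);
	/* ... stop the hardware and save its state ... */
	return 0;
}

static int example_resume(struct pci_dev *pdev)
{
	struct net_device *dev = pci_get_drvdata(pdev);

	/* ... restore state and restart the hardware ... */
	netif_device_attach(dev);
	return 0;
}
#endif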
2172
Ben Hutchings36c92472012-01-17 07:57:56 +00002173static void skb_warn_bad_offload(const struct sk_buff *skb)
2174{
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002175 static const netdev_features_t null_features = 0;
Ben Hutchings36c92472012-01-17 07:57:56 +00002176 struct net_device *dev = skb->dev;
2177 const char *driver = "";
2178
Ben Greearc846ad92013-04-19 10:45:52 +00002179 if (!net_ratelimit())
2180 return;
2181
Ben Hutchings36c92472012-01-17 07:57:56 +00002182 if (dev && dev->dev.parent)
2183 driver = dev_driver_string(dev->dev.parent);
2184
2185 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2186 "gso_type=%d ip_summed=%d\n",
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002187 driver, dev ? &dev->features : &null_features,
2188 skb->sk ? &skb->sk->sk_route_caps : &null_features,
Ben Hutchings36c92472012-01-17 07:57:56 +00002189 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2190 skb_shinfo(skb)->gso_type, skb->ip_summed);
2191}
2192
Linus Torvalds1da177e2005-04-16 15:20:36 -07002193/*
2194 * Invalidate hardware checksum when packet is to be mangled, and
2195 * complete checksum manually on outgoing path.
2196 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07002197int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002198{
Al Virod3bc23e2006-11-14 21:24:49 -08002199 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07002200 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002201
Patrick McHardy84fa7932006-08-29 16:44:56 -07002202 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07002203 goto out_set_summed;
2204
2205 if (unlikely(skb_shinfo(skb)->gso_size)) {
Ben Hutchings36c92472012-01-17 07:57:56 +00002206 skb_warn_bad_offload(skb);
2207 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208 }
2209
Eric Dumazetcef401d2013-01-25 20:34:37 +00002210 /* Before computing a checksum, we should make sure no frag could
2211	 * be modified by an external entity: the checksum could be wrong.
2212 */
2213 if (skb_has_shared_frag(skb)) {
2214 ret = __skb_linearize(skb);
2215 if (ret)
2216 goto out;
2217 }
2218
Michał Mirosław55508d62010-12-14 15:24:08 +00002219 offset = skb_checksum_start_offset(skb);
Herbert Xua0308472007-10-15 01:47:15 -07002220 BUG_ON(offset >= skb_headlen(skb));
2221 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2222
2223 offset += skb->csum_offset;
2224 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2225
2226 if (skb_cloned(skb) &&
2227 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2229 if (ret)
2230 goto out;
2231 }
2232
Herbert Xua0308472007-10-15 01:47:15 -07002233 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07002234out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002235 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002236out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002237 return ret;
2238}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002239EXPORT_SYMBOL(skb_checksum_help);
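/*
 * Illustrative sketch, not from this file: a driver whose hardware cannot
 * checksum a given packet may fall back to skb_checksum_help() in its
 * xmit path.  The private structure and capability check are hypothetical.
 */
#if 0
static int example_tx_csum(struct example_priv *priv, struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !example_hw_can_csum(priv, skb))
		return skb_checksum_help(skb);
	return 0;
}
#endif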
Linus Torvalds1da177e2005-04-16 15:20:36 -07002240
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002241__be16 skb_network_protocol(struct sk_buff *skb)
2242{
2243 __be16 type = skb->protocol;
David S. Miller61816592013-03-20 12:46:26 -04002244 int vlan_depth = ETH_HLEN;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002245
Pravin B Shelar19acc322013-05-07 20:41:07 +00002246 /* Tunnel gso handlers can set protocol to ethernet. */
2247 if (type == htons(ETH_P_TEB)) {
2248 struct ethhdr *eth;
2249
2250 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2251 return 0;
2252
2253 eth = (struct ethhdr *)skb_mac_header(skb);
2254 type = eth->h_proto;
2255 }
2256
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002257 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002258 struct vlan_hdr *vh;
2259
2260 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2261 return 0;
2262
2263 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2264 type = vh->h_vlan_encapsulated_proto;
2265 vlan_depth += VLAN_HLEN;
2266 }
2267
2268 return type;
2269}
2270
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002271/**
2272 * skb_mac_gso_segment - mac layer segmentation handler.
2273 * @skb: buffer to segment
2274 * @features: features for the output path (see dev->features)
2275 */
2276struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2277 netdev_features_t features)
2278{
2279 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2280 struct packet_offload *ptype;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002281 __be16 type = skb_network_protocol(skb);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002282
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002283 if (unlikely(!type))
2284 return ERR_PTR(-EINVAL);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002285
2286 __skb_pull(skb, skb->mac_len);
2287
2288 rcu_read_lock();
2289 list_for_each_entry_rcu(ptype, &offload_base, list) {
2290 if (ptype->type == type && ptype->callbacks.gso_segment) {
2291 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2292 int err;
2293
2294 err = ptype->callbacks.gso_send_check(skb);
2295 segs = ERR_PTR(err);
2296 if (err || skb_gso_ok(skb, features))
2297 break;
2298 __skb_push(skb, (skb->data -
2299 skb_network_header(skb)));
2300 }
2301 segs = ptype->callbacks.gso_segment(skb, features);
2302 break;
2303 }
2304 }
2305 rcu_read_unlock();
2306
2307 __skb_push(skb, skb->data - skb_mac_header(skb));
2308
2309 return segs;
2310}
2311EXPORT_SYMBOL(skb_mac_gso_segment);
2312
2313
Cong Wang12b00042013-02-05 16:36:38 +00002314/* openvswitch calls this on rx path, so we need a different check.
2315 */
2316static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2317{
2318 if (tx_path)
2319 return skb->ip_summed != CHECKSUM_PARTIAL;
2320 else
2321 return skb->ip_summed == CHECKSUM_NONE;
2322}
2323
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002324/**
Cong Wang12b00042013-02-05 16:36:38 +00002325 * __skb_gso_segment - Perform segmentation on skb.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002326 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07002327 * @features: features for the output path (see dev->features)
Cong Wang12b00042013-02-05 16:36:38 +00002328 * @tx_path: whether it is called in TX path
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002329 *
2330 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07002331 *
2332 * It may return NULL if the skb requires no segmentation. This is
2333 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002334 */
Cong Wang12b00042013-02-05 16:36:38 +00002335struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2336 netdev_features_t features, bool tx_path)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002337{
Cong Wang12b00042013-02-05 16:36:38 +00002338 if (unlikely(skb_needs_check(skb, tx_path))) {
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002339 int err;
2340
Ben Hutchings36c92472012-01-17 07:57:56 +00002341 skb_warn_bad_offload(skb);
Herbert Xu67fd1a72009-01-19 16:26:44 -08002342
Herbert Xua430a432006-07-08 13:34:56 -07002343 if (skb_header_cloned(skb) &&
2344 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2345 return ERR_PTR(err);
2346 }
2347
Pravin B Shelar68c33162013-02-14 14:02:41 +00002348 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002349 skb_reset_mac_header(skb);
2350 skb_reset_mac_len(skb);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002351
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002352 return skb_mac_gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002353}
Cong Wang12b00042013-02-05 16:36:38 +00002354EXPORT_SYMBOL(__skb_gso_segment);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002355
Herbert Xufb286bb2005-11-10 13:01:24 -08002356/* Take action when hardware reception checksum errors are detected. */
2357#ifdef CONFIG_BUG
2358void netdev_rx_csum_fault(struct net_device *dev)
2359{
2360 if (net_ratelimit()) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00002361 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08002362 dump_stack();
2363 }
2364}
2365EXPORT_SYMBOL(netdev_rx_csum_fault);
2366#endif
2367
Linus Torvalds1da177e2005-04-16 15:20:36 -07002368/* Actually, we should eliminate this check as soon as we know that:
2369 * 1. An IOMMU is present and allows all the memory to be mapped.
2370 * 2. No high memory really exists on this machine.
2371 */
2372
Eric Dumazet9092c652010-04-02 13:34:49 -07002373static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002374{
Herbert Xu3d3a8532006-06-27 13:33:10 -07002375#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07002376 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002377 if (!(dev->features & NETIF_F_HIGHDMA)) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002378 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2379 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2380 if (PageHighMem(skb_frag_page(frag)))
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002381 return 1;
Ian Campbellea2ab692011-08-22 23:44:58 +00002382 }
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002383 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002384
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002385 if (PCI_DMA_BUS_IS_PHYS) {
2386 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002387
Eric Dumazet9092c652010-04-02 13:34:49 -07002388 if (!pdev)
2389 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002390 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002391 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2392 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002393 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2394 return 1;
2395 }
2396 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07002397#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002398 return 0;
2399}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002400
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002401struct dev_gso_cb {
2402 void (*destructor)(struct sk_buff *skb);
2403};
2404
2405#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2406
2407static void dev_gso_skb_destructor(struct sk_buff *skb)
2408{
2409 struct dev_gso_cb *cb;
2410
2411 do {
2412 struct sk_buff *nskb = skb->next;
2413
2414 skb->next = nskb->next;
2415 nskb->next = NULL;
2416 kfree_skb(nskb);
2417 } while (skb->next);
2418
2419 cb = DEV_GSO_CB(skb);
2420 if (cb->destructor)
2421 cb->destructor(skb);
2422}
2423
2424/**
2425 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2426 * @skb: buffer to segment
Jesse Gross91ecb632011-01-09 06:23:33 +00002427 * @features: device features as applicable to this skb
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002428 *
2429 * This function segments the given skb and stores the list of segments
2430 * in skb->next.
2431 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002432static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002433{
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002434 struct sk_buff *segs;
2435
Herbert Xu576a30e2006-06-27 13:22:38 -07002436 segs = skb_gso_segment(skb, features);
2437
2438 /* Verifying header integrity only. */
2439 if (!segs)
2440 return 0;
2441
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07002442 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002443 return PTR_ERR(segs);
2444
2445 skb->next = segs;
2446 DEV_GSO_CB(skb)->destructor = skb->destructor;
2447 skb->destructor = dev_gso_skb_destructor;
2448
2449 return 0;
2450}
2451
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002452static netdev_features_t harmonize_features(struct sk_buff *skb,
2453 __be16 protocol, netdev_features_t features)
Jesse Grossf01a5232011-01-09 06:23:31 +00002454{
Ed Cashinc0d680e2012-09-19 15:49:00 +00002455 if (skb->ip_summed != CHECKSUM_NONE &&
2456 !can_checksum_protocol(features, protocol)) {
Jesse Grossf01a5232011-01-09 06:23:31 +00002457 features &= ~NETIF_F_ALL_CSUM;
Jesse Grossf01a5232011-01-09 06:23:31 +00002458 } else if (illegal_highdma(skb->dev, skb)) {
2459 features &= ~NETIF_F_SG;
2460 }
2461
2462 return features;
2463}
2464
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002465netdev_features_t netif_skb_features(struct sk_buff *skb)
Jesse Gross58e998c2010-10-29 12:14:55 +00002466{
2467 __be16 protocol = skb->protocol;
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002468 netdev_features_t features = skb->dev->features;
Jesse Gross58e998c2010-10-29 12:14:55 +00002469
Ben Hutchings30b678d2012-07-30 15:57:00 +00002470 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2471 features &= ~NETIF_F_GSO_MASK;
2472
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002473 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
Jesse Gross58e998c2010-10-29 12:14:55 +00002474 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2475 protocol = veh->h_vlan_encapsulated_proto;
Jesse Grossf01a5232011-01-09 06:23:31 +00002476 } else if (!vlan_tx_tag_present(skb)) {
2477 return harmonize_features(skb, protocol, features);
2478 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002479
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002480 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2481 NETIF_F_HW_VLAN_STAG_TX);
Jesse Grossf01a5232011-01-09 06:23:31 +00002482
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002483 if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
Jesse Grossf01a5232011-01-09 06:23:31 +00002484 return harmonize_features(skb, protocol, features);
2485 } else {
2486 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002487 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2488 NETIF_F_HW_VLAN_STAG_TX;
Jesse Grossf01a5232011-01-09 06:23:31 +00002489 return harmonize_features(skb, protocol, features);
2490 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002491}
Jesse Grossf01a5232011-01-09 06:23:31 +00002492EXPORT_SYMBOL(netif_skb_features);
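/*
 * Hedged example (hypothetical caller, not from this file): code outside
 * the normal transmit path can consult netif_skb_features() the same way
 * dev_hard_start_xmit() below does, e.g. to decide whether an skb would
 * need software GSO for its current device.
 */
#if 0
static bool example_needs_sw_gso(struct sk_buff *skb)
{
	netdev_features_t features = netif_skb_features(skb);

	return netif_needs_gso(skb, features);
}
#endif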
Jesse Gross58e998c2010-10-29 12:14:55 +00002493
John Fastabend6afff0c2010-06-16 14:18:12 +00002494/*
2495 * Returns true if either:
2496 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
Rami Rosend1a53df2012-08-27 23:39:24 +00002497 * 2. skb is fragmented and the device does not support SG.
John Fastabend6afff0c2010-06-16 14:18:12 +00002498 */
2499static inline int skb_needs_linearize(struct sk_buff *skb,
Patrick McHardy6708c9e2013-05-01 22:36:49 +00002500 netdev_features_t features)
John Fastabend6afff0c2010-06-16 14:18:12 +00002501{
Jesse Gross02932ce2011-01-09 06:23:34 +00002502 return skb_is_nonlinear(skb) &&
2503 ((skb_has_frag_list(skb) &&
2504 !(features & NETIF_F_FRAGLIST)) ||
Jesse Grosse1e78db2010-10-29 12:14:53 +00002505 (skb_shinfo(skb)->nr_frags &&
Jesse Gross02932ce2011-01-09 06:23:34 +00002506 !(features & NETIF_F_SG)));
John Fastabend6afff0c2010-06-16 14:18:12 +00002507}
2508
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002509int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2510 struct netdev_queue *txq)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002511{
Stephen Hemminger00829822008-11-20 20:14:53 -08002512 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00002513 int rc = NETDEV_TX_OK;
Koki Sanagiec764bf2011-05-30 21:48:34 +00002514 unsigned int skb_len;
Stephen Hemminger00829822008-11-20 20:14:53 -08002515
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002516 if (likely(!skb->next)) {
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002517 netdev_features_t features;
Jesse Grossfc741212011-01-09 06:23:32 +00002518
Eric Dumazet93f154b2009-05-18 22:19:19 -07002519 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03002520		 * If the device doesn't need skb->dst, release it right now while
Eric Dumazet93f154b2009-05-18 22:19:19 -07002521		 * it's hot in this cpu cache
2522 */
Eric Dumazetadf30902009-06-02 05:19:30 +00002523 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2524 skb_dst_drop(skb);
2525
Jesse Grossfc741212011-01-09 06:23:32 +00002526 features = netif_skb_features(skb);
2527
Jesse Gross7b9c6092010-10-20 13:56:04 +00002528 if (vlan_tx_tag_present(skb) &&
Patrick McHardy86a9bad2013-04-19 02:04:30 +00002529 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2530 skb = __vlan_put_tag(skb, skb->vlan_proto,
2531 vlan_tx_tag_get(skb));
Jesse Gross7b9c6092010-10-20 13:56:04 +00002532 if (unlikely(!skb))
2533 goto out;
2534
2535 skb->vlan_tci = 0;
2536 }
2537
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002538		/* If this is an encapsulation offload request, verify we are testing
2539 * hardware encapsulation features instead of standard
2540 * features for the netdev
2541 */
2542 if (skb->encapsulation)
2543 features &= dev->hw_enc_features;
2544
Jesse Grossfc741212011-01-09 06:23:32 +00002545 if (netif_needs_gso(skb, features)) {
Jesse Gross91ecb632011-01-09 06:23:33 +00002546 if (unlikely(dev_gso_segment(skb, features)))
David S. Miller9ccb8972010-04-22 01:02:07 -07002547 goto out_kfree_skb;
2548 if (skb->next)
2549 goto gso;
John Fastabend6afff0c2010-06-16 14:18:12 +00002550 } else {
Jesse Gross02932ce2011-01-09 06:23:34 +00002551 if (skb_needs_linearize(skb, features) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002552 __skb_linearize(skb))
2553 goto out_kfree_skb;
2554
2555 /* If packet is not checksummed and device does not
2556 * support checksumming for this protocol, complete
2557 * checksumming here.
2558 */
2559 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002560 if (skb->encapsulation)
2561 skb_set_inner_transport_header(skb,
2562 skb_checksum_start_offset(skb));
2563 else
2564 skb_set_transport_header(skb,
2565 skb_checksum_start_offset(skb));
Jesse Gross03634662011-01-09 06:23:35 +00002566 if (!(features & NETIF_F_ALL_CSUM) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002567 skb_checksum_help(skb))
2568 goto out_kfree_skb;
2569 }
David S. Miller9ccb8972010-04-22 01:02:07 -07002570 }
2571
Eric Dumazetb40863c2012-09-18 20:44:49 +00002572 if (!list_empty(&ptype_all))
2573 dev_queue_xmit_nit(skb, dev);
2574
Koki Sanagiec764bf2011-05-30 21:48:34 +00002575 skb_len = skb->len;
Patrick Ohlyac45f602009-02-12 05:03:37 +00002576 rc = ops->ndo_start_xmit(skb, dev);
Koki Sanagiec764bf2011-05-30 21:48:34 +00002577 trace_net_dev_xmit(skb, rc, dev, skb_len);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002578 if (rc == NETDEV_TX_OK)
Eric Dumazet08baf562009-05-25 22:58:01 -07002579 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00002580 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002581 }
2582
Herbert Xu576a30e2006-06-27 13:22:38 -07002583gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002584 do {
2585 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002586
2587 skb->next = nskb->next;
2588 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00002589
Eric Dumazetb40863c2012-09-18 20:44:49 +00002590 if (!list_empty(&ptype_all))
2591 dev_queue_xmit_nit(nskb, dev);
2592
Koki Sanagiec764bf2011-05-30 21:48:34 +00002593 skb_len = nskb->len;
Stephen Hemminger00829822008-11-20 20:14:53 -08002594 rc = ops->ndo_start_xmit(nskb, dev);
Koki Sanagiec764bf2011-05-30 21:48:34 +00002595 trace_net_dev_xmit(nskb, rc, dev, skb_len);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002596 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002597 if (rc & ~NETDEV_TX_MASK)
2598 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07002599 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002600 skb->next = nskb;
2601 return rc;
2602 }
Eric Dumazet08baf562009-05-25 22:58:01 -07002603 txq_trans_update(txq);
Tom Herbert734664982011-11-28 16:32:44 +00002604 if (unlikely(netif_xmit_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07002605 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002606 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002607
Patrick McHardy572a9d72009-11-10 06:14:14 +00002608out_kfree_gso_skb:
Sridhar Samudrala0c772152013-04-29 13:02:42 +00002609 if (likely(skb->next == NULL)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002610 skb->destructor = DEV_GSO_CB(skb)->destructor;
Sridhar Samudrala0c772152013-04-29 13:02:42 +00002611 consume_skb(skb);
2612 return rc;
2613 }
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002614out_kfree_skb:
2615 kfree_skb(skb);
Jesse Gross7b9c6092010-10-20 13:56:04 +00002616out:
Patrick McHardy572a9d72009-11-10 06:14:14 +00002617 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002618}
2619
Eric Dumazet1def9232013-01-10 12:36:42 +00002620static void qdisc_pkt_len_init(struct sk_buff *skb)
2621{
2622 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2623
2624 qdisc_skb_cb(skb)->pkt_len = skb->len;
2625
2626	/* To get a more precise estimate of bytes sent on the wire,
2627	 * we add to pkt_len the header size of all segments
2628 */
2629 if (shinfo->gso_size) {
Eric Dumazet757b8b12013-01-15 21:14:21 -08002630 unsigned int hdr_len;
Jason Wang15e5a032013-03-25 20:19:59 +00002631 u16 gso_segs = shinfo->gso_segs;
Eric Dumazet1def9232013-01-10 12:36:42 +00002632
Eric Dumazet757b8b12013-01-15 21:14:21 -08002633 /* mac layer + network layer */
2634 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2635
2636 /* + transport layer */
Eric Dumazet1def9232013-01-10 12:36:42 +00002637 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2638 hdr_len += tcp_hdrlen(skb);
2639 else
2640 hdr_len += sizeof(struct udphdr);
Jason Wang15e5a032013-03-25 20:19:59 +00002641
2642 if (shinfo->gso_type & SKB_GSO_DODGY)
2643 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2644 shinfo->gso_size);
2645
2646 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
Eric Dumazet1def9232013-01-10 12:36:42 +00002647 }
2648}
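/*
 * Worked example (illustrative note, not part of the original source): for a
 * TSO skb with skb->len = 4410, gso_size = 1448 and gso_segs = 3, an assumed
 * Ethernet + IPv4 + TCP header of 14 + 20 + 32 = 66 bytes gives hdr_len = 66,
 * so qdisc_pkt_len_init() records
 *	pkt_len = 4410 + (3 - 1) * 66 = 4542
 * which is the number of bytes that actually leave the NIC once the hardware
 * has segmented the skb into three 1514-byte frames.
 */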
2649
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002650static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2651 struct net_device *dev,
2652 struct netdev_queue *txq)
2653{
2654 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002655 bool contended;
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002656 int rc;
2657
Eric Dumazet1def9232013-01-10 12:36:42 +00002658 qdisc_pkt_len_init(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002659 qdisc_calculate_pkt_len(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002660 /*
2661 * Heuristic to force contended enqueues to serialize on a
2662 * separate lock before trying to get qdisc main lock.
2663 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2664 * and dequeue packets faster.
2665 */
Eric Dumazeta2da5702011-01-20 03:48:19 +00002666 contended = qdisc_is_running(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002667 if (unlikely(contended))
2668 spin_lock(&q->busylock);
2669
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002670 spin_lock(root_lock);
2671 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2672 kfree_skb(skb);
2673 rc = NET_XMIT_DROP;
2674 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07002675 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002676 /*
2677 * This is a work-conserving queue; there are no old skbs
2678 * waiting to be sent out; and the qdisc is not running -
2679 * xmit the skb directly.
2680 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002681 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2682 skb_dst_force(skb);
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002683
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002684 qdisc_bstats_update(q, skb);
2685
Eric Dumazet79640a42010-06-02 05:09:29 -07002686 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2687 if (unlikely(contended)) {
2688 spin_unlock(&q->busylock);
2689 contended = false;
2690 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002691 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002692 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07002693 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002694
2695 rc = NET_XMIT_SUCCESS;
2696 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002697 skb_dst_force(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002698 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
Eric Dumazet79640a42010-06-02 05:09:29 -07002699 if (qdisc_run_begin(q)) {
2700 if (unlikely(contended)) {
2701 spin_unlock(&q->busylock);
2702 contended = false;
2703 }
2704 __qdisc_run(q);
2705 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002706 }
2707 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07002708 if (unlikely(contended))
2709 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002710 return rc;
2711}
2712
Neil Horman5bc14212011-11-22 05:10:51 +00002713#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2714static void skb_update_prio(struct sk_buff *skb)
2715{
Igor Maravic6977a792011-11-25 07:44:54 +00002716 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
Neil Horman5bc14212011-11-22 05:10:51 +00002717
Eric Dumazet91c68ce2012-07-08 21:45:10 +00002718 if (!skb->priority && skb->sk && map) {
2719 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2720
2721 if (prioidx < map->priomap_len)
2722 skb->priority = map->priomap[prioidx];
2723 }
Neil Horman5bc14212011-11-22 05:10:51 +00002724}
2725#else
2726#define skb_update_prio(skb)
2727#endif
2728
Eric Dumazet745e20f2010-09-29 13:23:09 -07002729static DEFINE_PER_CPU(int, xmit_recursion);
David S. Miller11a766c2010-10-25 12:51:55 -07002730#define RECURSION_LIMIT 10
Eric Dumazet745e20f2010-09-29 13:23:09 -07002731
Dave Jonesd29f7492008-07-22 14:09:06 -07002732/**
Michel Machado95603e22012-06-12 10:16:35 +00002733 * dev_loopback_xmit - loop back @skb
2734 * @skb: buffer to transmit
2735 */
2736int dev_loopback_xmit(struct sk_buff *skb)
2737{
2738 skb_reset_mac_header(skb);
2739 __skb_pull(skb, skb_network_offset(skb));
2740 skb->pkt_type = PACKET_LOOPBACK;
2741 skb->ip_summed = CHECKSUM_UNNECESSARY;
2742 WARN_ON(!skb_dst(skb));
2743 skb_dst_force(skb);
2744 netif_rx_ni(skb);
2745 return 0;
2746}
2747EXPORT_SYMBOL(dev_loopback_xmit);
2748
2749/**
Dave Jonesd29f7492008-07-22 14:09:06 -07002750 * dev_queue_xmit - transmit a buffer
2751 * @skb: buffer to transmit
2752 *
2753 * Queue a buffer for transmission to a network device. The caller must
2754 * have set the device and priority and built the buffer before calling
2755 * this function. The function can be called from an interrupt.
2756 *
2757 * A negative errno code is returned on a failure. A success does not
2758 * guarantee the frame will be transmitted as it may be dropped due
2759 * to congestion or traffic shaping.
2760 *
2761 * -----------------------------------------------------------------------------------
2762 * I notice this method can also return errors from the queue disciplines,
2763 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2764 * be positive.
2765 *
2766 * Regardless of the return value, the skb is consumed, so it is currently
2767 * difficult to retry a send to this method. (You can bump the ref count
2768 * before sending to hold a reference for retry if you are careful.)
2769 *
2770 * When calling this method, interrupts MUST be enabled. This is because
2771 * the BH enable code must have IRQs enabled so that it will not deadlock.
2772 * --BLG
2773 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002774int dev_queue_xmit(struct sk_buff *skb)
2775{
2776 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002777 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002778 struct Qdisc *q;
2779 int rc = -ENOMEM;
2780
Eric Dumazet6d1ccff2013-02-05 20:22:20 +00002781 skb_reset_mac_header(skb);
2782
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002783 /* Disable soft irqs for various locks below. Also
2784 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002785 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002786 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002787
Neil Horman5bc14212011-11-22 05:10:51 +00002788 skb_update_prio(skb);
2789
Amerigo Wang8c4c49d2012-09-17 20:16:31 +00002790 txq = netdev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002791 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002792
Linus Torvalds1da177e2005-04-16 15:20:36 -07002793#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002794 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002795#endif
Koki Sanagicf66ba52010-08-23 18:45:02 +09002796 trace_net_dev_queue(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002797 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002798 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002799 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002800 }
2801
2802 /* The device has no queue. Common case for software devices:
2803	   loopback, all sorts of tunnels...
2804
Herbert Xu932ff272006-06-09 12:20:56 -07002805	   Really, it is unlikely that netif_tx_lock protection is necessary
2806	   here. (e.g. loopback and IP tunnels are clean, ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002807	   counters.)
2808	   However, it is possible that they rely on the protection
2809	   provided by us here.
2810
2811	   Check this and take the lock. It is not prone to deadlocks.
2812	   Or just shoot the noqueue qdisc, it is even simpler 8)
2813 */
2814 if (dev->flags & IFF_UP) {
2815 int cpu = smp_processor_id(); /* ok because BHs are off */
2816
David S. Millerc773e842008-07-08 23:13:53 -07002817 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002818
Eric Dumazet745e20f2010-09-29 13:23:09 -07002819 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2820 goto recursion_alert;
2821
David S. Millerc773e842008-07-08 23:13:53 -07002822 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002823
Tom Herbert734664982011-11-28 16:32:44 +00002824 if (!netif_xmit_stopped(txq)) {
Eric Dumazet745e20f2010-09-29 13:23:09 -07002825 __this_cpu_inc(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002826 rc = dev_hard_start_xmit(skb, dev, txq);
Eric Dumazet745e20f2010-09-29 13:23:09 -07002827 __this_cpu_dec(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002828 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002829 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002830 goto out;
2831 }
2832 }
David S. Millerc773e842008-07-08 23:13:53 -07002833 HARD_TX_UNLOCK(dev, txq);
Joe Perchese87cc472012-05-13 21:56:26 +00002834 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2835 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002836 } else {
2837 /* Recursion is detected! It is possible,
Eric Dumazet745e20f2010-09-29 13:23:09 -07002838 * unfortunately
2839 */
2840recursion_alert:
Joe Perchese87cc472012-05-13 21:56:26 +00002841 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2842 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002843 }
2844 }
2845
2846 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002847 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002848
Linus Torvalds1da177e2005-04-16 15:20:36 -07002849 kfree_skb(skb);
2850 return rc;
2851out:
Herbert Xud4828d82006-06-22 02:28:18 -07002852 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002853 return rc;
2854}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002855EXPORT_SYMBOL(dev_queue_xmit);
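/*
 * Illustrative sketch (not part of dev.c): a minimal in-kernel sender could
 * hand a locally built frame to dev_queue_xmit() as below.  The
 * example_build_frame() helper is an assumption for illustration only.
 */
static int example_send(struct net_device *dev)
{
	struct sk_buff *skb = example_build_frame(dev);	/* assumed helper */

	if (!skb)
		return -ENOMEM;
	skb->dev = dev;				/* caller must set the device ... */
	skb->priority = TC_PRIO_CONTROL;	/* ... and the priority */
	/* the skb is consumed whatever happens; do not touch it afterwards */
	return dev_queue_xmit(skb);
}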
Linus Torvalds1da177e2005-04-16 15:20:36 -07002856
2857
2858/*=======================================================================
2859 Receiver routines
2860 =======================================================================*/
2861
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002862int netdev_max_backlog __read_mostly = 1000;
Eric Dumazetc9e6bc62012-09-27 19:29:05 +00002863EXPORT_SYMBOL(netdev_max_backlog);
2864
Eric Dumazet3b098e22010-05-15 23:57:10 -07002865int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002866int netdev_budget __read_mostly = 300;
2867int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002868
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002869/* Called with irq disabled */
2870static inline void ____napi_schedule(struct softnet_data *sd,
2871 struct napi_struct *napi)
2872{
2873 list_add_tail(&napi->poll_list, &sd->poll_list);
2874 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2875}
2876
Eric Dumazetdf334542010-03-24 19:13:54 +00002877#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002878
2879/* One global table that all flow-based protocols share. */
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002880struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002881EXPORT_SYMBOL(rps_sock_flow_table);
2882
Ingo Molnarc5905af2012-02-24 08:31:31 +01002883struct static_key rps_needed __read_mostly;
Eric Dumazetadc93002011-11-17 03:13:26 +00002884
Ben Hutchingsc4454772011-01-19 11:03:53 +00002885static struct rps_dev_flow *
2886set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2887 struct rps_dev_flow *rflow, u16 next_cpu)
2888{
Ben Hutchings09994d12011-10-03 04:42:46 +00002889 if (next_cpu != RPS_NO_CPU) {
Ben Hutchingsc4454772011-01-19 11:03:53 +00002890#ifdef CONFIG_RFS_ACCEL
2891 struct netdev_rx_queue *rxqueue;
2892 struct rps_dev_flow_table *flow_table;
2893 struct rps_dev_flow *old_rflow;
2894 u32 flow_id;
2895 u16 rxq_index;
2896 int rc;
2897
2898 /* Should we steer this flow to a different hardware queue? */
Ben Hutchings69a19ee2011-02-15 20:32:04 +00002899 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2900 !(dev->features & NETIF_F_NTUPLE))
Ben Hutchingsc4454772011-01-19 11:03:53 +00002901 goto out;
2902 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2903 if (rxq_index == skb_get_rx_queue(skb))
2904 goto out;
2905
2906 rxqueue = dev->_rx + rxq_index;
2907 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2908 if (!flow_table)
2909 goto out;
2910 flow_id = skb->rxhash & flow_table->mask;
2911 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2912 rxq_index, flow_id);
2913 if (rc < 0)
2914 goto out;
2915 old_rflow = rflow;
2916 rflow = &flow_table->flows[flow_id];
Ben Hutchingsc4454772011-01-19 11:03:53 +00002917 rflow->filter = rc;
2918 if (old_rflow->filter == rflow->filter)
2919 old_rflow->filter = RPS_NO_FILTER;
2920 out:
2921#endif
2922 rflow->last_qtail =
Ben Hutchings09994d12011-10-03 04:42:46 +00002923 per_cpu(softnet_data, next_cpu).input_queue_head;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002924 }
2925
Ben Hutchings09994d12011-10-03 04:42:46 +00002926 rflow->cpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002927 return rflow;
2928}
2929
Tom Herbert0a9627f2010-03-16 08:03:29 +00002930/*
2931 * get_rps_cpu is called from netif_receive_skb and returns the target
2932 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002933 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002934 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002935static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2936 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002937{
Tom Herbert0a9627f2010-03-16 08:03:29 +00002938 struct netdev_rx_queue *rxqueue;
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002939 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002940 struct rps_dev_flow_table *flow_table;
2941 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002942 int cpu = -1;
Tom Herbertfec5e652010-04-16 16:01:27 -07002943 u16 tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002944
Tom Herbert0a9627f2010-03-16 08:03:29 +00002945 if (skb_rx_queue_recorded(skb)) {
2946 u16 index = skb_get_rx_queue(skb);
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002947 if (unlikely(index >= dev->real_num_rx_queues)) {
2948 WARN_ONCE(dev->real_num_rx_queues > 1,
2949 "%s received packet on queue %u, but number "
2950 "of RX queues is %u\n",
2951 dev->name, index, dev->real_num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002952 goto done;
2953 }
2954 rxqueue = dev->_rx + index;
2955 } else
2956 rxqueue = dev->_rx;
2957
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002958 map = rcu_dereference(rxqueue->rps_map);
2959 if (map) {
Tom Herbert85875232011-01-31 16:23:42 -08002960 if (map->len == 1 &&
Eric Dumazet33d480c2011-08-11 19:30:52 +00002961 !rcu_access_pointer(rxqueue->rps_flow_table)) {
Changli Gao6febfca2010-09-03 23:12:37 +00002962 tcpu = map->cpus[0];
2963 if (cpu_online(tcpu))
2964 cpu = tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002965 goto done;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002966 }
Eric Dumazet33d480c2011-08-11 19:30:52 +00002967 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002968 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002969 }
2970
Changli Gao2d47b452010-08-17 19:00:56 +00002971 skb_reset_network_header(skb);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002972 if (!skb_get_rxhash(skb))
Tom Herbert0a9627f2010-03-16 08:03:29 +00002973 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002974
Tom Herbertfec5e652010-04-16 16:01:27 -07002975 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2976 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2977 if (flow_table && sock_flow_table) {
2978 u16 next_cpu;
2979 struct rps_dev_flow *rflow;
2980
2981 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2982 tcpu = rflow->cpu;
2983
2984 next_cpu = sock_flow_table->ents[skb->rxhash &
2985 sock_flow_table->mask];
2986
2987 /*
2988 * If the desired CPU (where last recvmsg was done) is
2989 * different from current CPU (one in the rx-queue flow
2990 * table entry), switch if one of the following holds:
2991 * - Current CPU is unset (equal to RPS_NO_CPU).
2992 * - Current CPU is offline.
2993 * - The current CPU's queue tail has advanced beyond the
2994 * last packet that was enqueued using this table entry.
2995 * This guarantees that all previous packets for the flow
2996 * have been dequeued, thus preserving in order delivery.
2997 */
2998 if (unlikely(tcpu != next_cpu) &&
2999 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3000 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
Tom Herbertbaefa312012-11-16 09:04:15 +00003001 rflow->last_qtail)) >= 0)) {
3002 tcpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00003003 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
Tom Herbertbaefa312012-11-16 09:04:15 +00003004 }
Ben Hutchingsc4454772011-01-19 11:03:53 +00003005
Tom Herbertfec5e652010-04-16 16:01:27 -07003006 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3007 *rflowp = rflow;
3008 cpu = tcpu;
3009 goto done;
3010 }
3011 }
3012
Tom Herbert0a9627f2010-03-16 08:03:29 +00003013 if (map) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003014 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00003015
3016 if (cpu_online(tcpu)) {
3017 cpu = tcpu;
3018 goto done;
3019 }
3020 }
3021
3022done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00003023 return cpu;
3024}
3025
Ben Hutchingsc4454772011-01-19 11:03:53 +00003026#ifdef CONFIG_RFS_ACCEL
3027
3028/**
3029 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3030 * @dev: Device on which the filter was set
3031 * @rxq_index: RX queue index
3032 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3033 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3034 *
3035 * Drivers that implement ndo_rx_flow_steer() should periodically call
3036 * this function for each installed filter and remove the filters for
3037 * which it returns %true.
3038 */
3039bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3040 u32 flow_id, u16 filter_id)
3041{
3042 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3043 struct rps_dev_flow_table *flow_table;
3044 struct rps_dev_flow *rflow;
3045 bool expire = true;
3046 int cpu;
3047
3048 rcu_read_lock();
3049 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3050 if (flow_table && flow_id <= flow_table->mask) {
3051 rflow = &flow_table->flows[flow_id];
3052 cpu = ACCESS_ONCE(rflow->cpu);
3053 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3054 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3055 rflow->last_qtail) <
3056 (int)(10 * flow_table->mask)))
3057 expire = false;
3058 }
3059 rcu_read_unlock();
3060 return expire;
3061}
3062EXPORT_SYMBOL(rps_may_expire_flow);
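/*
 * Illustrative sketch (not part of dev.c): a driver that implements
 * ndo_rx_flow_steer() might scan its hardware filter table periodically and
 * remove stale entries.  struct example_filter, its fields and
 * example_remove_filter() are assumptions for illustration only.
 */
static void example_expire_filters(struct net_device *dev,
				   struct example_filter *tbl, int nfilters)
{
	int i;

	for (i = 0; i < nfilters; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, i))
			example_remove_filter(dev, &tbl[i]);	/* assumed helper */
	}
}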
3063
3064#endif /* CONFIG_RFS_ACCEL */
3065
Tom Herbert0a9627f2010-03-16 08:03:29 +00003066/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003067static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003068{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003069 struct softnet_data *sd = data;
3070
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003071 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00003072 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003073}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003074
Tom Herbertfec5e652010-04-16 16:01:27 -07003075#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00003076
3077/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003078 * Check if this softnet_data structure belongs to another CPU.
3079 * If yes, queue it to our IPI list and return 1
3080 * If no, return 0
3081 */
3082static int rps_ipi_queued(struct softnet_data *sd)
3083{
3084#ifdef CONFIG_RPS
3085 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3086
3087 if (sd != mysd) {
3088 sd->rps_ipi_next = mysd->rps_ipi_list;
3089 mysd->rps_ipi_list = sd;
3090
3091 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3092 return 1;
3093 }
3094#endif /* CONFIG_RPS */
3095 return 0;
3096}
3097
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003098#ifdef CONFIG_NET_FLOW_LIMIT
3099int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3100#endif
3101
3102static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3103{
3104#ifdef CONFIG_NET_FLOW_LIMIT
3105 struct sd_flow_limit *fl;
3106 struct softnet_data *sd;
3107 unsigned int old_flow, new_flow;
3108
3109 if (qlen < (netdev_max_backlog >> 1))
3110 return false;
3111
3112 sd = &__get_cpu_var(softnet_data);
3113
3114 rcu_read_lock();
3115 fl = rcu_dereference(sd->flow_limit);
3116 if (fl) {
3117 new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3118 old_flow = fl->history[fl->history_head];
3119 fl->history[fl->history_head] = new_flow;
3120
3121 fl->history_head++;
3122 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3123
3124 if (likely(fl->buckets[old_flow]))
3125 fl->buckets[old_flow]--;
3126
3127 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3128 fl->count++;
3129 rcu_read_unlock();
3130 return true;
3131 }
3132 }
3133 rcu_read_unlock();
3134#endif
3135 return false;
3136}
3137
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003138/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00003139 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3140 * queue (may be a remote CPU queue).
3141 */
Tom Herbertfec5e652010-04-16 16:01:27 -07003142static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3143 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003144{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003145 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003146 unsigned long flags;
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003147 unsigned int qlen;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003148
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003149 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003150
3151 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003152
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003153 rps_lock(sd);
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003154 qlen = skb_queue_len(&sd->input_pkt_queue);
3155 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
Changli Gao6e7676c2010-04-27 15:07:33 -07003156 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00003157enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003158 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003159 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003160 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00003161 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003162 return NET_RX_SUCCESS;
3163 }
3164
Eric Dumazetebda37c22010-05-06 23:51:21 +00003165 /* Schedule NAPI for backlog device
3166	 * We can use a non-atomic operation since we own the queue lock
3167 */
3168 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003169 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003170 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003171 }
3172 goto enqueue;
3173 }
3174
Changli Gaodee42872010-05-02 05:42:16 +00003175 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003176 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003177
Tom Herbert0a9627f2010-03-16 08:03:29 +00003178 local_irq_restore(flags);
3179
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003180 atomic_long_inc(&skb->dev->rx_dropped);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003181 kfree_skb(skb);
3182 return NET_RX_DROP;
3183}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003184
Linus Torvalds1da177e2005-04-16 15:20:36 -07003185/**
3186 * netif_rx - post buffer to the network code
3187 * @skb: buffer to post
3188 *
3189 * This function receives a packet from a device driver and queues it for
3190 * the upper (protocol) levels to process. It always succeeds. The buffer
3191 * may be dropped during processing for congestion control or by the
3192 * protocol layers.
3193 *
3194 * return values:
3195 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003196 * NET_RX_DROP (packet was dropped)
3197 *
3198 */
3199
3200int netif_rx(struct sk_buff *skb)
3201{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003202 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003203
3204 /* if netpoll wants it, pretend we never saw it */
3205 if (netpoll_rx(skb))
3206 return NET_RX_DROP;
3207
Eric Dumazet588f0332011-11-15 04:12:55 +00003208 net_timestamp_check(netdev_tstamp_prequeue, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003209
Koki Sanagicf66ba52010-08-23 18:45:02 +09003210 trace_netif_rx(skb);
Eric Dumazetdf334542010-03-24 19:13:54 +00003211#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003212 if (static_key_false(&rps_needed)) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003213 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003214 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003215
Changli Gaocece1942010-08-07 20:35:43 -07003216 preempt_disable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003217 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003218
3219 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003220 if (cpu < 0)
3221 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07003222
3223 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3224
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003225 rcu_read_unlock();
Changli Gaocece1942010-08-07 20:35:43 -07003226 preempt_enable();
Eric Dumazetadc93002011-11-17 03:13:26 +00003227 } else
3228#endif
Tom Herbertfec5e652010-04-16 16:01:27 -07003229 {
3230 unsigned int qtail;
3231 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3232 put_cpu();
3233 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003234 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003235}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003236EXPORT_SYMBOL(netif_rx);
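/*
 * Illustrative sketch (not part of dev.c): a legacy, non-NAPI driver would
 * typically call netif_rx() from its interrupt handler once the frame has
 * been copied into an skb; the buffer handling below is simplified for
 * illustration.
 */
static void example_rx_interrupt(struct net_device *dev,
				 const void *frame, unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	skb_reserve(skb, NET_IP_ALIGN);
	memcpy(skb_put(skb, len), frame, len);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);	/* queues to the per-CPU backlog, never sleeps */
}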
Linus Torvalds1da177e2005-04-16 15:20:36 -07003237
3238int netif_rx_ni(struct sk_buff *skb)
3239{
3240 int err;
3241
3242 preempt_disable();
3243 err = netif_rx(skb);
3244 if (local_softirq_pending())
3245 do_softirq();
3246 preempt_enable();
3247
3248 return err;
3249}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003250EXPORT_SYMBOL(netif_rx_ni);
3251
Linus Torvalds1da177e2005-04-16 15:20:36 -07003252static void net_tx_action(struct softirq_action *h)
3253{
3254 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3255
3256 if (sd->completion_queue) {
3257 struct sk_buff *clist;
3258
3259 local_irq_disable();
3260 clist = sd->completion_queue;
3261 sd->completion_queue = NULL;
3262 local_irq_enable();
3263
3264 while (clist) {
3265 struct sk_buff *skb = clist;
3266 clist = clist->next;
3267
Ilpo Järvinen547b7922008-07-25 21:43:18 -07003268 WARN_ON(atomic_read(&skb->users));
Koki Sanagi07dc22e2010-08-23 18:46:12 +09003269 trace_kfree_skb(skb, net_tx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003270 __kfree_skb(skb);
3271 }
3272 }
3273
3274 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07003275 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003276
3277 local_irq_disable();
3278 head = sd->output_queue;
3279 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00003280 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003281 local_irq_enable();
3282
3283 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07003284 struct Qdisc *q = head;
3285 spinlock_t *root_lock;
3286
Linus Torvalds1da177e2005-04-16 15:20:36 -07003287 head = head->next_sched;
3288
David S. Miller5fb66222008-08-02 20:02:43 -07003289 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07003290 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07003291 smp_mb__before_clear_bit();
3292 clear_bit(__QDISC_STATE_SCHED,
3293 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07003294 qdisc_run(q);
3295 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003296 } else {
David S. Miller195648b2008-08-19 04:00:36 -07003297 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003298 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07003299 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003300 } else {
3301 smp_mb__before_clear_bit();
3302 clear_bit(__QDISC_STATE_SCHED,
3303 &q->state);
3304 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003305 }
3306 }
3307 }
3308}
3309
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003310#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3311 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
Michał Mirosławda678292009-06-05 05:35:28 +00003312/* This hook is defined here for ATM LANE */
3313int (*br_fdb_test_addr_hook)(struct net_device *dev,
3314 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07003315EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00003316#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003317
Linus Torvalds1da177e2005-04-16 15:20:36 -07003318#ifdef CONFIG_NET_CLS_ACT
3319/* TODO: Maybe we should just force sch_ingress to be compiled in
3320 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless instructions
3321 * (a compare and 2 extra stores) right now if we don't have it on
3322 * but do have CONFIG_NET_CLS_ACT.
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003323 * NOTE: This doesn't stop any functionality; if you don't have
3324 * the ingress scheduler, you just can't add policies on ingress.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003325 *
3326 */
Eric Dumazet24824a02010-10-02 06:11:55 +00003327static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003328{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003329 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003330 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07003331 int result = TC_ACT_OK;
3332 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003333
Stephen Hemmingerde384832010-08-01 00:33:23 -07003334 if (unlikely(MAX_RED_LOOP < ttl++)) {
Joe Perchese87cc472012-05-13 21:56:26 +00003335 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3336 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07003337 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003338 }
3339
Herbert Xuf697c3e2007-10-14 00:38:47 -07003340 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3341 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3342
David S. Miller83874002008-07-17 00:53:03 -07003343 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07003344 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07003345 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07003346 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3347 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07003348 spin_unlock(qdisc_lock(q));
3349 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07003350
Linus Torvalds1da177e2005-04-16 15:20:36 -07003351 return result;
3352}
Herbert Xuf697c3e2007-10-14 00:38:47 -07003353
3354static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3355 struct packet_type **pt_prev,
3356 int *ret, struct net_device *orig_dev)
3357{
Eric Dumazet24824a02010-10-02 06:11:55 +00003358 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3359
3360 if (!rxq || rxq->qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07003361 goto out;
3362
3363 if (*pt_prev) {
3364 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3365 *pt_prev = NULL;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003366 }
3367
Eric Dumazet24824a02010-10-02 06:11:55 +00003368 switch (ing_filter(skb, rxq)) {
Herbert Xuf697c3e2007-10-14 00:38:47 -07003369 case TC_ACT_SHOT:
3370 case TC_ACT_STOLEN:
3371 kfree_skb(skb);
3372 return NULL;
3373 }
3374
3375out:
3376 skb->tc_verd = 0;
3377 return skb;
3378}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003379#endif
3380
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003381/**
3382 * netdev_rx_handler_register - register receive handler
3383 * @dev: device to register a handler for
3384 * @rx_handler: receive handler to register
Jiri Pirko93e2c322010-06-10 03:34:59 +00003385 * @rx_handler_data: data pointer that is used by rx handler
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003386 *
3387 * Register a receive handler for a device. This handler will then be
3388 * called from __netif_receive_skb. A negative errno code is returned
3389 * on a failure.
3390 *
3391 * The caller must hold the rtnl_mutex.
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003392 *
3393 * For a general description of rx_handler, see enum rx_handler_result.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003394 */
3395int netdev_rx_handler_register(struct net_device *dev,
Jiri Pirko93e2c322010-06-10 03:34:59 +00003396 rx_handler_func_t *rx_handler,
3397 void *rx_handler_data)
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003398{
3399 ASSERT_RTNL();
3400
3401 if (dev->rx_handler)
3402 return -EBUSY;
3403
Eric Dumazet00cfec32013-03-29 03:01:22 +00003404 /* Note: rx_handler_data must be set before rx_handler */
Jiri Pirko93e2c322010-06-10 03:34:59 +00003405 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003406 rcu_assign_pointer(dev->rx_handler, rx_handler);
3407
3408 return 0;
3409}
3410EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
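/*
 * Illustrative sketch (not part of dev.c): a bridge-like module could claim a
 * port device with netdev_rx_handler_register() as below.  struct
 * example_port and the (empty) handler body are assumptions for illustration.
 */
static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct example_port *port =
		rcu_dereference((*pskb)->dev->rx_handler_data);

	/* inspect, consume or redirect the frame based on port state here */
	(void)port;
	return RX_HANDLER_PASS;
}

static int example_attach_port(struct net_device *dev,
			       struct example_port *port)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, example_handle_frame, port);
	rtnl_unlock();
	return err;
}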
3411
3412/**
3413 * netdev_rx_handler_unregister - unregister receive handler
3414 * @dev: device to unregister a handler from
3415 *
Kusanagi Kouichi166ec362013-03-18 02:59:52 +00003416 * Unregister a receive handler from a device.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003417 *
3418 * The caller must hold the rtnl_mutex.
3419 */
3420void netdev_rx_handler_unregister(struct net_device *dev)
3421{
3422
3423 ASSERT_RTNL();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003424 RCU_INIT_POINTER(dev->rx_handler, NULL);
Eric Dumazet00cfec32013-03-29 03:01:22 +00003425	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3426	 * section is guaranteed to see a non-NULL rx_handler_data
3427 * as well.
3428 */
3429 synchronize_net();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003430 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003431}
3432EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3433
Mel Gormanb4b9e352012-07-31 16:44:26 -07003434/*
3435 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3436 * the special handling of PFMEMALLOC skbs.
3437 */
3438static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3439{
3440 switch (skb->protocol) {
3441 case __constant_htons(ETH_P_ARP):
3442 case __constant_htons(ETH_P_IP):
3443 case __constant_htons(ETH_P_IPV6):
3444 case __constant_htons(ETH_P_8021Q):
Patrick McHardy8ad227f2013-04-19 02:04:31 +00003445 case __constant_htons(ETH_P_8021AD):
Mel Gormanb4b9e352012-07-31 16:44:26 -07003446 return true;
3447 default:
3448 return false;
3449 }
3450}
3451
David S. Miller9754e292013-02-14 15:57:38 -05003452static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003453{
3454 struct packet_type *ptype, *pt_prev;
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003455 rx_handler_func_t *rx_handler;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003456 struct net_device *orig_dev;
David S. Miller63d8ea72011-02-28 10:48:59 -08003457 struct net_device *null_or_dev;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003458 bool deliver_exact = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003459 int ret = NET_RX_DROP;
Al Viro252e3342006-11-14 20:48:11 -08003460 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003461
Eric Dumazet588f0332011-11-15 04:12:55 +00003462 net_timestamp_check(!netdev_tstamp_prequeue, skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07003463
Koki Sanagicf66ba52010-08-23 18:45:02 +09003464 trace_netif_receive_skb(skb);
Patrick McHardy9b22ea52008-11-04 14:49:57 -08003465
Linus Torvalds1da177e2005-04-16 15:20:36 -07003466 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003467 if (netpoll_receive_skb(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003468 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003469
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07003470 orig_dev = skb->dev;
Jiri Pirko1765a572011-02-12 06:48:36 +00003471
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003472 skb_reset_network_header(skb);
Eric Dumazetfda55ec2013-01-07 09:28:21 +00003473 if (!skb_transport_header_was_set(skb))
3474 skb_reset_transport_header(skb);
Jiri Pirko0b5c9db2011-06-10 06:56:58 +00003475 skb_reset_mac_len(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003476
3477 pt_prev = NULL;
3478
3479 rcu_read_lock();
3480
David S. Miller63d8ea72011-02-28 10:48:59 -08003481another_round:
David S. Millerb6858172012-07-23 16:27:54 -07003482 skb->skb_iif = skb->dev->ifindex;
David S. Miller63d8ea72011-02-28 10:48:59 -08003483
3484 __this_cpu_inc(softnet_data.processed);
3485
Patrick McHardy8ad227f2013-04-19 02:04:31 +00003486 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3487 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003488 skb = vlan_untag(skb);
3489 if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003490 goto unlock;
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003491 }
3492
Linus Torvalds1da177e2005-04-16 15:20:36 -07003493#ifdef CONFIG_NET_CLS_ACT
3494 if (skb->tc_verd & TC_NCLS) {
3495 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3496 goto ncls;
3497 }
3498#endif
3499
David S. Miller9754e292013-02-14 15:57:38 -05003500 if (pfmemalloc)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003501 goto skip_taps;
3502
Linus Torvalds1da177e2005-04-16 15:20:36 -07003503 list_for_each_entry_rcu(ptype, &ptype_all, list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003504 if (!ptype->dev || ptype->dev == skb->dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003505 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003506 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003507 pt_prev = ptype;
3508 }
3509 }
3510
Mel Gormanb4b9e352012-07-31 16:44:26 -07003511skip_taps:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003512#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07003513 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3514 if (!skb)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003515 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003516ncls:
3517#endif
3518
David S. Miller9754e292013-02-14 15:57:38 -05003519 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003520 goto drop;
3521
John Fastabend24257172011-10-10 09:16:41 +00003522 if (vlan_tx_tag_present(skb)) {
3523 if (pt_prev) {
3524 ret = deliver_skb(skb, pt_prev, orig_dev);
3525 pt_prev = NULL;
3526 }
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003527 if (vlan_do_receive(&skb))
John Fastabend24257172011-10-10 09:16:41 +00003528 goto another_round;
3529 else if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003530 goto unlock;
John Fastabend24257172011-10-10 09:16:41 +00003531 }
3532
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003533 rx_handler = rcu_dereference(skb->dev->rx_handler);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003534 if (rx_handler) {
3535 if (pt_prev) {
3536 ret = deliver_skb(skb, pt_prev, orig_dev);
3537 pt_prev = NULL;
3538 }
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003539 switch (rx_handler(&skb)) {
3540 case RX_HANDLER_CONSUMED:
Cristian Bercaru3bc1b1a2013-03-08 07:03:38 +00003541 ret = NET_RX_SUCCESS;
Mel Gormanb4b9e352012-07-31 16:44:26 -07003542 goto unlock;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003543 case RX_HANDLER_ANOTHER:
David S. Miller63d8ea72011-02-28 10:48:59 -08003544 goto another_round;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003545 case RX_HANDLER_EXACT:
3546 deliver_exact = true;
3547 case RX_HANDLER_PASS:
3548 break;
3549 default:
3550 BUG();
3551 }
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003552 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003553
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003554 if (vlan_tx_nonzero_tag_present(skb))
3555 skb->pkt_type = PACKET_OTHERHOST;
3556
David S. Miller63d8ea72011-02-28 10:48:59 -08003557 /* deliver only exact match when indicated */
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003558 null_or_dev = deliver_exact ? skb->dev : NULL;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003559
Linus Torvalds1da177e2005-04-16 15:20:36 -07003560 type = skb->protocol;
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003561 list_for_each_entry_rcu(ptype,
3562 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003563 if (ptype->type == type &&
Jiri Pirkoe3f48d32011-02-28 20:26:31 +00003564 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3565 ptype->dev == orig_dev)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003566 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003567 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003568 pt_prev = ptype;
3569 }
3570 }
3571
3572 if (pt_prev) {
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003573 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
Michael S. Tsirkin0e698bf2012-09-15 22:44:16 +00003574 goto drop;
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003575 else
3576 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003577 } else {
Mel Gormanb4b9e352012-07-31 16:44:26 -07003578drop:
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003579 atomic_long_inc(&skb->dev->rx_dropped);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003580 kfree_skb(skb);
3581		/* Jamal, now you will not be able to escape explaining
3582		 * to me how you were going to use this. :-)
3583 */
3584 ret = NET_RX_DROP;
3585 }
3586
Mel Gormanb4b9e352012-07-31 16:44:26 -07003587unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003588 rcu_read_unlock();
Mel Gormanb4b9e352012-07-31 16:44:26 -07003589out:
David S. Miller9754e292013-02-14 15:57:38 -05003590 return ret;
3591}
3592
3593static int __netif_receive_skb(struct sk_buff *skb)
3594{
3595 int ret;
3596
3597 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3598 unsigned long pflags = current->flags;
3599
3600 /*
3601 * PFMEMALLOC skbs are special, they should
3602 * - be delivered to SOCK_MEMALLOC sockets only
3603 * - stay away from userspace
3604 * - have bounded memory usage
3605 *
3606 * Use PF_MEMALLOC as this saves us from propagating the allocation
3607 * context down to all allocation sites.
3608 */
3609 current->flags |= PF_MEMALLOC;
3610 ret = __netif_receive_skb_core(skb, true);
3611 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3612 } else
3613 ret = __netif_receive_skb_core(skb, false);
3614
Linus Torvalds1da177e2005-04-16 15:20:36 -07003615 return ret;
3616}
Tom Herbert0a9627f2010-03-16 08:03:29 +00003617
3618/**
3619 * netif_receive_skb - process receive buffer from network
3620 * @skb: buffer to process
3621 *
3622 * netif_receive_skb() is the main receive data processing function.
3623 * It always succeeds. The buffer may be dropped during processing
3624 * for congestion control or by the protocol layers.
3625 *
3626 * This function may only be called from softirq context and interrupts
3627 * should be enabled.
3628 *
3629 * Return values (usually ignored):
3630 * NET_RX_SUCCESS: no congestion
3631 * NET_RX_DROP: packet was dropped
3632 */
3633int netif_receive_skb(struct sk_buff *skb)
3634{
Eric Dumazet588f0332011-11-15 04:12:55 +00003635 net_timestamp_check(netdev_tstamp_prequeue, skb);
Eric Dumazet3b098e22010-05-15 23:57:10 -07003636
Richard Cochranc1f19b52010-07-17 08:49:36 +00003637 if (skb_defer_rx_timestamp(skb))
3638 return NET_RX_SUCCESS;
3639
Eric Dumazetdf334542010-03-24 19:13:54 +00003640#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003641 if (static_key_false(&rps_needed)) {
Eric Dumazet3b098e22010-05-15 23:57:10 -07003642 struct rps_dev_flow voidflow, *rflow = &voidflow;
3643 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003644
Eric Dumazet3b098e22010-05-15 23:57:10 -07003645 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00003646
Eric Dumazet3b098e22010-05-15 23:57:10 -07003647 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07003648
Eric Dumazet3b098e22010-05-15 23:57:10 -07003649 if (cpu >= 0) {
3650 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3651 rcu_read_unlock();
Eric Dumazetadc93002011-11-17 03:13:26 +00003652 return ret;
Eric Dumazet3b098e22010-05-15 23:57:10 -07003653 }
Eric Dumazetadc93002011-11-17 03:13:26 +00003654 rcu_read_unlock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003655 }
Tom Herbert1e94d722010-03-18 17:45:44 -07003656#endif
Eric Dumazetadc93002011-11-17 03:13:26 +00003657 return __netif_receive_skb(skb);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003658}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003659EXPORT_SYMBOL(netif_receive_skb);
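/*
 * Illustrative sketch (not part of dev.c): a NAPI driver's poll routine runs
 * in softirq context and may deliver frames straight to netif_receive_skb()
 * (or napi_gro_receive() when GRO is wanted).  example_next_rx_skb() is an
 * assumption for illustration only.
 */
static int example_poll(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int work = 0;

	while (work < budget && (skb = example_next_rx_skb(napi)) != NULL) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		netif_receive_skb(skb);
		work++;
	}
	if (work < budget)
		napi_complete(napi);
	return work;
}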
Linus Torvalds1da177e2005-04-16 15:20:36 -07003660
Eric Dumazet88751272010-04-19 05:07:33 +00003661/* Network device is going away, flush any packets still pending
3662 * Called with irqs disabled.
3663 */
Changli Gao152102c2010-03-30 20:16:22 +00003664static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003665{
Changli Gao152102c2010-03-30 20:16:22 +00003666 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003667 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003668 struct sk_buff *skb, *tmp;
3669
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003670 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003671 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003672 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003673 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003674 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003675 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003676 }
Changli Gao6e7676c2010-04-27 15:07:33 -07003677 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003678 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003679
3680 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3681 if (skb->dev == dev) {
3682 __skb_unlink(skb, &sd->process_queue);
3683 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003684 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003685 }
3686 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003687}
3688
Herbert Xud565b0a2008-12-15 23:38:52 -08003689static int napi_gro_complete(struct sk_buff *skb)
3690{
Vlad Yasevich22061d82012-11-15 08:49:11 +00003691 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003692 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003693 struct list_head *head = &offload_base;
Herbert Xud565b0a2008-12-15 23:38:52 -08003694 int err = -ENOENT;
3695
Eric Dumazetc3c7c252012-12-06 13:54:59 +00003696 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3697
Herbert Xufc59f9a2009-04-14 15:11:06 -07003698 if (NAPI_GRO_CB(skb)->count == 1) {
3699 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003700 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07003701 }
Herbert Xud565b0a2008-12-15 23:38:52 -08003702
3703 rcu_read_lock();
3704 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003705 if (ptype->type != type || !ptype->callbacks.gro_complete)
Herbert Xud565b0a2008-12-15 23:38:52 -08003706 continue;
3707
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003708 err = ptype->callbacks.gro_complete(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003709 break;
3710 }
3711 rcu_read_unlock();
3712
3713 if (err) {
3714 WARN_ON(&ptype->list == head);
3715 kfree_skb(skb);
3716 return NET_RX_SUCCESS;
3717 }
3718
3719out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003720 return netif_receive_skb(skb);
3721}
3722
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003723/* napi->gro_list contains packets ordered by age.
3724 * youngest packets at the head of it.
3725 * Complete skbs in reverse order to reduce latencies.
3726 */
3727void napi_gro_flush(struct napi_struct *napi, bool flush_old)
Herbert Xud565b0a2008-12-15 23:38:52 -08003728{
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003729 struct sk_buff *skb, *prev = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003730
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003731 /* scan list and build reverse chain */
3732 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3733 skb->prev = prev;
3734 prev = skb;
Herbert Xud565b0a2008-12-15 23:38:52 -08003735 }
3736
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003737 for (skb = prev; skb; skb = prev) {
3738 skb->next = NULL;
3739
3740 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3741 return;
3742
3743 prev = skb->prev;
3744 napi_gro_complete(skb);
3745 napi->gro_count--;
3746 }
3747
Herbert Xud565b0a2008-12-15 23:38:52 -08003748 napi->gro_list = NULL;
3749}
Eric Dumazet86cac582010-08-31 18:25:32 +00003750EXPORT_SYMBOL(napi_gro_flush);
Herbert Xud565b0a2008-12-15 23:38:52 -08003751
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003752static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3753{
3754 struct sk_buff *p;
3755 unsigned int maclen = skb->dev->hard_header_len;
3756
3757 for (p = napi->gro_list; p; p = p->next) {
3758 unsigned long diffs;
3759
3760 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3761 diffs |= p->vlan_tci ^ skb->vlan_tci;
3762 if (maclen == ETH_HLEN)
3763 diffs |= compare_ether_header(skb_mac_header(p),
3764 skb_gro_mac_header(skb));
3765 else if (!diffs)
3766 diffs = memcmp(skb_mac_header(p),
3767 skb_gro_mac_header(skb),
3768 maclen);
3769 NAPI_GRO_CB(p)->same_flow = !diffs;
3770 NAPI_GRO_CB(p)->flush = 0;
3771 }
3772}
3773
Rami Rosenbb728822012-11-28 21:55:25 +00003774static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003775{
3776 struct sk_buff **pp = NULL;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003777 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003778 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003779 struct list_head *head = &offload_base;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003780 int same_flow;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003781 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003782
Jarek Poplawskice9e76c2010-08-05 01:19:11 +00003783 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
Herbert Xud565b0a2008-12-15 23:38:52 -08003784 goto normal;
3785
David S. Miller21dc3302010-08-23 00:13:46 -07003786 if (skb_is_gso(skb) || skb_has_frag_list(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003787 goto normal;
3788
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003789 gro_list_prepare(napi, skb);
3790
Herbert Xud565b0a2008-12-15 23:38:52 -08003791 rcu_read_lock();
3792 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003793 if (ptype->type != type || !ptype->callbacks.gro_receive)
Herbert Xud565b0a2008-12-15 23:38:52 -08003794 continue;
3795
Herbert Xu86911732009-01-29 14:19:50 +00003796 skb_set_network_header(skb, skb_gro_offset(skb));
Eric Dumazetefd94502013-02-14 17:31:48 +00003797 skb_reset_mac_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003798 NAPI_GRO_CB(skb)->same_flow = 0;
3799 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003800 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003801
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003802 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003803 break;
3804 }
3805 rcu_read_unlock();
3806
3807 if (&ptype->list == head)
3808 goto normal;
3809
Herbert Xu0da2afd52008-12-26 14:57:42 -08003810 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003811 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003812
Herbert Xud565b0a2008-12-15 23:38:52 -08003813 if (pp) {
3814 struct sk_buff *nskb = *pp;
3815
3816 *pp = nskb->next;
3817 nskb->next = NULL;
3818 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003819 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003820 }
3821
Herbert Xu0da2afd52008-12-26 14:57:42 -08003822 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003823 goto ok;
3824
Herbert Xu4ae55442009-02-08 18:00:36 +00003825 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003826 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003827
Herbert Xu4ae55442009-02-08 18:00:36 +00003828 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003829 NAPI_GRO_CB(skb)->count = 1;
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003830 NAPI_GRO_CB(skb)->age = jiffies;
Herbert Xu86911732009-01-29 14:19:50 +00003831 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003832 skb->next = napi->gro_list;
3833 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003834 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003835
Herbert Xuad0f9902009-02-01 01:24:55 -08003836pull:
Herbert Xucb189782009-05-26 18:50:31 +00003837 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3838 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3839
3840 BUG_ON(skb->end - skb->tail < grow);
3841
3842 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3843
3844 skb->tail += grow;
3845 skb->data_len -= grow;
3846
3847 skb_shinfo(skb)->frags[0].page_offset += grow;
Eric Dumazet9e903e02011-10-18 21:00:24 +00003848 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
Herbert Xucb189782009-05-26 18:50:31 +00003849
Eric Dumazet9e903e02011-10-18 21:00:24 +00003850 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
Ian Campbellea2ab692011-08-22 23:44:58 +00003851 skb_frag_unref(skb, 0);
Herbert Xucb189782009-05-26 18:50:31 +00003852 memmove(skb_shinfo(skb)->frags,
3853 skb_shinfo(skb)->frags + 1,
Jarek Poplawskie5093ae2010-08-11 02:02:10 +00003854 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
Herbert Xucb189782009-05-26 18:50:31 +00003855 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003856 }
3857
Herbert Xud565b0a2008-12-15 23:38:52 -08003858ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003859 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003860
3861normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003862 ret = GRO_NORMAL;
3863 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003864}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003865
Herbert Xu96e93ea2009-01-06 10:49:34 -08003866
Rami Rosenbb728822012-11-28 21:55:25 +00003867static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003868{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003869 switch (ret) {
3870 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003871 if (netif_receive_skb(skb))
3872 ret = GRO_DROP;
3873 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003874
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003875 case GRO_DROP:
Herbert Xu5d38a072009-01-04 16:13:40 -08003876 kfree_skb(skb);
3877 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003878
Eric Dumazetdaa86542012-04-19 07:07:40 +00003879 case GRO_MERGED_FREE:
Eric Dumazetd7e88832012-04-30 08:10:34 +00003880 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3881 kmem_cache_free(skbuff_head_cache, skb);
3882 else
3883 __kfree_skb(skb);
Eric Dumazetdaa86542012-04-19 07:07:40 +00003884 break;
3885
Ben Hutchings5b252f02009-10-29 07:17:09 +00003886 case GRO_HELD:
3887 case GRO_MERGED:
3888 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003889 }
3890
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003891 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003892}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003893
Eric Dumazetca07e432012-10-06 22:28:06 +00003894static void skb_gro_reset_offset(struct sk_buff *skb)
Herbert Xu78a478d2009-05-26 18:50:21 +00003895{
Eric Dumazetca07e432012-10-06 22:28:06 +00003896 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3897 const skb_frag_t *frag0 = &pinfo->frags[0];
3898
Herbert Xu78a478d2009-05-26 18:50:21 +00003899 NAPI_GRO_CB(skb)->data_offset = 0;
3900 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003901 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003902
Simon Hormanced14f62013-05-28 20:34:25 +00003903 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
Eric Dumazetca07e432012-10-06 22:28:06 +00003904 pinfo->nr_frags &&
3905 !PageHighMem(skb_frag_page(frag0))) {
3906 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3907 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
Herbert Xu74895942009-05-26 18:50:27 +00003908 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003909}
Herbert Xu78a478d2009-05-26 18:50:21 +00003910
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003911gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003912{
Herbert Xu86911732009-01-29 14:19:50 +00003913 skb_gro_reset_offset(skb);
3914
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003915 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003916}
3917EXPORT_SYMBOL(napi_gro_receive);
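
/* Illustrative sketch (not part of the original file): how a hypothetical
 * Ethernet driver might hand a freshly received frame to GRO from its NAPI
 * poll loop. The skb is assumed to already hold the packet data; the
 * "example_" names are invented for illustration only.
 */
static void __maybe_unused example_gro_rx_one(struct napi_struct *napi,
					      struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);	/* GRO may merge, hold or deliver the skb */
}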
3918
stephen hemmingerd0c2b0d2010-10-19 07:12:10 +00003919static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003920{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003921 __skb_pull(skb, skb_headlen(skb));
Eric Dumazet2a2a4592012-03-21 06:58:03 +00003922 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3923 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
Jesse Gross3701e512010-10-20 13:56:06 +00003924 skb->vlan_tci = 0;
Herbert Xu66c46d72011-01-29 20:44:54 -08003925 skb->dev = napi->dev;
Andy Gospodarek6d152e22011-02-02 14:53:25 -08003926 skb->skb_iif = 0;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003927
3928 napi->skb = skb;
3929}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003930
Herbert Xu76620aa2009-04-16 02:02:07 -07003931struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003932{
Herbert Xu5d38a072009-01-04 16:13:40 -08003933 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003934
3935 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003936 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3937 if (skb)
3938 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003939 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003940 return skb;
3941}
Herbert Xu76620aa2009-04-16 02:02:07 -07003942EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003943
Rami Rosenbb728822012-11-28 21:55:25 +00003944static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003945 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003946{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003947 switch (ret) {
3948 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003949 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003950 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003951
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003952 if (ret == GRO_HELD)
3953 skb_gro_pull(skb, -ETH_HLEN);
3954 else if (netif_receive_skb(skb))
3955 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003956 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003957
3958 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003959 case GRO_MERGED_FREE:
3960 napi_reuse_skb(napi, skb);
3961 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003962
3963 case GRO_MERGED:
3964 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003965 }
3966
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003967 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003968}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003969
Eric Dumazet4adb9c42012-05-18 20:49:06 +00003970static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003971{
Herbert Xu76620aa2009-04-16 02:02:07 -07003972 struct sk_buff *skb = napi->skb;
3973 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00003974 unsigned int hlen;
3975 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07003976
3977 napi->skb = NULL;
3978
3979 skb_reset_mac_header(skb);
3980 skb_gro_reset_offset(skb);
3981
Herbert Xua5b1cf22009-05-26 18:50:28 +00003982 off = skb_gro_offset(skb);
3983 hlen = off + sizeof(*eth);
3984 eth = skb_gro_header_fast(skb, off);
3985 if (skb_gro_header_hard(skb, hlen)) {
3986 eth = skb_gro_header_slow(skb, hlen, off);
3987 if (unlikely(!eth)) {
3988 napi_reuse_skb(napi, skb);
3989 skb = NULL;
3990 goto out;
3991 }
Herbert Xu76620aa2009-04-16 02:02:07 -07003992 }
3993
3994 skb_gro_pull(skb, sizeof(*eth));
3995
3996 /*
3997 * This works because the only protocols we care about don't require
3998 * special handling. We'll fix it up properly at the end.
3999 */
4000 skb->protocol = eth->h_proto;
4001
4002out:
4003 return skb;
4004}
Herbert Xu76620aa2009-04-16 02:02:07 -07004005
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004006gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07004007{
4008 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08004009
4010 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004011 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08004012
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004013 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08004014}
4015EXPORT_SYMBOL(napi_gro_frags);
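
/* Illustrative sketch (not part of the original file): the napi_get_frags()/
 * napi_gro_frags() pattern used by drivers that receive directly into page
 * fragments. The page, offset and length are assumed to come from the
 * driver's RX ring; the truesize update is approximate.
 */
static void __maybe_unused example_gro_rx_frag(struct napi_struct *napi,
					       struct page *page,
					       unsigned int off,
					       unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb)
		return;			/* allocation failed: drop the frame */

	skb_fill_page_desc(skb, 0, page, off, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += len;

	/* napi_gro_frags() pulls the Ethernet header and sets skb->protocol */
	napi_gro_frags(napi);
}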
4016
Eric Dumazete326bed2010-04-22 00:22:45 -07004017/*
 4018 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4019 * Note: called with local irq disabled, but exits with local irq enabled.
4020 */
4021static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4022{
4023#ifdef CONFIG_RPS
4024 struct softnet_data *remsd = sd->rps_ipi_list;
4025
4026 if (remsd) {
4027 sd->rps_ipi_list = NULL;
4028
4029 local_irq_enable();
4030
 4031 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4032 while (remsd) {
4033 struct softnet_data *next = remsd->rps_ipi_next;
4034
4035 if (cpu_online(remsd->cpu))
4036 __smp_call_function_single(remsd->cpu,
4037 &remsd->csd, 0);
4038 remsd = next;
4039 }
4040 } else
4041#endif
4042 local_irq_enable();
4043}
4044
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004045static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004046{
4047 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004048 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004049
Eric Dumazete326bed2010-04-22 00:22:45 -07004050#ifdef CONFIG_RPS
 4051 /* Check if we have pending IPIs; it is better to send them now
 4052 * rather than waiting for net_rx_action() to end.
4053 */
4054 if (sd->rps_ipi_list) {
4055 local_irq_disable();
4056 net_rps_action_and_irq_enable(sd);
4057 }
4058#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004059 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07004060 local_irq_disable();
4061 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004062 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07004063 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004064
Changli Gao6e7676c2010-04-27 15:07:33 -07004065 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07004066 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07004067 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07004068 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00004069 input_queue_head_incr(sd);
4070 if (++work >= quota) {
4071 local_irq_enable();
4072 return work;
4073 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004074 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004075
Changli Gao6e7676c2010-04-27 15:07:33 -07004076 rps_lock(sd);
4077 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004078 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07004079 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4080 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004081
Changli Gao6e7676c2010-04-27 15:07:33 -07004082 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004083 /*
4084 * Inline a custom version of __napi_complete().
 4085 * Only the current CPU owns and manipulates this napi,
 4086 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
 4087 * so we can use a plain write instead of clear_bit()
 4088 * and we don't need an smp_mb() memory barrier.
4089 */
4090 list_del(&napi->poll_list);
4091 napi->state = 0;
4092
Changli Gao6e7676c2010-04-27 15:07:33 -07004093 quota = work + qlen;
4094 }
4095 rps_unlock(sd);
4096 }
4097 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004098
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004099 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004100}
4101
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004102/**
4103 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004104 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004105 *
4106 * The entry's receive function will be scheduled to run
4107 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08004108void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004109{
4110 unsigned long flags;
4111
4112 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004113 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004114 local_irq_restore(flags);
4115}
4116EXPORT_SYMBOL(__napi_schedule);
4117
Herbert Xud565b0a2008-12-15 23:38:52 -08004118void __napi_complete(struct napi_struct *n)
4119{
4120 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4121 BUG_ON(n->gro_list);
4122
4123 list_del(&n->poll_list);
4124 smp_mb__before_clear_bit();
4125 clear_bit(NAPI_STATE_SCHED, &n->state);
4126}
4127EXPORT_SYMBOL(__napi_complete);
4128
4129void napi_complete(struct napi_struct *n)
4130{
4131 unsigned long flags;
4132
4133 /*
 4134 * Don't let napi dequeue from the CPU poll list
 4135 * just in case it's running on a different CPU.
4136 */
4137 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4138 return;
4139
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004140 napi_gro_flush(n, false);
Herbert Xud565b0a2008-12-15 23:38:52 -08004141 local_irq_save(flags);
4142 __napi_complete(n);
4143 local_irq_restore(flags);
4144}
4145EXPORT_SYMBOL(napi_complete);
4146
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004147/* must be called under rcu_read_lock(), as we don't take a reference */
4148struct napi_struct *napi_by_id(unsigned int napi_id)
4149{
4150 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4151 struct napi_struct *napi;
4152
4153 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4154 if (napi->napi_id == napi_id)
4155 return napi;
4156
4157 return NULL;
4158}
4159EXPORT_SYMBOL_GPL(napi_by_id);
4160
4161void napi_hash_add(struct napi_struct *napi)
4162{
4163 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4164
4165 spin_lock(&napi_hash_lock);
4166
 4167 /* 0 is not a valid id; we also skip an id that is already taken.
 4168 * We expect both events to be extremely rare.
4169 */
4170 napi->napi_id = 0;
4171 while (!napi->napi_id) {
4172 napi->napi_id = ++napi_gen_id;
4173 if (napi_by_id(napi->napi_id))
4174 napi->napi_id = 0;
4175 }
4176
4177 hlist_add_head_rcu(&napi->napi_hash_node,
4178 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4179
4180 spin_unlock(&napi_hash_lock);
4181 }
4182}
4183EXPORT_SYMBOL_GPL(napi_hash_add);
4184
 4185/* Warning: the caller is responsible for making sure an RCU grace period
 4186 * has elapsed before freeing the memory containing @napi.
4187 */
4188void napi_hash_del(struct napi_struct *napi)
4189{
4190 spin_lock(&napi_hash_lock);
4191
4192 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4193 hlist_del_rcu(&napi->napi_hash_node);
4194
4195 spin_unlock(&napi_hash_lock);
4196}
4197EXPORT_SYMBOL_GPL(napi_hash_del);
4198
Herbert Xud565b0a2008-12-15 23:38:52 -08004199void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4200 int (*poll)(struct napi_struct *, int), int weight)
4201{
4202 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00004203 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004204 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08004205 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08004206 napi->poll = poll;
Eric Dumazet82dc3c62013-03-05 15:57:22 +00004207 if (weight > NAPI_POLL_WEIGHT)
4208 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4209 weight, dev->name);
Herbert Xud565b0a2008-12-15 23:38:52 -08004210 napi->weight = weight;
4211 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08004212 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08004213#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08004214 spin_lock_init(&napi->poll_lock);
4215 napi->poll_owner = -1;
4216#endif
4217 set_bit(NAPI_STATE_SCHED, &napi->state);
4218}
4219EXPORT_SYMBOL(netif_napi_add);
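
/* Illustrative sketch (not part of the original file): minimal NAPI setup for
 * a hypothetical driver. The poll handler does no real work here; a real
 * driver would drain its RX ring and feed packets to napi_gro_receive(),
 * incrementing 'work' for each packet processed.
 */
static int __maybe_unused example_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	/* ... dequeue up to 'budget' packets here, incrementing 'work' ... */

	if (work < budget)
		napi_complete(napi);	/* a real driver re-enables interrupts here */

	return work;
}

static void __maybe_unused example_napi_setup(struct net_device *dev,
					      struct napi_struct *napi)
{
	netif_napi_add(dev, napi, example_poll, NAPI_POLL_WEIGHT);
	napi_enable(napi);
}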
4220
4221void netif_napi_del(struct napi_struct *napi)
4222{
4223 struct sk_buff *skb, *next;
4224
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08004225 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07004226 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08004227
4228 for (skb = napi->gro_list; skb; skb = next) {
4229 next = skb->next;
4230 skb->next = NULL;
4231 kfree_skb(skb);
4232 }
4233
4234 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00004235 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004236}
4237EXPORT_SYMBOL(netif_napi_del);
4238
Linus Torvalds1da177e2005-04-16 15:20:36 -07004239static void net_rx_action(struct softirq_action *h)
4240{
Eric Dumazete326bed2010-04-22 00:22:45 -07004241 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004242 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07004243 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07004244 void *have;
4245
Linus Torvalds1da177e2005-04-16 15:20:36 -07004246 local_irq_disable();
4247
Eric Dumazete326bed2010-04-22 00:22:45 -07004248 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004249 struct napi_struct *n;
4250 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004251
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004252 /* If the softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004253 * Allow this to run for 2 jiffies, which allows
 4254 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004255 */
Eric Dumazetd1f41b62013-03-05 07:15:13 +00004256 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004257 goto softnet_break;
4258
4259 local_irq_enable();
4260
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004261 /* Even though interrupts have been re-enabled, this
4262 * access is safe because interrupts can only add new
4263 * entries to the tail of this list, and only ->poll()
4264 * calls can remove this head entry from the list.
4265 */
Eric Dumazete326bed2010-04-22 00:22:45 -07004266 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004267
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004268 have = netpoll_poll_lock(n);
4269
4270 weight = n->weight;
4271
David S. Miller0a7606c2007-10-29 21:28:47 -07004272 /* This NAPI_STATE_SCHED test is for avoiding a race
4273 * with netpoll's poll_napi(). Only the entity which
4274 * obtains the lock and sees NAPI_STATE_SCHED set will
4275 * actually make the ->poll() call. Therefore we avoid
Lucas De Marchi25985ed2011-03-30 22:57:33 -03004276 * accidentally calling ->poll() when NAPI is not scheduled.
David S. Miller0a7606c2007-10-29 21:28:47 -07004277 */
4278 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00004279 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07004280 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00004281 trace_napi_poll(n);
4282 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004283
4284 WARN_ON_ONCE(work > weight);
4285
4286 budget -= work;
4287
4288 local_irq_disable();
4289
4290 /* Drivers must not modify the NAPI state if they
4291 * consume the entire weight. In such cases this code
4292 * still "owns" the NAPI instance and therefore can
4293 * move the instance around on the list at-will.
4294 */
David S. Millerfed17f32008-01-07 21:00:40 -08004295 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07004296 if (unlikely(napi_disable_pending(n))) {
4297 local_irq_enable();
4298 napi_complete(n);
4299 local_irq_disable();
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004300 } else {
4301 if (n->gro_list) {
 4302 /* Flush packets that are too old.
4303 * If HZ < 1000, flush all packets.
4304 */
4305 local_irq_enable();
4306 napi_gro_flush(n, HZ >= 1000);
4307 local_irq_disable();
4308 }
Eric Dumazete326bed2010-04-22 00:22:45 -07004309 list_move_tail(&n->poll_list, &sd->poll_list);
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004310 }
David S. Millerfed17f32008-01-07 21:00:40 -08004311 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004312
4313 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004314 }
4315out:
Eric Dumazete326bed2010-04-22 00:22:45 -07004316 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00004317
Chris Leechdb217332006-06-17 21:24:58 -07004318#ifdef CONFIG_NET_DMA
4319 /*
4320 * There may not be any more sk_buffs coming right now, so push
4321 * any pending DMA copies to hardware
4322 */
Dan Williams2ba05622009-01-06 11:38:14 -07004323 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07004324#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004325
Linus Torvalds1da177e2005-04-16 15:20:36 -07004326 return;
4327
4328softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00004329 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004330 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4331 goto out;
4332}
4333
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004334struct netdev_upper {
4335 struct net_device *dev;
4336 bool master;
4337 struct list_head list;
4338 struct rcu_head rcu;
4339 struct list_head search_list;
4340};
4341
4342static void __append_search_uppers(struct list_head *search_list,
4343 struct net_device *dev)
4344{
4345 struct netdev_upper *upper;
4346
4347 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4348 /* check if this upper is not already in search list */
4349 if (list_empty(&upper->search_list))
4350 list_add_tail(&upper->search_list, search_list);
4351 }
4352}
4353
4354static bool __netdev_search_upper_dev(struct net_device *dev,
4355 struct net_device *upper_dev)
4356{
4357 LIST_HEAD(search_list);
4358 struct netdev_upper *upper;
4359 struct netdev_upper *tmp;
4360 bool ret = false;
4361
4362 __append_search_uppers(&search_list, dev);
4363 list_for_each_entry(upper, &search_list, search_list) {
4364 if (upper->dev == upper_dev) {
4365 ret = true;
4366 break;
4367 }
4368 __append_search_uppers(&search_list, upper->dev);
4369 }
4370 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4371 INIT_LIST_HEAD(&upper->search_list);
4372 return ret;
4373}
4374
4375static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4376 struct net_device *upper_dev)
4377{
4378 struct netdev_upper *upper;
4379
4380 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4381 if (upper->dev == upper_dev)
4382 return upper;
4383 }
4384 return NULL;
4385}
4386
4387/**
4388 * netdev_has_upper_dev - Check if device is linked to an upper device
4389 * @dev: device
4390 * @upper_dev: upper device to check
4391 *
 4392 * Find out if a device is linked to the specified upper device and return true
 4393 * if it is. Note that this checks only the immediate upper device,
 4394 * not the complete stack of devices. The caller must hold the RTNL lock.
4395 */
4396bool netdev_has_upper_dev(struct net_device *dev,
4397 struct net_device *upper_dev)
4398{
4399 ASSERT_RTNL();
4400
4401 return __netdev_find_upper(dev, upper_dev);
4402}
4403EXPORT_SYMBOL(netdev_has_upper_dev);
4404
4405/**
4406 * netdev_has_any_upper_dev - Check if device is linked to some device
4407 * @dev: device
4408 *
4409 * Find out if a device is linked to an upper device and return true in case
4410 * it is. The caller must hold the RTNL lock.
4411 */
4412bool netdev_has_any_upper_dev(struct net_device *dev)
4413{
4414 ASSERT_RTNL();
4415
4416 return !list_empty(&dev->upper_dev_list);
4417}
4418EXPORT_SYMBOL(netdev_has_any_upper_dev);
4419
4420/**
4421 * netdev_master_upper_dev_get - Get master upper device
4422 * @dev: device
4423 *
4424 * Find a master upper device and return pointer to it or NULL in case
4425 * it's not there. The caller must hold the RTNL lock.
4426 */
4427struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4428{
4429 struct netdev_upper *upper;
4430
4431 ASSERT_RTNL();
4432
4433 if (list_empty(&dev->upper_dev_list))
4434 return NULL;
4435
4436 upper = list_first_entry(&dev->upper_dev_list,
4437 struct netdev_upper, list);
4438 if (likely(upper->master))
4439 return upper->dev;
4440 return NULL;
4441}
4442EXPORT_SYMBOL(netdev_master_upper_dev_get);
4443
4444/**
4445 * netdev_master_upper_dev_get_rcu - Get master upper device
4446 * @dev: device
4447 *
4448 * Find a master upper device and return pointer to it or NULL in case
4449 * it's not there. The caller must hold the RCU read lock.
4450 */
4451struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4452{
4453 struct netdev_upper *upper;
4454
4455 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4456 struct netdev_upper, list);
4457 if (upper && likely(upper->master))
4458 return upper->dev;
4459 return NULL;
4460}
4461EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4462
4463static int __netdev_upper_dev_link(struct net_device *dev,
4464 struct net_device *upper_dev, bool master)
4465{
4466 struct netdev_upper *upper;
4467
4468 ASSERT_RTNL();
4469
4470 if (dev == upper_dev)
4471 return -EBUSY;
4472
 4473 /* To prevent loops, check that dev is not already an upper device of upper_dev. */
4474 if (__netdev_search_upper_dev(upper_dev, dev))
4475 return -EBUSY;
4476
4477 if (__netdev_find_upper(dev, upper_dev))
4478 return -EEXIST;
4479
4480 if (master && netdev_master_upper_dev_get(dev))
4481 return -EBUSY;
4482
4483 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4484 if (!upper)
4485 return -ENOMEM;
4486
4487 upper->dev = upper_dev;
4488 upper->master = master;
4489 INIT_LIST_HEAD(&upper->search_list);
4490
4491 /* Ensure that master upper link is always the first item in list. */
4492 if (master)
4493 list_add_rcu(&upper->list, &dev->upper_dev_list);
4494 else
4495 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4496 dev_hold(upper_dev);
Jiri Pirko42e52bf2013-05-25 04:12:10 +00004497 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004498 return 0;
4499}
4500
4501/**
4502 * netdev_upper_dev_link - Add a link to the upper device
4503 * @dev: device
4504 * @upper_dev: new upper device
4505 *
 4506 * Adds a link to a device which is upper to this one. The caller must hold
4507 * the RTNL lock. On a failure a negative errno code is returned.
4508 * On success the reference counts are adjusted and the function
4509 * returns zero.
4510 */
4511int netdev_upper_dev_link(struct net_device *dev,
4512 struct net_device *upper_dev)
4513{
4514 return __netdev_upper_dev_link(dev, upper_dev, false);
4515}
4516EXPORT_SYMBOL(netdev_upper_dev_link);
4517
4518/**
4519 * netdev_master_upper_dev_link - Add a master link to the upper device
4520 * @dev: device
4521 * @upper_dev: new upper device
4522 *
 4523 * Adds a link to a device which is upper to this one. In this case, only
4524 * one master upper device can be linked, although other non-master devices
4525 * might be linked as well. The caller must hold the RTNL lock.
4526 * On a failure a negative errno code is returned. On success the reference
4527 * counts are adjusted and the function returns zero.
4528 */
4529int netdev_master_upper_dev_link(struct net_device *dev,
4530 struct net_device *upper_dev)
4531{
4532 return __netdev_upper_dev_link(dev, upper_dev, true);
4533}
4534EXPORT_SYMBOL(netdev_master_upper_dev_link);
4535
4536/**
4537 * netdev_upper_dev_unlink - Removes a link to upper device
4538 * @dev: device
 4539 * @upper_dev: upper device to unlink
 4540 *
 4541 * Removes a link to a device which is upper to this one. The caller must hold
4542 * the RTNL lock.
4543 */
4544void netdev_upper_dev_unlink(struct net_device *dev,
4545 struct net_device *upper_dev)
4546{
4547 struct netdev_upper *upper;
4548
4549 ASSERT_RTNL();
4550
4551 upper = __netdev_find_upper(dev, upper_dev);
4552 if (!upper)
4553 return;
4554 list_del_rcu(&upper->list);
4555 dev_put(upper_dev);
4556 kfree_rcu(upper, rcu);
Jiri Pirko42e52bf2013-05-25 04:12:10 +00004557 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004558}
4559EXPORT_SYMBOL(netdev_upper_dev_unlink);
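
/* Illustrative sketch (not part of the original file): how a bonding-style
 * driver might link and unlink a slave under its master device. Both calls
 * must be made with the RTNL lock held; error handling is minimal and the
 * "example_" helpers are invented for illustration only.
 */
static int __maybe_unused example_enslave(struct net_device *master,
					  struct net_device *slave)
{
	ASSERT_RTNL();

	return netdev_master_upper_dev_link(slave, master);
}

static void __maybe_unused example_release(struct net_device *master,
					   struct net_device *slave)
{
	ASSERT_RTNL();

	netdev_upper_dev_unlink(slave, master);
}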
4560
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004561static void dev_change_rx_flags(struct net_device *dev, int flags)
4562{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004563 const struct net_device_ops *ops = dev->netdev_ops;
4564
4565 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4566 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004567}
4568
Wang Chendad9b332008-06-18 01:48:28 -07004569static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07004570{
Eric Dumazetb536db92011-11-30 21:42:26 +00004571 unsigned int old_flags = dev->flags;
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06004572 kuid_t uid;
4573 kgid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07004574
Patrick McHardy24023452007-07-14 18:51:31 -07004575 ASSERT_RTNL();
4576
Wang Chendad9b332008-06-18 01:48:28 -07004577 dev->flags |= IFF_PROMISC;
4578 dev->promiscuity += inc;
4579 if (dev->promiscuity == 0) {
4580 /*
4581 * Avoid overflow.
 4582 * If inc causes overflow, leave promiscuity untouched and return an error.
4583 */
4584 if (inc < 0)
4585 dev->flags &= ~IFF_PROMISC;
4586 else {
4587 dev->promiscuity -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004588 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4589 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07004590 return -EOVERFLOW;
4591 }
4592 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004593 if (dev->flags != old_flags) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004594 pr_info("device %s %s promiscuous mode\n",
4595 dev->name,
4596 dev->flags & IFF_PROMISC ? "entered" : "left");
David Howells8192b0c2008-11-14 10:39:10 +11004597 if (audit_enabled) {
4598 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004599 audit_log(current->audit_context, GFP_ATOMIC,
4600 AUDIT_ANOM_PROMISCUOUS,
4601 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4602 dev->name, (dev->flags & IFF_PROMISC),
4603 (old_flags & IFF_PROMISC),
Eric W. Biedermane1760bd2012-09-10 22:39:43 -07004604 from_kuid(&init_user_ns, audit_get_loginuid(current)),
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06004605 from_kuid(&init_user_ns, uid),
4606 from_kgid(&init_user_ns, gid),
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004607 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11004608 }
Patrick McHardy24023452007-07-14 18:51:31 -07004609
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004610 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07004611 }
Wang Chendad9b332008-06-18 01:48:28 -07004612 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004613}
4614
Linus Torvalds1da177e2005-04-16 15:20:36 -07004615/**
4616 * dev_set_promiscuity - update promiscuity count on a device
4617 * @dev: device
4618 * @inc: modifier
4619 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07004620 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004621 * remains above zero the interface remains promiscuous. Once it hits zero
 4622 * the device reverts to normal filtering operation. A negative inc
4623 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07004624 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004625 */
Wang Chendad9b332008-06-18 01:48:28 -07004626int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004627{
Eric Dumazetb536db92011-11-30 21:42:26 +00004628 unsigned int old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07004629 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004630
Wang Chendad9b332008-06-18 01:48:28 -07004631 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07004632 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07004633 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07004634 if (dev->flags != old_flags)
4635 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07004636 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004637}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004638EXPORT_SYMBOL(dev_set_promiscuity);
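
/* Illustrative sketch (not part of the original file): a packet-capture style
 * user of dev_set_promiscuity(). The counter is incremented while capturing
 * and decremented afterwards; both calls require the RTNL lock.
 */
static int __maybe_unused example_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();
	return err;
}

static void __maybe_unused example_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);
	rtnl_unlock();
}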
Linus Torvalds1da177e2005-04-16 15:20:36 -07004639
4640/**
4641 * dev_set_allmulti - update allmulti count on a device
4642 * @dev: device
4643 * @inc: modifier
4644 *
 4645 * Add or remove reception of all multicast frames on a device. While the
 4646 * count in the device remains above zero the interface keeps listening
 4647 * to all multicast frames. Once it hits zero the device reverts to normal
4648 * filtering operation. A negative @inc value is used to drop the counter
4649 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07004650 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004651 */
4652
Wang Chendad9b332008-06-18 01:48:28 -07004653int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004654{
Eric Dumazetb536db92011-11-30 21:42:26 +00004655 unsigned int old_flags = dev->flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004656
Patrick McHardy24023452007-07-14 18:51:31 -07004657 ASSERT_RTNL();
4658
Linus Torvalds1da177e2005-04-16 15:20:36 -07004659 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07004660 dev->allmulti += inc;
4661 if (dev->allmulti == 0) {
4662 /*
4663 * Avoid overflow.
 4664 * If inc causes overflow, leave allmulti untouched and return an error.
4665 */
4666 if (inc < 0)
4667 dev->flags &= ~IFF_ALLMULTI;
4668 else {
4669 dev->allmulti -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004670 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4671 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07004672 return -EOVERFLOW;
4673 }
4674 }
Patrick McHardy24023452007-07-14 18:51:31 -07004675 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004676 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07004677 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07004678 }
Wang Chendad9b332008-06-18 01:48:28 -07004679 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004680}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004681EXPORT_SYMBOL(dev_set_allmulti);
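
/* Illustrative sketch (not part of the original file): a multicast-routing
 * style user of dev_set_allmulti(), bumping the counter while it needs to see
 * every multicast frame on the interface. Requires the RTNL lock.
 */
static int __maybe_unused example_allmulti_hold(struct net_device *dev, bool on)
{
	int err;

	rtnl_lock();
	err = dev_set_allmulti(dev, on ? 1 : -1);
	rtnl_unlock();
	return err;
}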
Patrick McHardy4417da62007-06-27 01:28:10 -07004682
4683/*
4684 * Upload unicast and multicast address lists to device and
4685 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08004686 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07004687 * are present.
4688 */
4689void __dev_set_rx_mode(struct net_device *dev)
4690{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004691 const struct net_device_ops *ops = dev->netdev_ops;
4692
Patrick McHardy4417da62007-06-27 01:28:10 -07004693 /* dev_open will call this function so the list will stay sane. */
4694 if (!(dev->flags&IFF_UP))
4695 return;
4696
4697 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09004698 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07004699
Jiri Pirko01789342011-08-16 06:29:00 +00004700 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004701 /* Unicast address changes may only happen under the rtnl,
4702 * therefore calling __dev_set_promiscuity here is safe.
4703 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004704 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004705 __dev_set_promiscuity(dev, 1);
Joe Perches2d348d12011-07-25 16:17:35 -07004706 dev->uc_promisc = true;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004707 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004708 __dev_set_promiscuity(dev, -1);
Joe Perches2d348d12011-07-25 16:17:35 -07004709 dev->uc_promisc = false;
Patrick McHardy4417da62007-06-27 01:28:10 -07004710 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004711 }
Jiri Pirko01789342011-08-16 06:29:00 +00004712
4713 if (ops->ndo_set_rx_mode)
4714 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004715}
4716
4717void dev_set_rx_mode(struct net_device *dev)
4718{
David S. Millerb9e40852008-07-15 00:15:08 -07004719 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004720 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07004721 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004722}
4723
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004724/**
4725 * dev_get_flags - get flags reported to userspace
4726 * @dev: device
4727 *
4728 * Get the combination of flag bits exported through APIs to userspace.
4729 */
Eric Dumazet95c96172012-04-15 05:58:06 +00004730unsigned int dev_get_flags(const struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004731{
Eric Dumazet95c96172012-04-15 05:58:06 +00004732 unsigned int flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004733
4734 flags = (dev->flags & ~(IFF_PROMISC |
4735 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08004736 IFF_RUNNING |
4737 IFF_LOWER_UP |
4738 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07004739 (dev->gflags & (IFF_PROMISC |
4740 IFF_ALLMULTI));
4741
Stefan Rompfb00055a2006-03-20 17:09:11 -08004742 if (netif_running(dev)) {
4743 if (netif_oper_up(dev))
4744 flags |= IFF_RUNNING;
4745 if (netif_carrier_ok(dev))
4746 flags |= IFF_LOWER_UP;
4747 if (netif_dormant(dev))
4748 flags |= IFF_DORMANT;
4749 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004750
4751 return flags;
4752}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004753EXPORT_SYMBOL(dev_get_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004754
Patrick McHardybd380812010-02-26 06:34:53 +00004755int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004756{
Eric Dumazetb536db92011-11-30 21:42:26 +00004757 unsigned int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004758 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004759
Patrick McHardy24023452007-07-14 18:51:31 -07004760 ASSERT_RTNL();
4761
Linus Torvalds1da177e2005-04-16 15:20:36 -07004762 /*
4763 * Set the flags on our device.
4764 */
4765
4766 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4767 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4768 IFF_AUTOMEDIA)) |
4769 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4770 IFF_ALLMULTI));
4771
4772 /*
 4773 * Load in the correct multicast list now that the flags have changed.
4774 */
4775
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004776 if ((old_flags ^ flags) & IFF_MULTICAST)
4777 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07004778
Patrick McHardy4417da62007-06-27 01:28:10 -07004779 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004780
4781 /*
 4782 * Have we downed the interface? We handle IFF_UP ourselves
4783 * according to user attempts to set it, rather than blindly
4784 * setting it.
4785 */
4786
4787 ret = 0;
4788 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00004789 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004790
4791 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07004792 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004793 }
4794
Linus Torvalds1da177e2005-04-16 15:20:36 -07004795 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004796 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4797
Linus Torvalds1da177e2005-04-16 15:20:36 -07004798 dev->gflags ^= IFF_PROMISC;
4799 dev_set_promiscuity(dev, inc);
4800 }
4801
4802 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 4803 is important. Some (broken) drivers set IFF_PROMISC when
 4804 IFF_ALLMULTI is requested, without asking us and without reporting it.
4805 */
4806 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004807 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4808
Linus Torvalds1da177e2005-04-16 15:20:36 -07004809 dev->gflags ^= IFF_ALLMULTI;
4810 dev_set_allmulti(dev, inc);
4811 }
4812
Patrick McHardybd380812010-02-26 06:34:53 +00004813 return ret;
4814}
4815
4816void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4817{
4818 unsigned int changes = dev->flags ^ old_flags;
4819
4820 if (changes & IFF_UP) {
4821 if (dev->flags & IFF_UP)
4822 call_netdevice_notifiers(NETDEV_UP, dev);
4823 else
4824 call_netdevice_notifiers(NETDEV_DOWN, dev);
4825 }
4826
4827 if (dev->flags & IFF_UP &&
Jiri Pirkobe9efd32013-05-28 01:30:22 +00004828 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
4829 struct netdev_notifier_change_info change_info;
4830
4831 change_info.flags_changed = changes;
4832 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
4833 &change_info.info);
4834 }
Patrick McHardybd380812010-02-26 06:34:53 +00004835}
4836
4837/**
4838 * dev_change_flags - change device settings
4839 * @dev: device
4840 * @flags: device state flags
4841 *
 4842 * Change settings on a device based on state flags. The flags are
4843 * in the userspace exported format.
4844 */
Eric Dumazetb536db92011-11-30 21:42:26 +00004845int dev_change_flags(struct net_device *dev, unsigned int flags)
Patrick McHardybd380812010-02-26 06:34:53 +00004846{
Eric Dumazetb536db92011-11-30 21:42:26 +00004847 int ret;
4848 unsigned int changes, old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004849
4850 ret = __dev_change_flags(dev, flags);
4851 if (ret < 0)
4852 return ret;
4853
4854 changes = old_flags ^ dev->flags;
Thomas Graf7c355f52007-06-05 16:03:03 -07004855 if (changes)
4856 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004857
Patrick McHardybd380812010-02-26 06:34:53 +00004858 __dev_notify_flags(dev, old_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004859 return ret;
4860}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004861EXPORT_SYMBOL(dev_change_flags);
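
/* Illustrative sketch (not part of the original file): the in-kernel
 * equivalent of "ip link set dev X up", built from dev_get_flags() and
 * dev_change_flags() under the RTNL lock.
 */
static int __maybe_unused example_bring_up(struct net_device *dev)
{
	unsigned int flags;
	int err;

	rtnl_lock();
	flags = dev_get_flags(dev);
	err = dev_change_flags(dev, flags | IFF_UP);
	rtnl_unlock();
	return err;
}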
Linus Torvalds1da177e2005-04-16 15:20:36 -07004862
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004863/**
4864 * dev_set_mtu - Change maximum transfer unit
4865 * @dev: device
4866 * @new_mtu: new transfer unit
4867 *
4868 * Change the maximum transfer size of the network device.
4869 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004870int dev_set_mtu(struct net_device *dev, int new_mtu)
4871{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004872 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004873 int err;
4874
4875 if (new_mtu == dev->mtu)
4876 return 0;
4877
 4878 /* MTU must not be negative. */
4879 if (new_mtu < 0)
4880 return -EINVAL;
4881
4882 if (!netif_device_present(dev))
4883 return -ENODEV;
4884
4885 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004886 if (ops->ndo_change_mtu)
4887 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004888 else
4889 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004890
Jiri Pirkoe3d8fab2012-12-03 01:16:32 +00004891 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004892 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004893 return err;
4894}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004895EXPORT_SYMBOL(dev_set_mtu);
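
/* Illustrative sketch (not part of the original file): changing the MTU from
 * another kernel subsystem. The 9000-byte value is an arbitrary example; the
 * driver may still reject it via its ndo_change_mtu() callback.
 */
static int __maybe_unused example_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();
	return err;
}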
Linus Torvalds1da177e2005-04-16 15:20:36 -07004896
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004897/**
Vlad Dogarucbda10f2011-01-13 23:38:30 +00004898 * dev_set_group - Change group this device belongs to
4899 * @dev: device
4900 * @new_group: group this device should belong to
4901 */
4902void dev_set_group(struct net_device *dev, int new_group)
4903{
4904 dev->group = new_group;
4905}
4906EXPORT_SYMBOL(dev_set_group);
4907
4908/**
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004909 * dev_set_mac_address - Change Media Access Control Address
4910 * @dev: device
4911 * @sa: new address
4912 *
4913 * Change the hardware (MAC) address of the device
4914 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004915int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4916{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004917 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004918 int err;
4919
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004920 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004921 return -EOPNOTSUPP;
4922 if (sa->sa_family != dev->type)
4923 return -EINVAL;
4924 if (!netif_device_present(dev))
4925 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004926 err = ops->ndo_set_mac_address(dev, sa);
Jiri Pirkof6521512013-01-01 03:30:14 +00004927 if (err)
4928 return err;
Jiri Pirkofbdeca22013-01-01 03:30:16 +00004929 dev->addr_assign_type = NET_ADDR_SET;
Jiri Pirkof6521512013-01-01 03:30:14 +00004930 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04004931 add_device_randomness(dev->dev_addr, dev->addr_len);
Jiri Pirkof6521512013-01-01 03:30:14 +00004932 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004933}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004934EXPORT_SYMBOL(dev_set_mac_address);
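
/* Illustrative sketch (not part of the original file): programming a new
 * hardware address. The address is copied into a struct sockaddr of the
 * device's type, as dev_set_mac_address() expects; 'addr' is assumed to be
 * dev->addr_len bytes long.
 */
static int __maybe_unused example_set_mac(struct net_device *dev,
					  const u8 *addr)
{
	struct sockaddr sa;
	int err;

	if (dev->addr_len > sizeof(sa.sa_data))
		return -EINVAL;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}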
Linus Torvalds1da177e2005-04-16 15:20:36 -07004935
Jiri Pirko4bf84c32012-12-27 23:49:37 +00004936/**
4937 * dev_change_carrier - Change device carrier
4938 * @dev: device
Randy Dunlap691b3b72013-03-04 12:32:43 +00004939 * @new_carrier: new value
Jiri Pirko4bf84c32012-12-27 23:49:37 +00004940 *
4941 * Change device carrier
4942 */
4943int dev_change_carrier(struct net_device *dev, bool new_carrier)
4944{
4945 const struct net_device_ops *ops = dev->netdev_ops;
4946
4947 if (!ops->ndo_change_carrier)
4948 return -EOPNOTSUPP;
4949 if (!netif_device_present(dev))
4950 return -ENODEV;
4951 return ops->ndo_change_carrier(dev, new_carrier);
4952}
4953EXPORT_SYMBOL(dev_change_carrier);
4954
Linus Torvalds1da177e2005-04-16 15:20:36 -07004955/**
4956 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004957 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004958 *
4959 * Returns a suitable unique value for a new device interface
4960 * number. The caller must hold the rtnl semaphore or the
4961 * dev_base_lock to be sure it remains unique.
4962 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07004963static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004964{
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00004965 int ifindex = net->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004966 for (;;) {
4967 if (++ifindex <= 0)
4968 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004969 if (!__dev_get_by_index(net, ifindex))
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00004970 return net->ifindex = ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004971 }
4972}
4973
Linus Torvalds1da177e2005-04-16 15:20:36 -07004974/* Delayed registration/unregistration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08004975static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004976
Stephen Hemminger6f05f622007-03-08 20:46:03 -08004977static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004978{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004979 list_add_tail(&dev->todo_list, &net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004980}
4981
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004982static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004983{
Krishna Kumare93737b2009-12-08 22:26:02 +00004984 struct net_device *dev, *tmp;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004985
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004986 BUG_ON(dev_boot_phase);
4987 ASSERT_RTNL();
4988
Krishna Kumare93737b2009-12-08 22:26:02 +00004989 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004990 /* Some devices call this without ever having been registered,
Krishna Kumare93737b2009-12-08 22:26:02 +00004991 * as part of initialization unwind. Remove those
 4992 * devices and proceed with the remaining ones.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004993 */
4994 if (dev->reg_state == NETREG_UNINITIALIZED) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004995 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4996 dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004997
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004998 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00004999 list_del(&dev->unreg_list);
5000 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005001 }
Eric Dumazet449f4542011-05-19 12:24:16 +00005002 dev->dismantle = true;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005003 BUG_ON(dev->reg_state != NETREG_REGISTERED);
Octavian Purdila44345722010-12-13 12:44:07 +00005004 }
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005005
Octavian Purdila44345722010-12-13 12:44:07 +00005006 /* If device is running, close it first. */
5007 dev_close_many(head);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005008
Octavian Purdila44345722010-12-13 12:44:07 +00005009 list_for_each_entry(dev, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005010 /* And unlink it from device chain. */
5011 unlist_netdevice(dev);
5012
5013 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005014 }
5015
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005016 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005017
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005018 list_for_each_entry(dev, head, unreg_list) {
5019 /* Shutdown queueing discipline. */
5020 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005021
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005022
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005023 /* Notify protocols that we are about to destroy
 5024 this device. They should clean up all of their state.
5025 */
5026 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5027
Patrick McHardya2835762010-02-26 06:34:51 +00005028 if (!dev->rtnl_link_ops ||
5029 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5030 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5031
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005032 /*
5033 * Flush the unicast and multicast chains
5034 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005035 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005036 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005037
5038 if (dev->netdev_ops->ndo_uninit)
5039 dev->netdev_ops->ndo_uninit(dev);
5040
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005041 /* Notifier chain MUST detach us all upper devices. */
5042 WARN_ON(netdev_has_any_upper_dev(dev));
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005043
5044 /* Remove entries from kobject tree */
5045 netdev_unregister_kobject(dev);
Alexander Duyck024e9672013-01-10 08:57:46 +00005046#ifdef CONFIG_XPS
5047 /* Remove XPS queueing entries */
5048 netif_reset_xps_queues_gt(dev, 0);
5049#endif
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005050 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005051
Eric W. Biederman850a5452011-10-13 22:25:23 +00005052 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005053
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005054 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005055 dev_put(dev);
5056}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005057
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005058static void rollback_registered(struct net_device *dev)
5059{
5060 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005061
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005062 list_add(&dev->unreg_list, &single);
5063 rollback_registered_many(&single);
Eric Dumazetceaaec92011-02-17 22:59:19 +00005064 list_del(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005065}
5066
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005067static netdev_features_t netdev_fix_features(struct net_device *dev,
5068 netdev_features_t features)
Herbert Xub63365a2008-10-23 01:11:29 -07005069{
Michał Mirosław57422dc2011-01-22 12:14:12 +00005070 /* Fix illegal checksum combinations */
5071 if ((features & NETIF_F_HW_CSUM) &&
5072 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005073 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
Michał Mirosław57422dc2011-01-22 12:14:12 +00005074 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5075 }
5076
Herbert Xub63365a2008-10-23 01:11:29 -07005077 /* TSO requires that SG is present as well. */
Ben Hutchingsea2d3682011-04-12 14:38:37 +00005078 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005079 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
Ben Hutchingsea2d3682011-04-12 14:38:37 +00005080 features &= ~NETIF_F_ALL_TSO;
Herbert Xub63365a2008-10-23 01:11:29 -07005081 }
5082
Pravin B Shelarec5f0612013-03-07 09:28:01 +00005083 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5084 !(features & NETIF_F_IP_CSUM)) {
5085 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5086 features &= ~NETIF_F_TSO;
5087 features &= ~NETIF_F_TSO_ECN;
5088 }
5089
5090 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5091 !(features & NETIF_F_IPV6_CSUM)) {
5092 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5093 features &= ~NETIF_F_TSO6;
5094 }
5095
Ben Hutchings31d8b9e2011-04-12 14:47:15 +00005096 /* TSO ECN requires that TSO is present as well. */
5097 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5098 features &= ~NETIF_F_TSO_ECN;
5099
Michał Mirosław212b5732011-02-15 16:59:16 +00005100 /* Software GSO depends on SG. */
5101 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005102 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
Michał Mirosław212b5732011-02-15 16:59:16 +00005103 features &= ~NETIF_F_GSO;
5104 }
5105
Michał Mirosławacd11302011-01-24 15:45:15 -08005106 /* UFO needs SG and checksumming */
Herbert Xub63365a2008-10-23 01:11:29 -07005107 if (features & NETIF_F_UFO) {
Michał Mirosław79032642010-11-30 06:38:00 +00005108 /* maybe split UFO into V4 and V6? */
5109 if (!((features & NETIF_F_GEN_CSUM) ||
5110 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5111 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005112 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08005113 "Dropping NETIF_F_UFO since no checksum offload features.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07005114 features &= ~NETIF_F_UFO;
5115 }
5116
5117 if (!(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005118 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08005119 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07005120 features &= ~NETIF_F_UFO;
5121 }
5122 }
5123
5124 return features;
5125}
Herbert Xub63365a2008-10-23 01:11:29 -07005126
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005127int __netdev_update_features(struct net_device *dev)
Michał Mirosław5455c692011-02-15 16:59:17 +00005128{
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005129 netdev_features_t features;
Michał Mirosław5455c692011-02-15 16:59:17 +00005130 int err = 0;
5131
Michał Mirosław87267482011-04-12 09:56:38 +00005132 ASSERT_RTNL();
5133
Michał Mirosław5455c692011-02-15 16:59:17 +00005134 features = netdev_get_wanted_features(dev);
5135
5136 if (dev->netdev_ops->ndo_fix_features)
5137 features = dev->netdev_ops->ndo_fix_features(dev, features);
5138
5139 /* driver might be less strict about feature dependencies */
5140 features = netdev_fix_features(dev, features);
5141
5142 if (dev->features == features)
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005143 return 0;
Michał Mirosław5455c692011-02-15 16:59:17 +00005144
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005145 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5146 &dev->features, &features);
Michał Mirosław5455c692011-02-15 16:59:17 +00005147
5148 if (dev->netdev_ops->ndo_set_features)
5149 err = dev->netdev_ops->ndo_set_features(dev, features);
5150
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005151 if (unlikely(err < 0)) {
Michał Mirosław5455c692011-02-15 16:59:17 +00005152 netdev_err(dev,
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005153 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5154 err, &features, &dev->features);
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005155 return -1;
5156 }
5157
5158 if (!err)
5159 dev->features = features;
5160
5161 return 1;
5162}
5163
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005164/**
5165 * netdev_update_features - recalculate device features
5166 * @dev: the device to check
5167 *
5168 * Recalculate the dev->features set and send notifications if it
5169 * has changed. Should be called whenever driver- or hardware-dependent
5170 * conditions that influence the feature set may have changed.
5171 */
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005172void netdev_update_features(struct net_device *dev)
5173{
5174 if (__netdev_update_features(dev))
5175 netdev_features_change(dev);
Michał Mirosław5455c692011-02-15 16:59:17 +00005176}
5177EXPORT_SYMBOL(netdev_update_features);
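/*
 * Illustrative sketch (not part of this file, names are hypothetical): a
 * driver whose offload constraints depend on the MTU can let the core
 * recompute dev->features after the change; ndo_change_mtu() is already
 * called under RTNL, which netdev_update_features() requires.
 *
 *	static int my_change_mtu(struct net_device *dev, int new_mtu)
 *	{
 *		dev->mtu = new_mtu;
 *		netdev_update_features(dev);
 *		return 0;
 *	}
 */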
5178
Linus Torvalds1da177e2005-04-16 15:20:36 -07005179/**
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005180 * netdev_change_features - recalculate device features
5181 * @dev: the device to check
5182 *
5183 * Recalculate the dev->features set and send notifications even
5184 * if it has not changed. Should be called instead of
5185 * netdev_update_features() if dev->vlan_features might also have
5186 * changed, so that the changes are propagated to stacked VLAN
5187 * devices.
5188 */
5189void netdev_change_features(struct net_device *dev)
5190{
5191 __netdev_update_features(dev);
5192 netdev_features_change(dev);
5193}
5194EXPORT_SYMBOL(netdev_change_features);
5195
5196/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005197 * netif_stacked_transfer_operstate - transfer operstate
5198 * @rootdev: the root or lower level device to transfer state from
5199 * @dev: the device to transfer operstate to
5200 *
5201 * Transfer operational state from root to device. This is normally
5202 * called when a stacking relationship exists between the root
5203 * device and the device (a leaf device).
5204 */
5205void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5206 struct net_device *dev)
5207{
5208 if (rootdev->operstate == IF_OPER_DORMANT)
5209 netif_dormant_on(dev);
5210 else
5211 netif_dormant_off(dev);
5212
5213 if (netif_carrier_ok(rootdev)) {
5214 if (!netif_carrier_ok(dev))
5215 netif_carrier_on(dev);
5216 } else {
5217 if (netif_carrier_ok(dev))
5218 netif_carrier_off(dev);
5219 }
5220}
5221EXPORT_SYMBOL(netif_stacked_transfer_operstate);
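/*
 * Illustrative sketch (not part of this file): a stacking driver usually
 * calls this from its netdevice notifier so the upper device mirrors the
 * lower device's carrier/dormant state; "my_upper_dev" is hypothetical,
 * and in this kernel version the notifier's ptr is the net_device itself.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *lower = ptr;
 *
 *		if (event == NETDEV_CHANGE)
 *			netif_stacked_transfer_operstate(lower, my_upper_dev);
 *		return NOTIFY_DONE;
 *	}
 */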
5222
Tom Herbertbf264142010-11-26 08:36:09 +00005223#ifdef CONFIG_RPS
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005224static int netif_alloc_rx_queues(struct net_device *dev)
5225{
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005226 unsigned int i, count = dev->num_rx_queues;
Tom Herbertbd25fa72010-10-18 18:00:16 +00005227 struct netdev_rx_queue *rx;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005228
Tom Herbertbd25fa72010-10-18 18:00:16 +00005229 BUG_ON(count < 1);
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005230
Tom Herbertbd25fa72010-10-18 18:00:16 +00005231 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005232 if (!rx)
Tom Herbertbd25fa72010-10-18 18:00:16 +00005233 return -ENOMEM;
Joe Perches62b59422013-02-04 16:48:16 +00005234
Tom Herbertbd25fa72010-10-18 18:00:16 +00005235 dev->_rx = rx;
5236
Tom Herbertbd25fa72010-10-18 18:00:16 +00005237 for (i = 0; i < count; i++)
Tom Herbertfe822242010-11-09 10:47:38 +00005238 rx[i].dev = dev;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005239 return 0;
5240}
Tom Herbertbf264142010-11-26 08:36:09 +00005241#endif
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005242
Changli Gaoaa942102010-12-04 02:31:41 +00005243static void netdev_init_one_queue(struct net_device *dev,
5244 struct netdev_queue *queue, void *_unused)
5245{
5246 /* Initialize queue lock */
5247 spin_lock_init(&queue->_xmit_lock);
5248 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5249 queue->xmit_lock_owner = -1;
Changli Gaob236da62010-12-14 03:09:15 +00005250 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
Changli Gaoaa942102010-12-04 02:31:41 +00005251 queue->dev = dev;
Tom Herbert114cf582011-11-28 16:33:09 +00005252#ifdef CONFIG_BQL
5253 dql_init(&queue->dql, HZ);
5254#endif
Changli Gaoaa942102010-12-04 02:31:41 +00005255}
5256
Eric Dumazet60877a32013-06-20 01:15:51 -07005257static void netif_free_tx_queues(struct net_device *dev)
5258{
5259 if (is_vmalloc_addr(dev->_tx))
5260 vfree(dev->_tx);
5261 else
5262 kfree(dev->_tx);
5263}
5264
Tom Herberte6484932010-10-18 18:04:39 +00005265static int netif_alloc_netdev_queues(struct net_device *dev)
5266{
5267 unsigned int count = dev->num_tx_queues;
5268 struct netdev_queue *tx;
Eric Dumazet60877a32013-06-20 01:15:51 -07005269 size_t sz = count * sizeof(*tx);
Tom Herberte6484932010-10-18 18:04:39 +00005270
Eric Dumazet60877a32013-06-20 01:15:51 -07005271 BUG_ON(count < 1 || count > 0xffff);
Tom Herberte6484932010-10-18 18:04:39 +00005272
Eric Dumazet60877a32013-06-20 01:15:51 -07005273 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5274 if (!tx) {
5275 tx = vzalloc(sz);
5276 if (!tx)
5277 return -ENOMEM;
5278 }
Tom Herberte6484932010-10-18 18:04:39 +00005279 dev->_tx = tx;
Tom Herbert1d24eb42010-11-21 13:17:27 +00005280
Tom Herberte6484932010-10-18 18:04:39 +00005281 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5282 spin_lock_init(&dev->tx_global_lock);
Changli Gaoaa942102010-12-04 02:31:41 +00005283
5284 return 0;
Tom Herberte6484932010-10-18 18:04:39 +00005285}
5286
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005287/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005288 * register_netdevice - register a network device
5289 * @dev: device to register
5290 *
5291 * Take a completed network device structure and add it to the kernel
5292 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5293 * chain. 0 is returned on success. A negative errno code is returned
5294 * on a failure to set up the device, or if the name is a duplicate.
5295 *
5296 * Callers must hold the rtnl semaphore. You may want
5297 * register_netdev() instead of this.
5298 *
5299 * BUGS:
5300 * The locking appears insufficient to guarantee two parallel registers
5301 * will not get the same name.
5302 */
5303
5304int register_netdevice(struct net_device *dev)
5305{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005306 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005307 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005308
5309 BUG_ON(dev_boot_phase);
5310 ASSERT_RTNL();
5311
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005312 might_sleep();
5313
Linus Torvalds1da177e2005-04-16 15:20:36 -07005314 /* When net_device's are persistent, this will be fatal. */
5315 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005316 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005317
David S. Millerf1f28aa2008-07-15 00:08:33 -07005318 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07005319 netdev_set_addr_lockdep_class(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005320
Linus Torvalds1da177e2005-04-16 15:20:36 -07005321 dev->iflink = -1;
5322
Gao feng828de4f2012-09-13 20:58:27 +00005323 ret = dev_get_valid_name(net, dev, dev->name);
Peter Pan(潘卫平)0696c3a2011-05-12 15:46:56 +00005324 if (ret < 0)
5325 goto out;
5326
Linus Torvalds1da177e2005-04-16 15:20:36 -07005327 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005328 if (dev->netdev_ops->ndo_init) {
5329 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005330 if (ret) {
5331 if (ret > 0)
5332 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08005333 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005334 }
5335 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005336
Patrick McHardyf6469682013-04-19 02:04:27 +00005337 if (((dev->hw_features | dev->features) &
5338 NETIF_F_HW_VLAN_CTAG_FILTER) &&
Michał Mirosławd2ed2732013-01-29 15:14:16 +00005339 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5340 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5341 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5342 ret = -EINVAL;
5343 goto err_uninit;
5344 }
5345
Pavel Emelyanov9c7dafb2012-08-08 21:52:46 +00005346 ret = -EBUSY;
5347 if (!dev->ifindex)
5348 dev->ifindex = dev_new_index(net);
5349 else if (__dev_get_by_index(net, dev->ifindex))
5350 goto err_uninit;
5351
Linus Torvalds1da177e2005-04-16 15:20:36 -07005352 if (dev->iflink == -1)
5353 dev->iflink = dev->ifindex;
5354
Michał Mirosław5455c692011-02-15 16:59:17 +00005355 /* Transfer changeable features to wanted_features and enable
5356 * software offloads (GSO and GRO).
5357 */
5358 dev->hw_features |= NETIF_F_SOFT_FEATURES;
Michał Mirosław14d12322011-02-22 16:52:28 +00005359 dev->features |= NETIF_F_SOFT_FEATURES;
5360 dev->wanted_features = dev->features & dev->hw_features;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005361
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005362 /* Turn on no cache copy if HW is doing checksum */
Michał Mirosław34324dc2011-11-15 15:29:55 +00005363 if (!(dev->flags & IFF_LOOPBACK)) {
5364 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5365 if (dev->features & NETIF_F_ALL_CSUM) {
5366 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5367 dev->features |= NETIF_F_NOCACHE_COPY;
5368 }
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005369 }
5370
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005371 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
Brandon Philips16c3ea72010-09-15 09:24:24 +00005372 */
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005373 dev->vlan_features |= NETIF_F_HIGHDMA;
Brandon Philips16c3ea72010-09-15 09:24:24 +00005374
Pravin B Shelaree579672013-03-07 09:28:08 +00005375 /* Make NETIF_F_SG inheritable to tunnel devices.
5376 */
5377 dev->hw_enc_features |= NETIF_F_SG;
5378
Simon Horman0d89d202013-05-23 21:02:52 +00005379 /* Make NETIF_F_SG inheritable to MPLS.
5380 */
5381 dev->mpls_features |= NETIF_F_SG;
5382
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005383 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5384 ret = notifier_to_errno(ret);
5385 if (ret)
5386 goto err_uninit;
5387
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005388 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005389 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005390 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005391 dev->reg_state = NETREG_REGISTERED;
5392
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005393 __netdev_update_features(dev);
Michał Mirosław8e9b59b2011-02-22 16:52:28 +00005394
Linus Torvalds1da177e2005-04-16 15:20:36 -07005395 /*
5396 * Default initial state at registry is that the
5397 * device is present.
5398 */
5399
5400 set_bit(__LINK_STATE_PRESENT, &dev->state);
5401
Ben Hutchings8f4cccb2012-08-20 22:16:51 +01005402 linkwatch_init_dev(dev);
5403
Linus Torvalds1da177e2005-04-16 15:20:36 -07005404 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005405 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005406 list_netdevice(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04005407 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005408
Jiri Pirko948b3372013-01-08 01:38:25 +00005409	/* If the device has a permanent device address, the driver should
5410	 * set dev_addr, and addr_assign_type should remain
5411	 * NET_ADDR_PERM (the default value).
5412 */
5413 if (dev->addr_assign_type == NET_ADDR_PERM)
5414 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5415
Linus Torvalds1da177e2005-04-16 15:20:36 -07005416 /* Notify protocols, that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005417 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005418 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005419 if (ret) {
5420 rollback_registered(dev);
5421 dev->reg_state = NETREG_UNREGISTERED;
5422 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005423 /*
5424 * Prevent userspace races by waiting until the network
5425 * device is fully setup before sending notifications.
5426 */
Patrick McHardya2835762010-02-26 06:34:51 +00005427 if (!dev->rtnl_link_ops ||
5428 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5429 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005430
5431out:
5432 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005433
5434err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005435 if (dev->netdev_ops->ndo_uninit)
5436 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005437 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005438}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005439EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005440
5441/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005442 * init_dummy_netdev - init a dummy network device for NAPI
5443 * @dev: device to init
5444 *
5445 * This takes a network device structure and initializes the minimum
5446 * number of fields so it can be used to schedule NAPI polls without
5447 * registering a full-blown interface. This is to be used by drivers
5448 * that need to tie several hardware interfaces to a single NAPI
5449 * poll scheduler due to HW limitations.
5450 */
5451int init_dummy_netdev(struct net_device *dev)
5452{
5453	/* Clear everything. Note we don't initialize spinlocks
5454	 * as they aren't supposed to be taken by any of the
5455	 * NAPI code and this dummy netdev is supposed to be
5456	 * used only for NAPI polls.
5457 */
5458 memset(dev, 0, sizeof(struct net_device));
5459
5460 /* make sure we BUG if trying to hit standard
5461 * register/unregister code path
5462 */
5463 dev->reg_state = NETREG_DUMMY;
5464
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005465 /* NAPI wants this */
5466 INIT_LIST_HEAD(&dev->napi_list);
5467
5468 /* a dummy interface is started by default */
5469 set_bit(__LINK_STATE_PRESENT, &dev->state);
5470 set_bit(__LINK_STATE_START, &dev->state);
5471
Eric Dumazet29b44332010-10-11 10:22:12 +00005472	/* Note: we don't allocate pcpu_refcnt for dummy devices,
5473	 * because users of this 'device' don't need to change
5474	 * its refcount.
5475 */
5476
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005477 return 0;
5478}
5479EXPORT_SYMBOL_GPL(init_dummy_netdev);
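/*
 * Illustrative sketch (not part of this file, "struct my_adapter" and its
 * fields are assumptions): a driver can host extra NAPI contexts on an
 * embedded dummy netdev when several hardware rings share one real netdev.
 *
 *	struct my_adapter {
 *		struct net_device napi_dev;
 *		struct napi_struct rx_napi;
 *	};
 *
 *	static void my_init_napi(struct my_adapter *ad,
 *				 int (*poll)(struct napi_struct *, int))
 *	{
 *		init_dummy_netdev(&ad->napi_dev);
 *		netif_napi_add(&ad->napi_dev, &ad->rx_napi, poll, 64);
 *		napi_enable(&ad->rx_napi);
 *	}
 */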
5480
5481
5482/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005483 * register_netdev - register a network device
5484 * @dev: device to register
5485 *
5486 * Take a completed network device structure and add it to the kernel
5487 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5488 * chain. 0 is returned on success. A negative errno code is returned
5489 * on a failure to set up the device, or if the name is a duplicate.
5490 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005491 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005492 * and expands the device name if you passed a format string to
5493 * alloc_netdev.
5494 */
5495int register_netdev(struct net_device *dev)
5496{
5497 int err;
5498
5499 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005500 err = register_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005501 rtnl_unlock();
5502 return err;
5503}
5504EXPORT_SYMBOL(register_netdev);
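/*
 * Illustrative sketch (not part of this file, "my_priv" and "my_netdev_ops"
 * are hypothetical): the usual probe-time pattern around register_netdev(),
 * including the error path back to free_netdev().
 *
 *	struct net_device *dev;
 *	int err;
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &my_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */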
5505
Eric Dumazet29b44332010-10-11 10:22:12 +00005506int netdev_refcnt_read(const struct net_device *dev)
5507{
5508 int i, refcnt = 0;
5509
5510 for_each_possible_cpu(i)
5511 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5512 return refcnt;
5513}
5514EXPORT_SYMBOL(netdev_refcnt_read);
5515
Ben Hutchings2c530402012-07-10 10:55:09 +00005516/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005517 * netdev_wait_allrefs - wait until all references are gone.
Randy Dunlap3de7a372012-08-18 14:36:44 +00005518 * @dev: target net_device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005519 *
5520 * This is called when unregistering network devices.
5521 *
5522 * Any protocol or device that holds a reference should register
5523 * for netdevice notification, and clean up and put back the
5524 * reference if it receives an UNREGISTER event.
5525 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005526 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005527 */
5528static void netdev_wait_allrefs(struct net_device *dev)
5529{
5530 unsigned long rebroadcast_time, warning_time;
Eric Dumazet29b44332010-10-11 10:22:12 +00005531 int refcnt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005532
Eric Dumazete014deb2009-11-17 05:59:21 +00005533 linkwatch_forget_dev(dev);
5534
Linus Torvalds1da177e2005-04-16 15:20:36 -07005535 rebroadcast_time = warning_time = jiffies;
Eric Dumazet29b44332010-10-11 10:22:12 +00005536 refcnt = netdev_refcnt_read(dev);
5537
5538 while (refcnt != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005539 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005540 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005541
5542 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005543 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005544
Eric Dumazet748e2d92012-08-22 21:50:59 +00005545 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005546 rcu_barrier();
Eric Dumazet748e2d92012-08-22 21:50:59 +00005547 rtnl_lock();
5548
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005549 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005550 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5551 &dev->state)) {
5552 /* We must not have linkwatch events
5553 * pending on unregister. If this
5554 * happens, we simply run the queue
5555 * unscheduled, resulting in a noop
5556 * for this device.
5557 */
5558 linkwatch_run_queue();
5559 }
5560
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005561 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005562
5563 rebroadcast_time = jiffies;
5564 }
5565
5566 msleep(250);
5567
Eric Dumazet29b44332010-10-11 10:22:12 +00005568 refcnt = netdev_refcnt_read(dev);
5569
Linus Torvalds1da177e2005-04-16 15:20:36 -07005570 if (time_after(jiffies, warning_time + 10 * HZ)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005571 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5572 dev->name, refcnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005573 warning_time = jiffies;
5574 }
5575 }
5576}
5577
5578/* The sequence is:
5579 *
5580 * rtnl_lock();
5581 * ...
5582 * register_netdevice(x1);
5583 * register_netdevice(x2);
5584 * ...
5585 * unregister_netdevice(y1);
5586 * unregister_netdevice(y2);
5587 * ...
5588 * rtnl_unlock();
5589 * free_netdev(y1);
5590 * free_netdev(y2);
5591 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07005592 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07005593 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005594 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07005595 * without deadlocking with linkwatch via keventd.
5596 * 2) Since we run with the RTNL semaphore not held, we can sleep
5597 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07005598 *
5599 * We must not return until all unregister events added during
5600 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005601 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005602void netdev_run_todo(void)
5603{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005604 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005605
Linus Torvalds1da177e2005-04-16 15:20:36 -07005606 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005607 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07005608
5609 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005610
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005611
5612 /* Wait for rcu callbacks to finish before next phase */
Eric W. Biederman850a5452011-10-13 22:25:23 +00005613 if (!list_empty(&list))
5614 rcu_barrier();
5615
Linus Torvalds1da177e2005-04-16 15:20:36 -07005616 while (!list_empty(&list)) {
5617 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00005618 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005619 list_del(&dev->todo_list);
5620
Eric Dumazet748e2d92012-08-22 21:50:59 +00005621 rtnl_lock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005622 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric Dumazet748e2d92012-08-22 21:50:59 +00005623 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005624
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005625 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005626 pr_err("network todo '%s' but state %d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07005627 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005628 dump_stack();
5629 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005630 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005631
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005632 dev->reg_state = NETREG_UNREGISTERED;
5633
Changli Gao152102c2010-03-30 20:16:22 +00005634 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07005635
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005636 netdev_wait_allrefs(dev);
5637
5638 /* paranoia */
Eric Dumazet29b44332010-10-11 10:22:12 +00005639 BUG_ON(netdev_refcnt_read(dev));
Eric Dumazet33d480c2011-08-11 19:30:52 +00005640 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5641 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07005642 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005643
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005644 if (dev->destructor)
5645 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07005646
5647 /* Free network device */
5648 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005649 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005650}
5651
Ben Hutchings3cfde792010-07-09 09:11:52 +00005652/* Convert net_device_stats to rtnl_link_stats64. They have the same
5653 * fields in the same order, with only the type differing.
5654 */
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005655void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5656 const struct net_device_stats *netdev_stats)
Ben Hutchings3cfde792010-07-09 09:11:52 +00005657{
5658#if BITS_PER_LONG == 64
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005659 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5660 memcpy(stats64, netdev_stats, sizeof(*stats64));
Ben Hutchings3cfde792010-07-09 09:11:52 +00005661#else
5662 size_t i, n = sizeof(*stats64) / sizeof(u64);
5663 const unsigned long *src = (const unsigned long *)netdev_stats;
5664 u64 *dst = (u64 *)stats64;
5665
5666 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5667 sizeof(*stats64) / sizeof(u64));
5668 for (i = 0; i < n; i++)
5669 dst[i] = src[i];
5670#endif
5671}
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005672EXPORT_SYMBOL(netdev_stats_to_stats64);
Ben Hutchings3cfde792010-07-09 09:11:52 +00005673
Eric Dumazetd83345a2009-11-16 03:36:51 +00005674/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005675 * dev_get_stats - get network device statistics
5676 * @dev: device to get statistics from
Eric Dumazet28172732010-07-07 14:58:56 -07005677 * @storage: place to store stats
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005678 *
Ben Hutchingsd7753512010-07-09 09:12:41 +00005679 * Get network statistics from device. Return @storage.
5680 * The device driver may provide its own method by setting
5681 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5682 * otherwise the internal statistics structure is used.
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005683 */
Ben Hutchingsd7753512010-07-09 09:12:41 +00005684struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5685 struct rtnl_link_stats64 *storage)
Eric Dumazet7004bf22009-05-18 00:34:33 +00005686{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005687 const struct net_device_ops *ops = dev->netdev_ops;
5688
Eric Dumazet28172732010-07-07 14:58:56 -07005689 if (ops->ndo_get_stats64) {
5690 memset(storage, 0, sizeof(*storage));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005691 ops->ndo_get_stats64(dev, storage);
5692 } else if (ops->ndo_get_stats) {
Ben Hutchings3cfde792010-07-09 09:11:52 +00005693 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005694 } else {
5695 netdev_stats_to_stats64(storage, &dev->stats);
Eric Dumazet28172732010-07-07 14:58:56 -07005696 }
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005697 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
Eric Dumazet28172732010-07-07 14:58:56 -07005698 return storage;
Rusty Russellc45d2862007-03-28 14:29:08 -07005699}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005700EXPORT_SYMBOL(dev_get_stats);
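/*
 * Illustrative sketch (not part of this file): callers supply their own
 * rtnl_link_stats64 storage, so reading the stats needs no extra locking
 * on a shared structure.
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	netdev_info(dev, "rx %llu / tx %llu packets\n",
 *		    stats.rx_packets, stats.tx_packets);
 */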
Rusty Russellc45d2862007-03-28 14:29:08 -07005701
Eric Dumazet24824a02010-10-02 06:11:55 +00005702struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
David S. Millerdc2b4842008-07-08 17:18:23 -07005703{
Eric Dumazet24824a02010-10-02 06:11:55 +00005704 struct netdev_queue *queue = dev_ingress_queue(dev);
David S. Millerdc2b4842008-07-08 17:18:23 -07005705
Eric Dumazet24824a02010-10-02 06:11:55 +00005706#ifdef CONFIG_NET_CLS_ACT
5707 if (queue)
5708 return queue;
5709 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5710 if (!queue)
5711 return NULL;
5712 netdev_init_one_queue(dev, queue, NULL);
Eric Dumazet24824a02010-10-02 06:11:55 +00005713 queue->qdisc = &noop_qdisc;
5714 queue->qdisc_sleeping = &noop_qdisc;
5715 rcu_assign_pointer(dev->ingress_queue, queue);
5716#endif
5717 return queue;
David S. Millerbb949fb2008-07-08 16:55:56 -07005718}
5719
Eric Dumazet2c60db02012-09-16 09:17:26 +00005720static const struct ethtool_ops default_ethtool_ops;
5721
Stanislaw Gruszkad07d7502013-01-10 23:19:10 +00005722void netdev_set_default_ethtool_ops(struct net_device *dev,
5723 const struct ethtool_ops *ops)
5724{
5725 if (dev->ethtool_ops == &default_ethtool_ops)
5726 dev->ethtool_ops = ops;
5727}
5728EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
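/*
 * Illustrative sketch (not part of this file, "my_default_ethtool_ops" is
 * hypothetical): a bus or protocol layer can install fallback ethtool ops
 * without clobbering a driver that already provided its own.
 *
 *	static const struct ethtool_ops my_default_ethtool_ops = {
 *		.get_link	= ethtool_op_get_link,
 *	};
 *
 *	netdev_set_default_ethtool_ops(dev, &my_default_ethtool_ops);
 */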
5729
Linus Torvalds1da177e2005-04-16 15:20:36 -07005730/**
Tom Herbert36909ea2011-01-09 19:36:31 +00005731 * alloc_netdev_mqs - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005732 * @sizeof_priv: size of private data to allocate space for
5733 * @name: device name format string
5734 * @setup: callback to initialize device
Tom Herbert36909ea2011-01-09 19:36:31 +00005735 * @txqs: the number of TX subqueues to allocate
5736 * @rxqs: the number of RX subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005737 *
5738 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005739 * and performs basic initialization. Also allocates subqueue structs
Tom Herbert36909ea2011-01-09 19:36:31 +00005740 * for each queue on the device.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005741 */
Tom Herbert36909ea2011-01-09 19:36:31 +00005742struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5743 void (*setup)(struct net_device *),
5744 unsigned int txqs, unsigned int rxqs)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005745{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005746 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07005747 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005748 struct net_device *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005749
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005750 BUG_ON(strlen(name) >= sizeof(dev->name));
5751
Tom Herbert36909ea2011-01-09 19:36:31 +00005752 if (txqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005753 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
Tom Herbert55513fb2010-10-18 17:55:58 +00005754 return NULL;
5755 }
5756
Tom Herbert36909ea2011-01-09 19:36:31 +00005757#ifdef CONFIG_RPS
5758 if (rxqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005759 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
Tom Herbert36909ea2011-01-09 19:36:31 +00005760 return NULL;
5761 }
5762#endif
5763
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005764 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005765 if (sizeof_priv) {
5766 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005767 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005768 alloc_size += sizeof_priv;
5769 }
5770 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005771 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005772
Paolo 'Blaisorblade' Giarrusso31380de2006-04-06 22:38:28 -07005773 p = kzalloc(alloc_size, GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005774 if (!p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005775 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005776
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005777 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005778 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005779
Eric Dumazet29b44332010-10-11 10:22:12 +00005780 dev->pcpu_refcnt = alloc_percpu(int);
5781 if (!dev->pcpu_refcnt)
Tom Herberte6484932010-10-18 18:04:39 +00005782 goto free_p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005783
Linus Torvalds1da177e2005-04-16 15:20:36 -07005784 if (dev_addr_init(dev))
Eric Dumazet29b44332010-10-11 10:22:12 +00005785 goto free_pcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005786
Jiri Pirko22bedad32010-04-01 21:22:57 +00005787 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005788 dev_uc_init(dev);
Jiri Pirkoccffad252009-05-22 23:22:17 +00005789
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005790 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005791
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07005792 dev->gso_max_size = GSO_MAX_SIZE;
Ben Hutchings30b678d2012-07-30 15:57:00 +00005793 dev->gso_max_segs = GSO_MAX_SEGS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005794
Herbert Xud565b0a2008-12-15 23:38:52 -08005795 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005796 INIT_LIST_HEAD(&dev->unreg_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00005797 INIT_LIST_HEAD(&dev->link_watch_list);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005798 INIT_LIST_HEAD(&dev->upper_dev_list);
Eric Dumazet93f154b2009-05-18 22:19:19 -07005799 dev->priv_flags = IFF_XMIT_DST_RELEASE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005800 setup(dev);
David S. Miller8d3bdbd2011-02-08 15:02:50 -08005801
5802 dev->num_tx_queues = txqs;
5803 dev->real_num_tx_queues = txqs;
5804 if (netif_alloc_netdev_queues(dev))
5805 goto free_all;
5806
5807#ifdef CONFIG_RPS
5808 dev->num_rx_queues = rxqs;
5809 dev->real_num_rx_queues = rxqs;
5810 if (netif_alloc_rx_queues(dev))
5811 goto free_all;
5812#endif
5813
Linus Torvalds1da177e2005-04-16 15:20:36 -07005814 strcpy(dev->name, name);
Vlad Dogarucbda10f2011-01-13 23:38:30 +00005815 dev->group = INIT_NETDEV_GROUP;
Eric Dumazet2c60db02012-09-16 09:17:26 +00005816 if (!dev->ethtool_ops)
5817 dev->ethtool_ops = &default_ethtool_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005818 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005819
David S. Miller8d3bdbd2011-02-08 15:02:50 -08005820free_all:
5821 free_netdev(dev);
5822 return NULL;
5823
Eric Dumazet29b44332010-10-11 10:22:12 +00005824free_pcpu:
5825 free_percpu(dev->pcpu_refcnt);
Eric Dumazet60877a32013-06-20 01:15:51 -07005826 netif_free_tx_queues(dev);
Tom Herbertfe822242010-11-09 10:47:38 +00005827#ifdef CONFIG_RPS
5828 kfree(dev->_rx);
5829#endif
5830
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005831free_p:
5832 kfree(p);
5833 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005834}
Tom Herbert36909ea2011-01-09 19:36:31 +00005835EXPORT_SYMBOL(alloc_netdev_mqs);
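/*
 * Illustrative sketch (not part of this file, "my_priv" and the "myeth%d"
 * name template are hypothetical): allocating an Ethernet-style device
 * with 8 TX and 8 RX queues using the stock ether_setup() callback.
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "myeth%d",
 *			       ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */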
Linus Torvalds1da177e2005-04-16 15:20:36 -07005836
5837/**
5838 * free_netdev - free network device
5839 * @dev: device
5840 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005841 * This function does the last stage of destroying an allocated device
5842 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005843 * If this is the last reference then it will be freed.
5844 */
5845void free_netdev(struct net_device *dev)
5846{
Herbert Xud565b0a2008-12-15 23:38:52 -08005847 struct napi_struct *p, *n;
5848
Denis V. Lunevf3005d72008-04-16 02:02:18 -07005849 release_net(dev_net(dev));
5850
Eric Dumazet60877a32013-06-20 01:15:51 -07005851 netif_free_tx_queues(dev);
Tom Herbertfe822242010-11-09 10:47:38 +00005852#ifdef CONFIG_RPS
5853 kfree(dev->_rx);
5854#endif
David S. Millere8a04642008-07-17 00:34:19 -07005855
Eric Dumazet33d480c2011-08-11 19:30:52 +00005856 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
Eric Dumazet24824a02010-10-02 06:11:55 +00005857
Jiri Pirkof001fde2009-05-05 02:48:28 +00005858 /* Flush device addresses */
5859 dev_addr_flush(dev);
5860
Herbert Xud565b0a2008-12-15 23:38:52 -08005861 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5862 netif_napi_del(p);
5863
Eric Dumazet29b44332010-10-11 10:22:12 +00005864 free_percpu(dev->pcpu_refcnt);
5865 dev->pcpu_refcnt = NULL;
5866
Stephen Hemminger3041a062006-05-26 13:25:24 -07005867 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005868 if (dev->reg_state == NETREG_UNINITIALIZED) {
5869 kfree((char *)dev - dev->padded);
5870 return;
5871 }
5872
5873 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5874 dev->reg_state = NETREG_RELEASED;
5875
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07005876 /* will free via device release */
5877 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005878}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005879EXPORT_SYMBOL(free_netdev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005880
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005881/**
5882 * synchronize_net - Synchronize with packet receive processing
5883 *
5884 * Wait for packets currently being received to be done.
5885 * Does not block later packets from starting.
5886 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005887void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005888{
5889 might_sleep();
Eric Dumazetbe3fc412011-05-23 23:07:32 +00005890 if (rtnl_is_locked())
5891 synchronize_rcu_expedited();
5892 else
5893 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005894}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005895EXPORT_SYMBOL(synchronize_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005896
5897/**
Eric Dumazet44a08732009-10-27 07:03:04 +00005898 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07005899 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00005900 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08005901 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07005902 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005903 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00005904 * If head is not NULL, the device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005905 *
5906 * Callers must hold the rtnl semaphore. You may want
5907 * unregister_netdev() instead of this.
5908 */
5909
Eric Dumazet44a08732009-10-27 07:03:04 +00005910void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005911{
Herbert Xua6620712007-12-12 19:21:56 -08005912 ASSERT_RTNL();
5913
Eric Dumazet44a08732009-10-27 07:03:04 +00005914 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005915 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00005916 } else {
5917 rollback_registered(dev);
5918 /* Finish processing unregister after unlock */
5919 net_set_todo(dev);
5920 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005921}
Eric Dumazet44a08732009-10-27 07:03:04 +00005922EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005923
5924/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005925 * unregister_netdevice_many - unregister many devices
5926 * @head: list of devices
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005927 */
5928void unregister_netdevice_many(struct list_head *head)
5929{
5930 struct net_device *dev;
5931
5932 if (!list_empty(head)) {
5933 rollback_registered_many(head);
5934 list_for_each_entry(dev, head, unreg_list)
5935 net_set_todo(dev);
5936 }
5937}
Eric Dumazet63c80992009-10-27 07:06:49 +00005938EXPORT_SYMBOL(unregister_netdevice_many);
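/*
 * Illustrative sketch (not part of this file, "struct my_entry",
 * "my_dev_list" and "my_node" are hypothetical): batching teardown so the
 * RCU/notifier synchronization in rollback_registered_many() is paid once
 * for the whole group instead of per device.
 *
 *	LIST_HEAD(kill_list);
 *	struct my_entry *e;
 *
 *	rtnl_lock();
 *	list_for_each_entry(e, &my_dev_list, my_node)
 *		unregister_netdevice_queue(e->dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */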
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005939
5940/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005941 * unregister_netdev - remove device from the kernel
5942 * @dev: device
5943 *
5944 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005945 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005946 *
5947 * This is just a wrapper for unregister_netdevice that takes
5948 * the rtnl semaphore. In general you want to use this and not
5949 * unregister_netdevice.
5950 */
5951void unregister_netdev(struct net_device *dev)
5952{
5953 rtnl_lock();
5954 unregister_netdevice(dev);
5955 rtnl_unlock();
5956}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005957EXPORT_SYMBOL(unregister_netdev);
5958
Eric W. Biedermance286d32007-09-12 13:53:49 +02005959/**
5960 * dev_change_net_namespace - move device to a different network namespace
5961 * @dev: device
5962 * @net: network namespace
5963 * @pat: If not NULL name pattern to try if the current device name
5964 * is already taken in the destination network namespace.
5965 *
5966 * This function shuts down a device interface and moves it
5967 * to a new network namespace. On success 0 is returned, on
5968 * a failure a negative errno code is returned.
5969 *
5970 * Callers must hold the rtnl semaphore.
5971 */
5972
5973int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5974{
Eric W. Biedermance286d32007-09-12 13:53:49 +02005975 int err;
5976
5977 ASSERT_RTNL();
5978
5979 /* Don't allow namespace local devices to be moved. */
5980 err = -EINVAL;
5981 if (dev->features & NETIF_F_NETNS_LOCAL)
5982 goto out;
5983
5984	/* Ensure the device has been registered */
Eric W. Biedermance286d32007-09-12 13:53:49 +02005985 if (dev->reg_state != NETREG_REGISTERED)
5986 goto out;
5987
5988	/* Get out if there is nothing to do */
5989 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09005990 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005991 goto out;
5992
5993 /* Pick the destination device name, and ensure
5994 * we can use it in the destination network namespace.
5995 */
5996 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00005997 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005998 /* We get here if we can't use the current device name */
5999 if (!pat)
6000 goto out;
Gao feng828de4f2012-09-13 20:58:27 +00006001 if (dev_get_valid_name(net, dev, pat) < 0)
Eric W. Biedermance286d32007-09-12 13:53:49 +02006002 goto out;
6003 }
6004
6005 /*
6006	 * And now a mini version of register_netdevice and unregister_netdevice.
6007 */
6008
6009 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07006010 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006011
6012 /* And unlink it from device chain */
6013 err = -ENODEV;
6014 unlist_netdevice(dev);
6015
6016 synchronize_net();
6017
6018 /* Shutdown queueing discipline. */
6019 dev_shutdown(dev);
6020
6021	/* Notify protocols that we are about to destroy
6022	   this device. They should clean up all their state.
David Lamparter3b27e102010-09-17 03:22:19 +00006023
6024 Note that dev->reg_state stays at NETREG_REGISTERED.
6025 This is wanted because this way 8021q and macvlan know
6026 the device is just moving and can keep their slaves up.
Eric W. Biedermance286d32007-09-12 13:53:49 +02006027 */
6028 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Gao feng6549dd42012-08-23 15:36:55 +00006029 rcu_barrier();
6030 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric W. Biedermand2237d32011-10-21 06:24:20 +00006031 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006032
6033 /*
6034 * Flush the unicast and multicast chains
6035 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00006036 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00006037 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006038
Serge Hallyn4e66ae22012-12-03 16:17:12 +00006039 /* Send a netdev-removed uevent to the old namespace */
6040 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6041
Eric W. Biedermance286d32007-09-12 13:53:49 +02006042 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09006043 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006044
Eric W. Biedermance286d32007-09-12 13:53:49 +02006045 /* If there is an ifindex conflict assign a new one */
6046 if (__dev_get_by_index(net, dev->ifindex)) {
6047 int iflink = (dev->iflink == dev->ifindex);
6048 dev->ifindex = dev_new_index(net);
6049 if (iflink)
6050 dev->iflink = dev->ifindex;
6051 }
6052
Serge Hallyn4e66ae22012-12-03 16:17:12 +00006053 /* Send a netdev-add uevent to the new namespace */
6054 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6055
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006056 /* Fixup kobjects */
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07006057 err = device_rename(&dev->dev, dev->name);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006058 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006059
6060 /* Add the device back in the hashes */
6061 list_netdevice(dev);
6062
6063 /* Notify protocols, that a new device appeared. */
6064 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6065
Eric W. Biedermand90a9092009-12-12 22:11:15 +00006066 /*
6067 * Prevent userspace races by waiting until the network
6068 * device is fully setup before sending notifications.
6069 */
6070 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6071
Eric W. Biedermance286d32007-09-12 13:53:49 +02006072 synchronize_net();
6073 err = 0;
6074out:
6075 return err;
6076}
Johannes Berg463d0182009-07-14 00:33:35 +02006077EXPORT_SYMBOL_GPL(dev_change_net_namespace);
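/*
 * Illustrative sketch (not part of this file, "target_net" is assumed to
 * be a struct net the caller holds a reference on): moving a device under
 * RTNL, with a "dev%d" pattern as fallback if the current name is already
 * taken in the destination namespace.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "dev%d");
 *	rtnl_unlock();
 */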
Eric W. Biedermance286d32007-09-12 13:53:49 +02006078
Linus Torvalds1da177e2005-04-16 15:20:36 -07006079static int dev_cpu_callback(struct notifier_block *nfb,
6080 unsigned long action,
6081 void *ocpu)
6082{
6083 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006084 struct sk_buff *skb;
6085 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6086 struct softnet_data *sd, *oldsd;
6087
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006088 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006089 return NOTIFY_OK;
6090
6091 local_irq_disable();
6092 cpu = smp_processor_id();
6093 sd = &per_cpu(softnet_data, cpu);
6094 oldsd = &per_cpu(softnet_data, oldcpu);
6095
6096 /* Find end of our completion_queue. */
6097 list_skb = &sd->completion_queue;
6098 while (*list_skb)
6099 list_skb = &(*list_skb)->next;
6100 /* Append completion queue from offline CPU. */
6101 *list_skb = oldsd->completion_queue;
6102 oldsd->completion_queue = NULL;
6103
Linus Torvalds1da177e2005-04-16 15:20:36 -07006104 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00006105 if (oldsd->output_queue) {
6106 *sd->output_queue_tailp = oldsd->output_queue;
6107 sd->output_queue_tailp = oldsd->output_queue_tailp;
6108 oldsd->output_queue = NULL;
6109 oldsd->output_queue_tailp = &oldsd->output_queue;
6110 }
Heiko Carstens264524d2011-06-06 20:50:03 +00006111 /* Append NAPI poll list from offline CPU. */
6112 if (!list_empty(&oldsd->poll_list)) {
6113 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6114 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6115 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006116
6117 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6118 local_irq_enable();
6119
6120 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00006121 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6122 netif_rx(skb);
6123 input_queue_head_incr(oldsd);
6124 }
Tom Herbertfec5e652010-04-16 16:01:27 -07006125 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006126 netif_rx(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00006127 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07006128 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006129
6130 return NOTIFY_OK;
6131}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006132
6133
Herbert Xu7f353bf2007-08-10 15:47:58 -07006134/**
Herbert Xub63365a2008-10-23 01:11:29 -07006135 * netdev_increment_features - increment feature set by one
6136 * @all: current feature set
6137 * @one: new feature set
6138 * @mask: mask feature set
Herbert Xu7f353bf2007-08-10 15:47:58 -07006139 *
6140 * Computes a new feature set after adding a device with feature set
Herbert Xub63365a2008-10-23 01:11:29 -07006141 * @one to the master device with current feature set @all. Will not
6142 * enable anything that is off in @mask. Returns the new feature set.
Herbert Xu7f353bf2007-08-10 15:47:58 -07006143 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00006144netdev_features_t netdev_increment_features(netdev_features_t all,
6145 netdev_features_t one, netdev_features_t mask)
Herbert Xu7f353bf2007-08-10 15:47:58 -07006146{
Michał Mirosław1742f182011-04-22 06:31:16 +00006147 if (mask & NETIF_F_GEN_CSUM)
6148 mask |= NETIF_F_ALL_CSUM;
6149 mask |= NETIF_F_VLAN_CHALLENGED;
6150
6151 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6152 all &= one | ~NETIF_F_ALL_FOR_ALL;
6153
Michał Mirosław1742f182011-04-22 06:31:16 +00006154 /* If one device supports hw checksumming, set for all. */
6155 if (all & NETIF_F_GEN_CSUM)
6156 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
Herbert Xu7f353bf2007-08-10 15:47:58 -07006157
6158 return all;
6159}
Herbert Xub63365a2008-10-23 01:11:29 -07006160EXPORT_SYMBOL(netdev_increment_features);
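/*
 * Illustrative sketch (not part of this file): roughly how a bonding-style
 * master could combine slave feature sets with this helper; "master", the
 * slave list, "struct my_slave" and "my_node" are hypothetical, and the
 * mask doubles as the starting value so features only get restricted.
 *
 *	netdev_features_t mask = NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_ALL_TSO;
 *	netdev_features_t features = mask;
 *	struct my_slave *s;
 *
 *	list_for_each_entry(s, &master->slave_list, my_node)
 *		features = netdev_increment_features(features,
 *						     s->dev->features, mask);
 */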
Herbert Xu7f353bf2007-08-10 15:47:58 -07006161
Baruch Siach430f03c2013-06-02 20:43:55 +00006162static struct hlist_head * __net_init netdev_create_hash(void)
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006163{
6164 int i;
6165 struct hlist_head *hash;
6166
6167 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6168 if (hash != NULL)
6169 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6170 INIT_HLIST_HEAD(&hash[i]);
6171
6172 return hash;
6173}
6174
Eric W. Biederman881d9662007-09-17 11:56:21 -07006175/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07006176static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006177{
Rustad, Mark D734b6542012-07-18 09:06:07 +00006178 if (net != &init_net)
6179 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07006180
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006181 net->dev_name_head = netdev_create_hash();
6182 if (net->dev_name_head == NULL)
6183 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006184
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006185 net->dev_index_head = netdev_create_hash();
6186 if (net->dev_index_head == NULL)
6187 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006188
6189 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006190
6191err_idx:
6192 kfree(net->dev_name_head);
6193err_name:
6194 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006195}
6196
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006197/**
6198 * netdev_drivername - network driver for the device
6199 * @dev: network device
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006200 *
6201 * Determine network driver for device.
6202 */
David S. Miller3019de12011-06-06 16:41:33 -07006203const char *netdev_drivername(const struct net_device *dev)
Arjan van de Ven6579e572008-07-21 13:31:48 -07006204{
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07006205 const struct device_driver *driver;
6206 const struct device *parent;
David S. Miller3019de12011-06-06 16:41:33 -07006207 const char *empty = "";
Arjan van de Ven6579e572008-07-21 13:31:48 -07006208
6209 parent = dev->dev.parent;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006210 if (!parent)
David S. Miller3019de12011-06-06 16:41:33 -07006211 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006212
6213 driver = parent->driver;
6214 if (driver && driver->name)
David S. Miller3019de12011-06-06 16:41:33 -07006215 return driver->name;
6216 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006217}
6218
Joe Perchesb004ff42012-09-12 20:12:19 -07006219static int __netdev_printk(const char *level, const struct net_device *dev,
Joe Perches256df2f2010-06-27 01:02:35 +00006220 struct va_format *vaf)
6221{
6222 int r;
6223
Joe Perchesb004ff42012-09-12 20:12:19 -07006224 if (dev && dev->dev.parent) {
Joe Perches666f3552012-09-12 20:14:11 -07006225 r = dev_printk_emit(level[1] - '0',
6226 dev->dev.parent,
6227 "%s %s %s: %pV",
6228 dev_driver_string(dev->dev.parent),
6229 dev_name(dev->dev.parent),
6230 netdev_name(dev), vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006231 } else if (dev) {
Joe Perches256df2f2010-06-27 01:02:35 +00006232 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006233 } else {
Joe Perches256df2f2010-06-27 01:02:35 +00006234 r = printk("%s(NULL net_device): %pV", level, vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006235 }
Joe Perches256df2f2010-06-27 01:02:35 +00006236
6237 return r;
6238}
6239
6240int netdev_printk(const char *level, const struct net_device *dev,
6241 const char *format, ...)
6242{
6243 struct va_format vaf;
6244 va_list args;
6245 int r;
6246
6247 va_start(args, format);
6248
6249 vaf.fmt = format;
6250 vaf.va = &args;
6251
6252 r = __netdev_printk(level, dev, &vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006253
Joe Perches256df2f2010-06-27 01:02:35 +00006254 va_end(args);
6255
6256 return r;
6257}
6258EXPORT_SYMBOL(netdev_printk);
6259
6260#define define_netdev_printk_level(func, level) \
6261int func(const struct net_device *dev, const char *fmt, ...) \
6262{ \
6263 int r; \
6264 struct va_format vaf; \
6265 va_list args; \
6266 \
6267 va_start(args, fmt); \
6268 \
6269 vaf.fmt = fmt; \
6270 vaf.va = &args; \
6271 \
6272 r = __netdev_printk(level, dev, &vaf); \
Joe Perchesb004ff42012-09-12 20:12:19 -07006273 \
Joe Perches256df2f2010-06-27 01:02:35 +00006274 va_end(args); \
6275 \
6276 return r; \
6277} \
6278EXPORT_SYMBOL(func);
6279
6280define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6281define_netdev_printk_level(netdev_alert, KERN_ALERT);
6282define_netdev_printk_level(netdev_crit, KERN_CRIT);
6283define_netdev_printk_level(netdev_err, KERN_ERR);
6284define_netdev_printk_level(netdev_warn, KERN_WARNING);
6285define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6286define_netdev_printk_level(netdev_info, KERN_INFO);
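/*
 * Illustrative sketch (not part of this file, "speed" and "txq" are
 * hypothetical locals): the level-specific helpers generated above are
 * used like dev_info() and friends, but prefix the message with the
 * driver and interface name.
 *
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 *	netdev_warn(dev, "TX timeout on queue %u\n", txq);
 */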
6287
Pavel Emelyanov46650792007-10-08 20:38:39 -07006288static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006289{
6290 kfree(net->dev_name_head);
6291 kfree(net->dev_index_head);
6292}
6293
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006294static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07006295 .init = netdev_init,
6296 .exit = netdev_exit,
6297};
6298
Pavel Emelyanov46650792007-10-08 20:38:39 -07006299static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02006300{
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006301 struct net_device *dev, *aux;
Eric W. Biedermance286d32007-09-12 13:53:49 +02006302 /*
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006303 * Push all migratable network devices back to the
Eric W. Biedermance286d32007-09-12 13:53:49 +02006304 * initial network namespace
6305 */
6306 rtnl_lock();
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006307 for_each_netdev_safe(net, dev, aux) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006308 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006309 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02006310
6311 /* Ignore unmoveable devices (i.e. loopback) */
6312 if (dev->features & NETIF_F_NETNS_LOCAL)
6313 continue;
6314
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006315 /* Leave virtual devices for the generic cleanup */
6316 if (dev->rtnl_link_ops)
6317 continue;
Eric W. Biedermand0c082c2008-11-05 15:59:38 -08006318
Lucas De Marchi25985ed2011-03-30 22:57:33 -03006319 /* Push remaining network devices to init_net */
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006320 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6321 err = dev_change_net_namespace(dev, &init_net, fb_name);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006322 if (err) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00006323 pr_emerg("%s: failed to move %s to init_net: %d\n",
6324 __func__, dev->name, err);
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006325 BUG();
Eric W. Biedermance286d32007-09-12 13:53:49 +02006326 }
6327 }
6328 rtnl_unlock();
6329}
6330
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006331static void __net_exit default_device_exit_batch(struct list_head *net_list)
6332{
        /* At exit all network devices must be removed from a network
         * namespace.  Do this in the reverse order of registration.
         * Do this across as many network namespaces as possible to
         * improve batching efficiency.
         */
        struct net_device *dev;
        struct net *net;
        LIST_HEAD(dev_kill_list);

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                for_each_netdev_reverse(net, dev) {
                        if (dev->rtnl_link_ops)
                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
                        else
                                unregister_netdevice_queue(dev, &dev_kill_list);
                }
        }
        unregister_netdevice_many(&dev_kill_list);
        list_del(&dev_kill_list);
        rtnl_unlock();
}

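/*
 * Registered from net_dev_init() below via register_pernet_device().
 * The .exit hook runs once for each network namespace being dismantled,
 * while .exit_batch runs once for a whole batch of dying namespaces so
 * that device unregistration can be coalesced under a single
 * rtnl_lock()/rtnl_unlock() cycle.
 */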
static struct pernet_operations __net_initdata default_device_ops = {
        .exit = default_device_exit,
        .exit_batch = default_device_exit_batch,
};

/*
 * Initialize the DEV module. At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 */

/*
 * This is called single threaded during boot, so no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
        int i, rc = -ENOMEM;

        BUG_ON(!dev_boot_phase);

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        INIT_LIST_HEAD(&ptype_all);
        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        INIT_LIST_HEAD(&offload_base);

        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         * Initialise the packet receive queues.
         */

        for_each_possible_cpu(i) {
                struct softnet_data *sd = &per_cpu(softnet_data, i);

                memset(sd, 0, sizeof(*sd));
                skb_queue_head_init(&sd->input_pkt_queue);
                skb_queue_head_init(&sd->process_queue);
                sd->completion_queue = NULL;
                INIT_LIST_HEAD(&sd->poll_list);
                sd->output_queue = NULL;
                sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
                sd->csd.func = rps_trigger_softirq;
                sd->csd.info = sd;
                sd->csd.flags = 0;
                sd->cpu = i;
#endif

                sd->backlog.poll = process_backlog;
                sd->backlog.weight = weight_p;
                sd->backlog.gro_list = NULL;
                sd->backlog.gro_count = 0;

#ifdef CONFIG_NET_FLOW_LIMIT
                sd->flow_limit = NULL;
#endif
        }

        dev_boot_phase = 0;

        /* The loopback device is special: if any other network device
         * is present in a network namespace, the loopback device must
         * be present too.  Since we now dynamically allocate and free
         * the loopback device, ensure this invariant is maintained by
         * keeping the loopback device the first device on the list of
         * network devices, so that it is the first device to appear
         * and the last network device to disappear.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        hotcpu_notifier(dev_cpu_callback, 0);
        dst_init();
        rc = 0;
out:
        return rc;
}

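/*
 * net_dev_init() is run at the subsys initcall level: after core,
 * postcore and arch initcalls, but before fs, device and late
 * initcalls, so the networking core is initialised before device
 * drivers start registering their net_devices.
 */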
subsys_initcall(net_dev_init);