/*
 * 	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 * 		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot
 *	guarantee that all CPUs that are in the middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
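
/*
 * Illustrative sketch only (not part of this file): a protocol module
 * typically registers a handler with a statically allocated
 * &struct packet_type.  The names my_proto_rcv/my_packet_type and the use
 * of ETH_P_ALL below are placeholder assumptions, not an existing user.
 *
 *	static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
 *				struct packet_type *pt,
 *				struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_packet_type __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ALL),
 *		.func	= my_proto_rcv,
 *	};
 *
 *	dev_add_pack(&my_packet_type);		on module init
 *	dev_remove_pack(&my_packet_type);	on module exit
 */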

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot
 *	guarantee that all CPUs that are in the middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;

	spin_lock(&offload_lock);
	list_add_rcu(&po->list, head);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(__dev_remove_offload);

/**
 *	dev_remove_offload - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
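
/*
 * Illustrative sketch only: GRO/GSO protocol code registers a
 * &struct packet_offload keyed by ethertype (the IPv4 ETH_P_IP offload is
 * one in-tree user).  The callback names my_gro_receive/my_gro_complete
 * below are hypothetical and the callback set is abbreviated.
 *
 *	static struct packet_offload my_packet_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.callbacks = {
 *			.gro_receive  = my_gro_receive,
 *			.gro_complete = my_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&my_packet_offload);
 *	dev_remove_offload(&my_packet_offload);
 */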

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds a new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings are found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
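
/*
 * Example (illustrative values only): booting with
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * parses irq=9, base_addr=0x300, mem_start=0, mem_end=0 and stores them
 * under the name "eth0" via netdev_boot_setup_add() above; a driver that
 * later probes eth0 picks the values up through netdev_boot_setup_check().
 */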

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
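
/*
 * Illustrative usage of the name-lookup variants above ("eth0" is only a
 * placeholder name):
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");	no reference is taken; the
 *	...					pointer is only valid inside
 *	rcu_read_unlock();			the RCU read-side section
 *
 *	dev = dev_get_by_name(net, "eth0");	takes a reference
 *	if (dev) {
 *		...
 *		dev_put(dev);			caller must release it
 *	}
 */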

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
					unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700893 */
David S. Miller95f050b2012-03-06 16:12:15 -0500894bool dev_valid_name(const char *name)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700895{
David S. Millerc7fa9d12006-08-15 16:34:13 -0700896 if (*name == '\0')
David S. Miller95f050b2012-03-06 16:12:15 -0500897 return false;
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -0700898 if (strlen(name) >= IFNAMSIZ)
David S. Miller95f050b2012-03-06 16:12:15 -0500899 return false;
David S. Millerc7fa9d12006-08-15 16:34:13 -0700900 if (!strcmp(name, ".") || !strcmp(name, ".."))
David S. Miller95f050b2012-03-06 16:12:15 -0500901 return false;
David S. Millerc7fa9d12006-08-15 16:34:13 -0700902
903 while (*name) {
904 if (*name == '/' || isspace(*name))
David S. Miller95f050b2012-03-06 16:12:15 -0500905 return false;
David S. Millerc7fa9d12006-08-15 16:34:13 -0700906 name++;
907 }
David S. Miller95f050b2012-03-06 16:12:15 -0500908 return true;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700909}
Eric Dumazetd1b19df2009-09-03 01:29:39 -0700910EXPORT_SYMBOL(dev_valid_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700911
912/**
Eric W. Biedermanb267b172007-09-12 13:48:45 +0200913 * __dev_alloc_name - allocate a name for a device
914 * @net: network namespace to allocate the device name in
Linus Torvalds1da177e2005-04-16 15:20:36 -0700915 * @name: name format string
Eric W. Biedermanb267b172007-09-12 13:48:45 +0200916 * @buf: scratch buffer and result name string
Linus Torvalds1da177e2005-04-16 15:20:36 -0700917 *
918 * Passed a format string - eg "lt%d" it will try and find a suitable
Stephen Hemminger3041a062006-05-26 13:25:24 -0700919 * id. It scans list of devices to build up a free map, then chooses
920 * the first empty slot. The caller must hold the dev_base or rtnl lock
921 * while allocating the name and adding the device in order to avoid
922 * duplicates.
923 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
924 * Returns the number of the unit assigned or a negative errno code.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700925 */
926
Eric W. Biedermanb267b172007-09-12 13:48:45 +0200927static int __dev_alloc_name(struct net *net, const char *name, char *buf)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700928{
929 int i = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700930 const char *p;
931 const int max_netdevices = 8*PAGE_SIZE;
Stephen Hemmingercfcabdc2007-10-09 01:59:42 -0700932 unsigned long *inuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700933 struct net_device *d;
934
935 p = strnchr(name, IFNAMSIZ-1, '%');
936 if (p) {
937 /*
938 * Verify the string as this thing may have come from
939 * the user. There must be either one "%d" and no other "%"
940 * characters.
941 */
942 if (p[1] != 'd' || strchr(p + 2, '%'))
943 return -EINVAL;
944
945 /* Use one page as a bit array of possible slots */
Stephen Hemmingercfcabdc2007-10-09 01:59:42 -0700946 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700947 if (!inuse)
948 return -ENOMEM;
949
Eric W. Biederman881d9662007-09-17 11:56:21 -0700950 for_each_netdev(net, d) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700951 if (!sscanf(d->name, name, &i))
952 continue;
953 if (i < 0 || i >= max_netdevices)
954 continue;
955
956 /* avoid cases where sscanf is not exact inverse of printf */
Eric W. Biedermanb267b172007-09-12 13:48:45 +0200957 snprintf(buf, IFNAMSIZ, name, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700958 if (!strncmp(buf, d->name, IFNAMSIZ))
959 set_bit(i, inuse);
960 }
961
962 i = find_first_zero_bit(inuse, max_netdevices);
963 free_page((unsigned long) inuse);
964 }
965
Octavian Purdilad9031022009-11-18 02:36:59 +0000966 if (buf != name)
967 snprintf(buf, IFNAMSIZ, name, i);
Eric W. Biedermanb267b172007-09-12 13:48:45 +0200968 if (!__dev_get_by_name(net, buf))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700969 return i;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700970
971 /* It is possible to run out of possible slots
972 * when the name is long and there isn't enough space left
973 * for the digits, or if all bits are used.
974 */
975 return -ENFILE;
976}
977
Eric W. Biedermanb267b172007-09-12 13:48:45 +0200978/**
979 * dev_alloc_name - allocate a name for a device
980 * @dev: device
981 * @name: name format string
982 *
983 * Passed a format string - eg "lt%d" it will try and find a suitable
984 * id. It scans list of devices to build up a free map, then chooses
985 * the first empty slot. The caller must hold the dev_base or rtnl lock
986 * while allocating the name and adding the device in order to avoid
987 * duplicates.
988 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
989 * Returns the number of the unit assigned or a negative errno code.
990 */
991
992int dev_alloc_name(struct net_device *dev, const char *name)
993{
994 char buf[IFNAMSIZ];
995 struct net *net;
996 int ret;
997
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +0900998 BUG_ON(!dev_net(dev));
999 net = dev_net(dev);
Eric W. Biedermanb267b172007-09-12 13:48:45 +02001000 ret = __dev_alloc_name(net, name, buf);
1001 if (ret >= 0)
1002 strlcpy(dev->name, buf, IFNAMSIZ);
1003 return ret;
1004}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001005EXPORT_SYMBOL(dev_alloc_name);
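
/*
 * Example (illustrative): a driver registering devices named "foo%d"
 * could call
 *
 *	err = dev_alloc_name(dev, "foo%d");
 *
 * before registration; with foo0 and foo1 already present this would
 * typically write "foo2" into dev->name and return 2.  The "foo" prefix
 * is a hypothetical placeholder.
 */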

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_rx_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_rx_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		net_dmaengine_get();
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev: device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001268
Octavian Purdila44345722010-12-13 12:44:07 +00001269static int __dev_close_many(struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001270{
Octavian Purdila44345722010-12-13 12:44:07 +00001271 struct net_device *dev;
Patrick McHardybd380812010-02-26 06:34:53 +00001272
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001273 ASSERT_RTNL();
David S. Miller9d5010d2007-09-12 14:33:25 +02001274 might_sleep();
1275
Octavian Purdila44345722010-12-13 12:44:07 +00001276 list_for_each_entry(dev, head, unreg_list) {
Octavian Purdila44345722010-12-13 12:44:07 +00001277 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001278
Octavian Purdila44345722010-12-13 12:44:07 +00001279 clear_bit(__LINK_STATE_START, &dev->state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001280
Octavian Purdila44345722010-12-13 12:44:07 +00001281		/* Synchronize to scheduled poll. We cannot touch the poll list; it
1282		 * may even be on a different cpu. So just clear netif_running().
1283		 *
1284		 * dev->stop() will invoke napi_disable() on all of its
1285		 * napi_struct instances on this device.
1286 */
1287 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1288 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001289
Octavian Purdila44345722010-12-13 12:44:07 +00001290 dev_deactivate_many(head);
1291
1292 list_for_each_entry(dev, head, unreg_list) {
1293 const struct net_device_ops *ops = dev->netdev_ops;
1294
1295 /*
1296		 * Call the device-specific close. This cannot fail and is
1297		 * only done if the device is UP.
1298 *
1299 * We allow it to be called even after a DETACH hot-plug
1300 * event.
1301 */
1302 if (ops->ndo_stop)
1303 ops->ndo_stop(dev);
1304
Octavian Purdila44345722010-12-13 12:44:07 +00001305 dev->flags &= ~IFF_UP;
Octavian Purdila44345722010-12-13 12:44:07 +00001306 net_dmaengine_put();
1307 }
1308
1309 return 0;
1310}
1311
1312static int __dev_close(struct net_device *dev)
1313{
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001314 int retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001315 LIST_HEAD(single);
1316
Neil Hormanca99ca12013-02-05 08:05:43 +00001317 /* Temporarily disable netpoll until the interface is down */
dingtianhongda6e3782013-05-27 19:53:31 +00001318 netpoll_rx_disable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001319
Octavian Purdila44345722010-12-13 12:44:07 +00001320 list_add(&dev->unreg_list, &single);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001321 retval = __dev_close_many(&single);
1322 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001323
1324 netpoll_rx_enable(dev);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001325 return retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001326}
1327
Eric Dumazet3fbd8752011-01-19 21:23:22 +00001328static int dev_close_many(struct list_head *head)
Octavian Purdila44345722010-12-13 12:44:07 +00001329{
1330 struct net_device *dev, *tmp;
1331 LIST_HEAD(tmp_list);
1332
1333 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1334 if (!(dev->flags & IFF_UP))
1335 list_move(&dev->unreg_list, &tmp_list);
1336
1337 __dev_close_many(head);
Matti Linnanvuorid8b2a4d2008-02-12 23:10:11 -08001338
Octavian Purdila44345722010-12-13 12:44:07 +00001339 list_for_each_entry(dev, head, unreg_list) {
1340 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1341 call_netdevice_notifiers(NETDEV_DOWN, dev);
1342 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001343
Octavian Purdila44345722010-12-13 12:44:07 +00001344 /* rollback_registered_many needs the complete original list */
1345 list_splice(&tmp_list, head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001346 return 0;
1347}
Patrick McHardybd380812010-02-26 06:34:53 +00001348
1349/**
1350 * dev_close - shutdown an interface.
1351 * @dev: device to shutdown
1352 *
1353 * This function moves an active device into down state. A
1354 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1355 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1356 * chain.
1357 */
1358int dev_close(struct net_device *dev)
1359{
Eric Dumazete14a5992011-05-10 12:26:06 -07001360 if (dev->flags & IFF_UP) {
1361 LIST_HEAD(single);
Patrick McHardybd380812010-02-26 06:34:53 +00001362
Neil Hormanca99ca12013-02-05 08:05:43 +00001363 /* Block netpoll rx while the interface is going down */
dingtianhongda6e3782013-05-27 19:53:31 +00001364 netpoll_rx_disable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001365
Eric Dumazete14a5992011-05-10 12:26:06 -07001366 list_add(&dev->unreg_list, &single);
1367 dev_close_many(&single);
1368 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001369
1370 netpoll_rx_enable(dev);
Eric Dumazete14a5992011-05-10 12:26:06 -07001371 }
dingtianhongda6e3782013-05-27 19:53:31 +00001372 return 0;
Patrick McHardybd380812010-02-26 06:34:53 +00001373}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001374EXPORT_SYMBOL(dev_close);
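/*
 * Illustrative sketch (not part of this file): taking an interface up
 * and back down from another kernel module.  Both dev_open() and
 * dev_close() expect the caller to hold RTNL; the device lookup and
 * error handling below are kept minimal for the example.
 */
static int example_cycle_interface(struct net *net, const char *name)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, name);
	if (dev) {
		err = dev_open(dev);	/* no-op if already IFF_UP */
		if (!err)
			dev_close(dev);	/* always returns 0 */
	}
	rtnl_unlock();
	return err;
}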
Linus Torvalds1da177e2005-04-16 15:20:36 -07001375
1376
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001377/**
1378 * dev_disable_lro - disable Large Receive Offload on a device
1379 * @dev: device
1380 *
1381 * Disable Large Receive Offload (LRO) on a net device. Must be
1382 * called under RTNL. This is needed if received packets may be
1383 * forwarded to another interface.
1384 */
1385void dev_disable_lro(struct net_device *dev)
1386{
Neil Hormanf11970e2011-05-24 08:31:09 +00001387 /*
1388	 * If we're trying to disable LRO on a vlan device,
1389	 * use the underlying physical device instead.
1390 */
1391 if (is_vlan_dev(dev))
1392 dev = vlan_dev_real_dev(dev);
1393
Michał Mirosławbc5787c62011-11-15 15:29:55 +00001394 dev->wanted_features &= ~NETIF_F_LRO;
1395 netdev_update_features(dev);
Michał Mirosław27660512011-03-18 16:56:34 +00001396
Michał Mirosław22d59692011-04-21 12:42:15 +00001397 if (unlikely(dev->features & NETIF_F_LRO))
1398 netdev_WARN(dev, "failed to disable LRO!\n");
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001399}
1400EXPORT_SYMBOL(dev_disable_lro);
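/*
 * Illustrative sketch (not part of this file): a forwarding setup
 * (e.g. a hypothetical bridge-like module adding a port) turning off
 * LRO on the slave device, since LRO-aggregated packets must not be
 * forwarded.  RTNL is assumed to be held by the caller, as
 * dev_disable_lro() requires.
 */
static void example_prepare_forwarding_port(struct net_device *port_dev)
{
	ASSERT_RTNL();

	/* Received packets may be forwarded, so LRO has to go */
	dev_disable_lro(port_dev);
}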
1401
Jiri Pirko351638e2013-05-28 01:30:21 +00001402static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1403 struct net_device *dev)
1404{
1405 struct netdev_notifier_info info;
1406
1407 netdev_notifier_info_init(&info, dev);
1408 return nb->notifier_call(nb, val, &info);
1409}
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001410
Eric W. Biederman881d9662007-09-17 11:56:21 -07001411static int dev_boot_phase = 1;
1412
Linus Torvalds1da177e2005-04-16 15:20:36 -07001413/**
1414 * register_netdevice_notifier - register a network notifier block
1415 * @nb: notifier
1416 *
1417 * Register a notifier to be called when network device events occur.
1418 * The notifier passed is linked into the kernel structures and must
1419 * not be reused until it has been unregistered. A negative errno code
1420 * is returned on a failure.
1421 *
1422	 * When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001423	 * to the new notifier to allow the device to have a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001424	 * view of the network device list.
1425 */
1426
1427int register_netdevice_notifier(struct notifier_block *nb)
1428{
1429 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001430 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001431 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001432 int err;
1433
1434 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001435 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001436 if (err)
1437 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001438 if (dev_boot_phase)
1439 goto unlock;
1440 for_each_net(net) {
1441 for_each_netdev(net, dev) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001442 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001443 err = notifier_to_errno(err);
1444 if (err)
1445 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001446
Eric W. Biederman881d9662007-09-17 11:56:21 -07001447 if (!(dev->flags & IFF_UP))
1448 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001449
Jiri Pirko351638e2013-05-28 01:30:21 +00001450 call_netdevice_notifier(nb, NETDEV_UP, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001451 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001452 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001453
1454unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001455 rtnl_unlock();
1456 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001457
1458rollback:
1459 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001460 for_each_net(net) {
1461 for_each_netdev(net, dev) {
1462 if (dev == last)
RongQing.Li8f891482011-11-30 23:43:07 -05001463 goto outroll;
Herbert Xufcc5a032007-07-30 17:03:38 -07001464
Eric W. Biederman881d9662007-09-17 11:56:21 -07001465 if (dev->flags & IFF_UP) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001466 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1467 dev);
1468 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001469 }
Jiri Pirko351638e2013-05-28 01:30:21 +00001470 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001471 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001472 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001473
RongQing.Li8f891482011-11-30 23:43:07 -05001474outroll:
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001475 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001476 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001477}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001478EXPORT_SYMBOL(register_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479
1480/**
1481 * unregister_netdevice_notifier - unregister a network notifier block
1482 * @nb: notifier
1483 *
1484 * Unregister a notifier previously registered by
1485	 * register_netdevice_notifier(). The notifier is unlinked from the
1486	 * kernel structures and may then be reused. A negative errno code
1487	 * is returned on a failure.
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001488 *
1489	 * After unregistering, unregister and down device events are synthesized
1490	 * for all devices on the device list and delivered to the removed notifier,
1491	 * removing the need for special-case cleanup code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001492 */
1493
1494int unregister_netdevice_notifier(struct notifier_block *nb)
1495{
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001496 struct net_device *dev;
1497 struct net *net;
Herbert Xu9f514952006-03-25 01:24:25 -08001498 int err;
1499
1500 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001501 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001502 if (err)
1503 goto unlock;
1504
1505 for_each_net(net) {
1506 for_each_netdev(net, dev) {
1507 if (dev->flags & IFF_UP) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001508 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1509 dev);
1510 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001511 }
Jiri Pirko351638e2013-05-28 01:30:21 +00001512 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001513 }
1514 }
1515unlock:
Herbert Xu9f514952006-03-25 01:24:25 -08001516 rtnl_unlock();
1517 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001518}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001519EXPORT_SYMBOL(unregister_netdevice_notifier);
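/*
 * Illustrative sketch (not part of this file): a minimal netdevice
 * notifier as another module might register it.  Since the
 * notifier-info conversion above, the callback's last argument is a
 * struct netdev_notifier_info pointer; the netdev_notifier_info_to_dev()
 * helper from that same series is assumed to be available to recover
 * the net_device.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		pr_info("%s: now up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s: going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/*
 * register_netdevice_notifier(&example_netdev_notifier) would replay
 * NETDEV_REGISTER/NETDEV_UP for already-present devices, and
 * unregister_netdevice_notifier() would synthesize the matching
 * GOING_DOWN/DOWN/UNREGISTER events on exit.
 */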
Linus Torvalds1da177e2005-04-16 15:20:36 -07001520
1521/**
Jiri Pirko351638e2013-05-28 01:30:21 +00001522 * call_netdevice_notifiers_info - call all network notifier blocks
1523 * @val: value passed unmodified to notifier function
1524 * @dev: net_device pointer passed unmodified to notifier function
1525 * @info: notifier information data
1526 *
1527 * Call all network notifier blocks. Parameters and return value
1528 * are as for raw_notifier_call_chain().
1529 */
1530
1531int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1532 struct netdev_notifier_info *info)
1533{
1534 ASSERT_RTNL();
1535 netdev_notifier_info_init(info, dev);
1536 return raw_notifier_call_chain(&netdev_chain, val, info);
1537}
1538EXPORT_SYMBOL(call_netdevice_notifiers_info);
1539
1540/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541 * call_netdevice_notifiers - call all network notifier blocks
1542 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001543 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001544 *
1545 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001546 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001547 */
1548
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001549int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001550{
Jiri Pirko351638e2013-05-28 01:30:21 +00001551 struct netdev_notifier_info info;
1552
1553 return call_netdevice_notifiers_info(val, dev, &info);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001554}
stephen hemmingeredf947f2011-03-24 13:24:01 +00001555EXPORT_SYMBOL(call_netdevice_notifiers);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001556
Ingo Molnarc5905af2012-02-24 08:31:31 +01001557static struct static_key netstamp_needed __read_mostly;
Eric Dumazetb90e5792011-11-28 11:16:50 +00001558#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01001559/* We are not allowed to call static_key_slow_dec() from irq context.
Eric Dumazetb90e5792011-11-28 11:16:50 +00001560 * If net_disable_timestamp() is called from irq context, defer the
Ingo Molnarc5905af2012-02-24 08:31:31 +01001561 * static_key_slow_dec() calls.
Eric Dumazetb90e5792011-11-28 11:16:50 +00001562 */
1563static atomic_t netstamp_needed_deferred;
1564#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001565
1566void net_enable_timestamp(void)
1567{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001568#ifdef HAVE_JUMP_LABEL
1569 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1570
1571 if (deferred) {
1572 while (--deferred)
Ingo Molnarc5905af2012-02-24 08:31:31 +01001573 static_key_slow_dec(&netstamp_needed);
Eric Dumazetb90e5792011-11-28 11:16:50 +00001574 return;
1575 }
1576#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001577 static_key_slow_inc(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001578}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001579EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001580
1581void net_disable_timestamp(void)
1582{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001583#ifdef HAVE_JUMP_LABEL
1584 if (in_interrupt()) {
1585 atomic_inc(&netstamp_needed_deferred);
1586 return;
1587 }
1588#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001589 static_key_slow_dec(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001590}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001591EXPORT_SYMBOL(net_disable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001592
Eric Dumazet3b098e22010-05-15 23:57:10 -07001593static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001594{
Eric Dumazet588f0332011-11-15 04:12:55 +00001595 skb->tstamp.tv64 = 0;
Ingo Molnarc5905af2012-02-24 08:31:31 +01001596 if (static_key_false(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001597 __net_timestamp(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598}
1599
Eric Dumazet588f0332011-11-15 04:12:55 +00001600#define net_timestamp_check(COND, SKB) \
Ingo Molnarc5905af2012-02-24 08:31:31 +01001601 if (static_key_false(&netstamp_needed)) { \
Eric Dumazet588f0332011-11-15 04:12:55 +00001602 if ((COND) && !(SKB)->tstamp.tv64) \
1603 __net_timestamp(SKB); \
1604 } \
Eric Dumazet3b098e22010-05-15 23:57:10 -07001605
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001606static inline bool is_skb_forwardable(struct net_device *dev,
1607 struct sk_buff *skb)
1608{
1609 unsigned int len;
1610
1611 if (!(dev->flags & IFF_UP))
1612 return false;
1613
1614 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1615 if (skb->len <= len)
1616 return true;
1617
1618	 /* if TSO is enabled, we don't care about the length, as the packet
1619	 * could be forwarded without being segmented first
1620 */
1621 if (skb_is_gso(skb))
1622 return true;
1623
1624 return false;
1625}
1626
Arnd Bergmann44540962009-11-26 06:07:08 +00001627/**
1628 * dev_forward_skb - loopback an skb to another netif
1629 *
1630 * @dev: destination network device
1631 * @skb: buffer to forward
1632 *
1633 * return values:
1634 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001635 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001636 *
1637 * dev_forward_skb can be used for injecting an skb from the
1638 * start_xmit function of one device into the receive queue
1639 * of another device.
1640 *
1641 * The receiving device may be in another namespace, so
1642 * we have to clear all information in the skb that could
1643 * impact namespace isolation.
1644 */
1645int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1646{
Michael S. Tsirkin48c83012011-08-31 08:03:29 +00001647 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1648 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1649 atomic_long_inc(&dev->rx_dropped);
1650 kfree_skb(skb);
1651 return NET_RX_DROP;
1652 }
1653 }
1654
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001655 if (unlikely(!is_skb_forwardable(dev, skb))) {
Eric Dumazetcaf586e2010-09-30 21:06:55 +00001656 atomic_long_inc(&dev->rx_dropped);
Eric Dumazet6ec82562010-05-06 00:53:53 -07001657 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001658 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001659 }
Nicolas Dichtel621e84d2013-06-26 16:11:27 +02001660 skb_scrub_packet(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001661 skb->protocol = eth_type_trans(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001662 return netif_rx(skb);
1663}
1664EXPORT_SYMBOL_GPL(dev_forward_skb);
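/*
 * Illustrative sketch (not part of this file): the typical
 * dev_forward_skb() caller is a veth-style pair whose start_xmit hands
 * each skb to its peer's receive path.  example_get_peer() is a
 * hypothetical lookup; statistics handling is reduced to the minimum.
 */
static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct net_device *peer = example_get_peer(dev);	/* hypothetical */
	unsigned int len = skb->len;

	if (unlikely(!peer)) {
		kfree_skb(skb);
		dev->stats.tx_dropped++;
		return NETDEV_TX_OK;
	}

	/* On NET_RX_DROP the skb has already been freed for us */
	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
		dev->stats.tx_packets++;
		dev->stats.tx_bytes += len;
	} else {
		dev->stats.tx_dropped++;
	}
	return NETDEV_TX_OK;
}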
1665
Changli Gao71d9dec2010-12-15 19:57:25 +00001666static inline int deliver_skb(struct sk_buff *skb,
1667 struct packet_type *pt_prev,
1668 struct net_device *orig_dev)
1669{
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00001670 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1671 return -ENOMEM;
Changli Gao71d9dec2010-12-15 19:57:25 +00001672 atomic_inc(&skb->users);
1673 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1674}
1675
Eric Leblondc0de08d2012-08-16 22:02:58 +00001676static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1677{
Eric Leblonda3d744e2012-11-06 02:10:10 +00001678 if (!ptype->af_packet_priv || !skb->sk)
Eric Leblondc0de08d2012-08-16 22:02:58 +00001679 return false;
1680
1681 if (ptype->id_match)
1682 return ptype->id_match(ptype, skb->sk);
1683 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1684 return true;
1685
1686 return false;
1687}
1688
Linus Torvalds1da177e2005-04-16 15:20:36 -07001689/*
1690 * Support routine. Sends outgoing frames to any network
1691 * taps currently in use.
1692 */
1693
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001694static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001695{
1696 struct packet_type *ptype;
Changli Gao71d9dec2010-12-15 19:57:25 +00001697 struct sk_buff *skb2 = NULL;
1698 struct packet_type *pt_prev = NULL;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001699
Linus Torvalds1da177e2005-04-16 15:20:36 -07001700 rcu_read_lock();
1701 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1702 /* Never send packets back to the socket
1703 * they originated from - MvS (miquels@drinkel.ow.org)
1704 */
1705 if ((ptype->dev == dev || !ptype->dev) &&
Eric Leblondc0de08d2012-08-16 22:02:58 +00001706 (!skb_loop_sk(ptype, skb))) {
Changli Gao71d9dec2010-12-15 19:57:25 +00001707 if (pt_prev) {
1708 deliver_skb(skb2, pt_prev, skb->dev);
1709 pt_prev = ptype;
1710 continue;
1711 }
1712
1713 skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001714 if (!skb2)
1715 break;
1716
Eric Dumazet70978182010-12-20 21:22:51 +00001717 net_timestamp_set(skb2);
1718
Linus Torvalds1da177e2005-04-16 15:20:36 -07001719 /* skb->nh should be correctly
1720				set by the sender, so that the second statement is
1721 just protection against buggy protocols.
1722 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001723 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001724
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001725 if (skb_network_header(skb2) < skb2->data ||
Simon Hormanced14f62013-05-28 20:34:25 +00001726 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
Joe Perchese87cc472012-05-13 21:56:26 +00001727 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1728 ntohs(skb2->protocol),
1729 dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001730 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001731 }
1732
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001733 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001734 skb2->pkt_type = PACKET_OUTGOING;
Changli Gao71d9dec2010-12-15 19:57:25 +00001735 pt_prev = ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001736 }
1737 }
Changli Gao71d9dec2010-12-15 19:57:25 +00001738 if (pt_prev)
1739 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001740 rcu_read_unlock();
1741}
1742
Ben Hutchings2c530402012-07-10 10:55:09 +00001743/**
1744 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
John Fastabend4f57c082011-01-17 08:06:04 +00001745 * @dev: Network device
1746 * @txq: number of queues available
1747 *
1748	 * If real_num_tx_queues is changed, the tc mappings may no longer be
1749	 * valid. To resolve this, verify that the tc mapping remains valid and,
1750	 * if not, null the mapping. With no priorities mapping to this
1751	 * offset/count pair it will no longer be used. In the worst case, if
1752	 * TC0 is invalid nothing can be done, so disable priority mappings. It
1753	 * is expected that drivers will fix this mapping if they can before
1754	 * calling netif_set_real_num_tx_queues.
1755 */
Eric Dumazetbb134d22011-01-20 19:18:08 +00001756static void netif_setup_tc(struct net_device *dev, unsigned int txq)
John Fastabend4f57c082011-01-17 08:06:04 +00001757{
1758 int i;
1759 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1760
1761 /* If TC0 is invalidated disable TC mapping */
1762 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001763 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
John Fastabend4f57c082011-01-17 08:06:04 +00001764 dev->num_tc = 0;
1765 return;
1766 }
1767
1768 /* Invalidated prio to tc mappings set to TC0 */
1769 for (i = 1; i < TC_BITMASK + 1; i++) {
1770 int q = netdev_get_prio_tc_map(dev, i);
1771
1772 tc = &dev->tc_to_txq[q];
1773 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001774 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1775 i, q);
John Fastabend4f57c082011-01-17 08:06:04 +00001776 netdev_set_prio_tc_map(dev, i, 0);
1777 }
1778 }
1779}
1780
Alexander Duyck537c00d2013-01-10 08:57:02 +00001781#ifdef CONFIG_XPS
1782static DEFINE_MUTEX(xps_map_mutex);
1783#define xmap_dereference(P) \
1784 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1785
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001786static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1787 int cpu, u16 index)
1788{
1789 struct xps_map *map = NULL;
1790 int pos;
1791
1792 if (dev_maps)
1793 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1794
1795 for (pos = 0; map && pos < map->len; pos++) {
1796 if (map->queues[pos] == index) {
1797 if (map->len > 1) {
1798 map->queues[pos] = map->queues[--map->len];
1799 } else {
1800 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1801 kfree_rcu(map, rcu);
1802 map = NULL;
1803 }
1804 break;
1805 }
1806 }
1807
1808 return map;
1809}
1810
Alexander Duyck024e9672013-01-10 08:57:46 +00001811static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
Alexander Duyck537c00d2013-01-10 08:57:02 +00001812{
1813 struct xps_dev_maps *dev_maps;
Alexander Duyck024e9672013-01-10 08:57:46 +00001814 int cpu, i;
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001815 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001816
1817 mutex_lock(&xps_map_mutex);
1818 dev_maps = xmap_dereference(dev->xps_maps);
1819
1820 if (!dev_maps)
1821 goto out_no_maps;
1822
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001823 for_each_possible_cpu(cpu) {
Alexander Duyck024e9672013-01-10 08:57:46 +00001824 for (i = index; i < dev->num_tx_queues; i++) {
1825 if (!remove_xps_queue(dev_maps, cpu, i))
1826 break;
1827 }
1828 if (i == dev->num_tx_queues)
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001829 active = true;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001830 }
1831
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001832 if (!active) {
Alexander Duyck537c00d2013-01-10 08:57:02 +00001833 RCU_INIT_POINTER(dev->xps_maps, NULL);
1834 kfree_rcu(dev_maps, rcu);
1835 }
1836
Alexander Duyck024e9672013-01-10 08:57:46 +00001837 for (i = index; i < dev->num_tx_queues; i++)
1838 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1839 NUMA_NO_NODE);
1840
Alexander Duyck537c00d2013-01-10 08:57:02 +00001841out_no_maps:
1842 mutex_unlock(&xps_map_mutex);
1843}
1844
Alexander Duyck01c5f862013-01-10 08:57:35 +00001845static struct xps_map *expand_xps_map(struct xps_map *map,
1846 int cpu, u16 index)
1847{
1848 struct xps_map *new_map;
1849 int alloc_len = XPS_MIN_MAP_ALLOC;
1850 int i, pos;
1851
1852 for (pos = 0; map && pos < map->len; pos++) {
1853 if (map->queues[pos] != index)
1854 continue;
1855 return map;
1856 }
1857
1858 /* Need to add queue to this CPU's existing map */
1859 if (map) {
1860 if (pos < map->alloc_len)
1861 return map;
1862
1863 alloc_len = map->alloc_len * 2;
1864 }
1865
1866 /* Need to allocate new map to store queue on this CPU's map */
1867 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1868 cpu_to_node(cpu));
1869 if (!new_map)
1870 return NULL;
1871
1872 for (i = 0; i < pos; i++)
1873 new_map->queues[i] = map->queues[i];
1874 new_map->alloc_len = alloc_len;
1875 new_map->len = pos;
1876
1877 return new_map;
1878}
1879
Alexander Duyck537c00d2013-01-10 08:57:02 +00001880int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1881{
Alexander Duyck01c5f862013-01-10 08:57:35 +00001882 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001883 struct xps_map *map, *new_map;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001884 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001885 int cpu, numa_node_id = -2;
1886 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001887
1888 mutex_lock(&xps_map_mutex);
1889
1890 dev_maps = xmap_dereference(dev->xps_maps);
1891
Alexander Duyck01c5f862013-01-10 08:57:35 +00001892 /* allocate memory for queue storage */
1893 for_each_online_cpu(cpu) {
1894 if (!cpumask_test_cpu(cpu, mask))
1895 continue;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001896
Alexander Duyck01c5f862013-01-10 08:57:35 +00001897 if (!new_dev_maps)
1898 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00001899 if (!new_dev_maps) {
1900 mutex_unlock(&xps_map_mutex);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001901 return -ENOMEM;
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00001902 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00001903
1904 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1905 NULL;
1906
1907 map = expand_xps_map(map, cpu, index);
1908 if (!map)
1909 goto error;
1910
1911 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1912 }
1913
1914 if (!new_dev_maps)
1915 goto out_no_new_maps;
1916
1917 for_each_possible_cpu(cpu) {
1918 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1919 /* add queue to CPU maps */
1920 int pos = 0;
1921
1922 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1923 while ((pos < map->len) && (map->queues[pos] != index))
1924 pos++;
1925
1926 if (pos == map->len)
1927 map->queues[map->len++] = index;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001928#ifdef CONFIG_NUMA
Alexander Duyck537c00d2013-01-10 08:57:02 +00001929 if (numa_node_id == -2)
1930 numa_node_id = cpu_to_node(cpu);
1931 else if (numa_node_id != cpu_to_node(cpu))
1932 numa_node_id = -1;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001933#endif
Alexander Duyck01c5f862013-01-10 08:57:35 +00001934 } else if (dev_maps) {
1935 /* fill in the new device map from the old device map */
1936 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1937 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
Alexander Duyck537c00d2013-01-10 08:57:02 +00001938 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00001939
Alexander Duyck537c00d2013-01-10 08:57:02 +00001940 }
1941
Alexander Duyck01c5f862013-01-10 08:57:35 +00001942 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1943
Alexander Duyck537c00d2013-01-10 08:57:02 +00001944 /* Cleanup old maps */
Alexander Duyck01c5f862013-01-10 08:57:35 +00001945 if (dev_maps) {
1946 for_each_possible_cpu(cpu) {
1947 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1948 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1949 if (map && map != new_map)
1950 kfree_rcu(map, rcu);
1951 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001952
Alexander Duyck537c00d2013-01-10 08:57:02 +00001953 kfree_rcu(dev_maps, rcu);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001954 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001955
Alexander Duyck01c5f862013-01-10 08:57:35 +00001956 dev_maps = new_dev_maps;
1957 active = true;
1958
1959out_no_new_maps:
1960 /* update Tx queue numa node */
Alexander Duyck537c00d2013-01-10 08:57:02 +00001961 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1962 (numa_node_id >= 0) ? numa_node_id :
1963 NUMA_NO_NODE);
1964
Alexander Duyck01c5f862013-01-10 08:57:35 +00001965 if (!dev_maps)
1966 goto out_no_maps;
1967
1968 /* removes queue from unused CPUs */
1969 for_each_possible_cpu(cpu) {
1970 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1971 continue;
1972
1973 if (remove_xps_queue(dev_maps, cpu, index))
1974 active = true;
1975 }
1976
1977 /* free map if not active */
1978 if (!active) {
1979 RCU_INIT_POINTER(dev->xps_maps, NULL);
1980 kfree_rcu(dev_maps, rcu);
1981 }
1982
1983out_no_maps:
Alexander Duyck537c00d2013-01-10 08:57:02 +00001984 mutex_unlock(&xps_map_mutex);
1985
1986 return 0;
1987error:
Alexander Duyck01c5f862013-01-10 08:57:35 +00001988 /* remove any maps that we added */
1989 for_each_possible_cpu(cpu) {
1990 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1991 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1992 NULL;
1993 if (new_map && new_map != map)
1994 kfree(new_map);
1995 }
1996
Alexander Duyck537c00d2013-01-10 08:57:02 +00001997 mutex_unlock(&xps_map_mutex);
1998
Alexander Duyck537c00d2013-01-10 08:57:02 +00001999 kfree(new_dev_maps);
2000 return -ENOMEM;
2001}
2002EXPORT_SYMBOL(netif_set_xps_queue);
2003
2004#endif
John Fastabendf0796d52010-07-01 13:21:57 +00002005/*
2006 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2007 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2008 */
Tom Herberte6484932010-10-18 18:04:39 +00002009int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
John Fastabendf0796d52010-07-01 13:21:57 +00002010{
Tom Herbert1d24eb42010-11-21 13:17:27 +00002011 int rc;
2012
Tom Herberte6484932010-10-18 18:04:39 +00002013 if (txq < 1 || txq > dev->num_tx_queues)
2014 return -EINVAL;
John Fastabendf0796d52010-07-01 13:21:57 +00002015
Ben Hutchings5c565802011-02-15 19:39:21 +00002016 if (dev->reg_state == NETREG_REGISTERED ||
2017 dev->reg_state == NETREG_UNREGISTERING) {
Tom Herberte6484932010-10-18 18:04:39 +00002018 ASSERT_RTNL();
2019
Tom Herbert1d24eb42010-11-21 13:17:27 +00002020 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2021 txq);
Tom Herbertbf264142010-11-26 08:36:09 +00002022 if (rc)
2023 return rc;
2024
John Fastabend4f57c082011-01-17 08:06:04 +00002025 if (dev->num_tc)
2026 netif_setup_tc(dev, txq);
2027
Alexander Duyck024e9672013-01-10 08:57:46 +00002028 if (txq < dev->real_num_tx_queues) {
Tom Herberte6484932010-10-18 18:04:39 +00002029 qdisc_reset_all_tx_gt(dev, txq);
Alexander Duyck024e9672013-01-10 08:57:46 +00002030#ifdef CONFIG_XPS
2031 netif_reset_xps_queues_gt(dev, txq);
2032#endif
2033 }
John Fastabendf0796d52010-07-01 13:21:57 +00002034 }
Tom Herberte6484932010-10-18 18:04:39 +00002035
2036 dev->real_num_tx_queues = txq;
2037 return 0;
John Fastabendf0796d52010-07-01 13:21:57 +00002038}
2039EXPORT_SYMBOL(netif_set_real_num_tx_queues);
Denis Vlasenko56079432006-03-29 15:57:29 -08002040
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002041#ifdef CONFIG_RPS
2042/**
2043 * netif_set_real_num_rx_queues - set actual number of RX queues used
2044 * @dev: Network device
2045 * @rxq: Actual number of RX queues
2046 *
2047 * This must be called either with the rtnl_lock held or before
2048 * registration of the net device. Returns 0 on success, or a
Ben Hutchings4e7f7952010-10-08 10:33:39 -07002049 * negative error code. If called before registration, it always
2050 * succeeds.
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002051 */
2052int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2053{
2054 int rc;
2055
Tom Herbertbd25fa72010-10-18 18:00:16 +00002056 if (rxq < 1 || rxq > dev->num_rx_queues)
2057 return -EINVAL;
2058
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002059 if (dev->reg_state == NETREG_REGISTERED) {
2060 ASSERT_RTNL();
2061
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002062 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2063 rxq);
2064 if (rc)
2065 return rc;
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002066 }
2067
2068 dev->real_num_rx_queues = rxq;
2069 return 0;
2070}
2071EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2072#endif
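/*
 * Illustrative sketch (not part of this file): a multiqueue driver
 * trimming its queue counts once it knows how many rings the hardware
 * actually enabled, e.g. from an ethtool set_channels handler.  RTNL is
 * held in that path, which both calls require on a registered device.
 */
static int example_apply_channels(struct net_device *dev,
				  unsigned int txq, unsigned int rxq)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, txq);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, rxq);
}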
2073
Ben Hutchings2c530402012-07-10 10:55:09 +00002074/**
2075 * netif_get_num_default_rss_queues - default number of RSS queues
Yuval Mintz16917b82012-07-01 03:18:50 +00002076 *
2077 * This routine should set an upper limit on the number of RSS queues
2078 * used by default by multiqueue devices.
2079 */
Ben Hutchingsa55b1382012-07-10 10:54:38 +00002080int netif_get_num_default_rss_queues(void)
Yuval Mintz16917b82012-07-01 03:18:50 +00002081{
2082 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2083}
2084EXPORT_SYMBOL(netif_get_num_default_rss_queues);
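/*
 * Illustrative sketch (not part of this file): drivers typically clamp
 * their default ring count with this helper; the hardware maximum here
 * is an assumed parameter.
 */
static unsigned int example_default_ring_count(unsigned int hw_max_rings)
{
	return min_t(unsigned int, hw_max_rings,
		     netif_get_num_default_rss_queues());
}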
2085
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002086static inline void __netif_reschedule(struct Qdisc *q)
2087{
2088 struct softnet_data *sd;
2089 unsigned long flags;
2090
2091 local_irq_save(flags);
2092 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00002093 q->next_sched = NULL;
2094 *sd->output_queue_tailp = q;
2095 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002096 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2097 local_irq_restore(flags);
2098}
2099
David S. Miller37437bb2008-07-16 02:15:04 -07002100void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08002101{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002102 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2103 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08002104}
2105EXPORT_SYMBOL(__netif_schedule);
2106
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002107void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08002108{
David S. Miller3578b0c2010-08-03 00:24:04 -07002109 if (atomic_dec_and_test(&skb->users)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002110 struct softnet_data *sd;
2111 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08002112
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002113 local_irq_save(flags);
2114 sd = &__get_cpu_var(softnet_data);
2115 skb->next = sd->completion_queue;
2116 sd->completion_queue = skb;
2117 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2118 local_irq_restore(flags);
2119 }
Denis Vlasenko56079432006-03-29 15:57:29 -08002120}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002121EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08002122
2123void dev_kfree_skb_any(struct sk_buff *skb)
2124{
2125 if (in_irq() || irqs_disabled())
2126 dev_kfree_skb_irq(skb);
2127 else
2128 dev_kfree_skb(skb);
2129}
2130EXPORT_SYMBOL(dev_kfree_skb_any);
2131
2132
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002133/**
2134 * netif_device_detach - mark device as removed
2135 * @dev: network device
2136 *
2137 * Mark device as removed from the system and therefore no longer available.
2138 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002139void netif_device_detach(struct net_device *dev)
2140{
2141 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2142 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002143 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002144 }
2145}
2146EXPORT_SYMBOL(netif_device_detach);
2147
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002148/**
2149 * netif_device_attach - mark device as attached
2150 * @dev: network device
2151 *
2152 * Mark device as attached to the system and restart if needed.
2153 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002154void netif_device_attach(struct net_device *dev)
2155{
2156 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2157 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002158 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002159 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002160 }
2161}
2162EXPORT_SYMBOL(netif_device_attach);
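/*
 * Illustrative sketch (not part of this file): the usual suspend/resume
 * pairing in a PCI network driver.  The hardware start/stop helpers and
 * the use of legacy PCI PM callbacks are assumptions for the example
 * (and <linux/pci.h> would be needed for the types).
 */
static int example_suspend(struct pci_dev *pdev, pm_message_t state)
{
	struct net_device *dev = pci_get_drvdata(pdev);

	netif_device_detach(dev);		/* stop queues, clear present */
	if (netif_running(dev))
		example_hw_down(dev);		/* hypothetical */
	return 0;
}

static int example_resume(struct pci_dev *pdev)
{
	struct net_device *dev = pci_get_drvdata(pdev);

	if (netif_running(dev))
		example_hw_up(dev);		/* hypothetical */
	netif_device_attach(dev);		/* restart queues if running */
	return 0;
}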
2163
Ben Hutchings36c92472012-01-17 07:57:56 +00002164static void skb_warn_bad_offload(const struct sk_buff *skb)
2165{
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002166 static const netdev_features_t null_features = 0;
Ben Hutchings36c92472012-01-17 07:57:56 +00002167 struct net_device *dev = skb->dev;
2168 const char *driver = "";
2169
Ben Greearc846ad92013-04-19 10:45:52 +00002170 if (!net_ratelimit())
2171 return;
2172
Ben Hutchings36c92472012-01-17 07:57:56 +00002173 if (dev && dev->dev.parent)
2174 driver = dev_driver_string(dev->dev.parent);
2175
2176 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2177 "gso_type=%d ip_summed=%d\n",
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002178 driver, dev ? &dev->features : &null_features,
2179 skb->sk ? &skb->sk->sk_route_caps : &null_features,
Ben Hutchings36c92472012-01-17 07:57:56 +00002180 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2181 skb_shinfo(skb)->gso_type, skb->ip_summed);
2182}
2183
Linus Torvalds1da177e2005-04-16 15:20:36 -07002184/*
2185 * Invalidate hardware checksum when packet is to be mangled, and
2186 * complete checksum manually on outgoing path.
2187 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07002188int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002189{
Al Virod3bc23e2006-11-14 21:24:49 -08002190 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07002191 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002192
Patrick McHardy84fa7932006-08-29 16:44:56 -07002193 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07002194 goto out_set_summed;
2195
2196 if (unlikely(skb_shinfo(skb)->gso_size)) {
Ben Hutchings36c92472012-01-17 07:57:56 +00002197 skb_warn_bad_offload(skb);
2198 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002199 }
2200
Eric Dumazetcef401d2013-01-25 20:34:37 +00002201 /* Before computing a checksum, we should make sure no frag could
2202	 * be modified by an external entity: the checksum could be wrong.
2203 */
2204 if (skb_has_shared_frag(skb)) {
2205 ret = __skb_linearize(skb);
2206 if (ret)
2207 goto out;
2208 }
2209
Michał Mirosław55508d62010-12-14 15:24:08 +00002210 offset = skb_checksum_start_offset(skb);
Herbert Xua0308472007-10-15 01:47:15 -07002211 BUG_ON(offset >= skb_headlen(skb));
2212 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2213
2214 offset += skb->csum_offset;
2215 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2216
2217 if (skb_cloned(skb) &&
2218 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002219 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2220 if (ret)
2221 goto out;
2222 }
2223
Herbert Xua0308472007-10-15 01:47:15 -07002224 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07002225out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002226 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002227out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228 return ret;
2229}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002230EXPORT_SYMBOL(skb_checksum_help);
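/*
 * Illustrative sketch (not part of this file): a transmit path falling
 * back to a software checksum when the hardware cannot offload this
 * particular packet.  example_hw_can_csum() is a hypothetical
 * capability check.
 */
static int example_tx_prepare_csum(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !example_hw_can_csum(skb))
		return skb_checksum_help(skb);	/* 0 or negative errno */

	return 0;
}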
Linus Torvalds1da177e2005-04-16 15:20:36 -07002231
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002232__be16 skb_network_protocol(struct sk_buff *skb)
2233{
2234 __be16 type = skb->protocol;
David S. Miller61816592013-03-20 12:46:26 -04002235 int vlan_depth = ETH_HLEN;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002236
Pravin B Shelar19acc322013-05-07 20:41:07 +00002237 /* Tunnel gso handlers can set protocol to ethernet. */
2238 if (type == htons(ETH_P_TEB)) {
2239 struct ethhdr *eth;
2240
2241 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2242 return 0;
2243
2244 eth = (struct ethhdr *)skb_mac_header(skb);
2245 type = eth->h_proto;
2246 }
2247
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002248 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002249 struct vlan_hdr *vh;
2250
2251 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2252 return 0;
2253
2254 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2255 type = vh->h_vlan_encapsulated_proto;
2256 vlan_depth += VLAN_HLEN;
2257 }
2258
2259 return type;
2260}
2261
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002262/**
2263 * skb_mac_gso_segment - mac layer segmentation handler.
2264 * @skb: buffer to segment
2265 * @features: features for the output path (see dev->features)
2266 */
2267struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2268 netdev_features_t features)
2269{
2270 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2271 struct packet_offload *ptype;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002272 __be16 type = skb_network_protocol(skb);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002273
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002274 if (unlikely(!type))
2275 return ERR_PTR(-EINVAL);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002276
2277 __skb_pull(skb, skb->mac_len);
2278
2279 rcu_read_lock();
2280 list_for_each_entry_rcu(ptype, &offload_base, list) {
2281 if (ptype->type == type && ptype->callbacks.gso_segment) {
2282 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2283 int err;
2284
2285 err = ptype->callbacks.gso_send_check(skb);
2286 segs = ERR_PTR(err);
2287 if (err || skb_gso_ok(skb, features))
2288 break;
2289 __skb_push(skb, (skb->data -
2290 skb_network_header(skb)));
2291 }
2292 segs = ptype->callbacks.gso_segment(skb, features);
2293 break;
2294 }
2295 }
2296 rcu_read_unlock();
2297
2298 __skb_push(skb, skb->data - skb_mac_header(skb));
2299
2300 return segs;
2301}
2302EXPORT_SYMBOL(skb_mac_gso_segment);
2303
2304
Cong Wang12b00042013-02-05 16:36:38 +00002305/* openvswitch calls this on the rx path, so we need a different check.
2306 */
2307static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2308{
2309 if (tx_path)
2310 return skb->ip_summed != CHECKSUM_PARTIAL;
2311 else
2312 return skb->ip_summed == CHECKSUM_NONE;
2313}
2314
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002315/**
Cong Wang12b00042013-02-05 16:36:38 +00002316 * __skb_gso_segment - Perform segmentation on skb.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002317 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07002318 * @features: features for the output path (see dev->features)
Cong Wang12b00042013-02-05 16:36:38 +00002319 * @tx_path: whether it is called in TX path
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002320 *
2321 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07002322 *
2323 * It may return NULL if the skb requires no segmentation. This is
2324 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002325 */
Cong Wang12b00042013-02-05 16:36:38 +00002326struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2327 netdev_features_t features, bool tx_path)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002328{
Cong Wang12b00042013-02-05 16:36:38 +00002329 if (unlikely(skb_needs_check(skb, tx_path))) {
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002330 int err;
2331
Ben Hutchings36c92472012-01-17 07:57:56 +00002332 skb_warn_bad_offload(skb);
Herbert Xu67fd1a72009-01-19 16:26:44 -08002333
Herbert Xua430a432006-07-08 13:34:56 -07002334 if (skb_header_cloned(skb) &&
2335 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2336 return ERR_PTR(err);
2337 }
2338
Pravin B Shelar68c33162013-02-14 14:02:41 +00002339 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002340 skb_reset_mac_header(skb);
2341 skb_reset_mac_len(skb);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002342
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002343 return skb_mac_gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002344}
Cong Wang12b00042013-02-05 16:36:38 +00002345EXPORT_SYMBOL(__skb_gso_segment);
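/*
 * Illustrative sketch (not part of this file): software-segmenting a
 * GSO skb and walking the resulting list, roughly what a tunnel or
 * forwarding path does when the output device lacks the required GSO
 * feature.  The xmit callback is an assumption; error handling is kept
 * minimal.
 */
static int example_segment_and_xmit(struct sk_buff *skb,
				    netdev_features_t features,
				    int (*xmit)(struct sk_buff *skb))
{
	struct sk_buff *segs, *nskb;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)			/* header check only, send as-is */
		return xmit(skb);

	consume_skb(skb);		/* original now represented by segs */
	do {
		nskb = segs->next;
		segs->next = NULL;
		xmit(segs);
		segs = nskb;
	} while (segs);

	return 0;
}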
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002346
Herbert Xufb286bb2005-11-10 13:01:24 -08002347/* Take action when hardware reception checksum errors are detected. */
2348#ifdef CONFIG_BUG
2349void netdev_rx_csum_fault(struct net_device *dev)
2350{
2351 if (net_ratelimit()) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00002352 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08002353 dump_stack();
2354 }
2355}
2356EXPORT_SYMBOL(netdev_rx_csum_fault);
2357#endif
2358
Linus Torvalds1da177e2005-04-16 15:20:36 -07002359/* Actually, we should eliminate this check as soon as we know that:
2360 * 1. An IOMMU is present and can map all the memory.
2361 * 2. No high memory really exists on this machine.
2362 */
2363
Eric Dumazet9092c652010-04-02 13:34:49 -07002364static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002365{
Herbert Xu3d3a8532006-06-27 13:33:10 -07002366#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07002367 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002368 if (!(dev->features & NETIF_F_HIGHDMA)) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002369 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2370 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2371 if (PageHighMem(skb_frag_page(frag)))
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002372 return 1;
Ian Campbellea2ab692011-08-22 23:44:58 +00002373 }
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002374 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002375
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002376 if (PCI_DMA_BUS_IS_PHYS) {
2377 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002378
Eric Dumazet9092c652010-04-02 13:34:49 -07002379 if (!pdev)
2380 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002381 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002382 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2383 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002384 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2385 return 1;
2386 }
2387 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07002388#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002389 return 0;
2390}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002391
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002392struct dev_gso_cb {
2393 void (*destructor)(struct sk_buff *skb);
2394};
2395
2396#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2397
2398static void dev_gso_skb_destructor(struct sk_buff *skb)
2399{
2400 struct dev_gso_cb *cb;
2401
2402 do {
2403 struct sk_buff *nskb = skb->next;
2404
2405 skb->next = nskb->next;
2406 nskb->next = NULL;
2407 kfree_skb(nskb);
2408 } while (skb->next);
2409
2410 cb = DEV_GSO_CB(skb);
2411 if (cb->destructor)
2412 cb->destructor(skb);
2413}
2414
2415/**
2416 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2417 * @skb: buffer to segment
Jesse Gross91ecb632011-01-09 06:23:33 +00002418 * @features: device features as applicable to this skb
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002419 *
2420 * This function segments the given skb and stores the list of segments
2421 * in skb->next.
2422 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002423static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002424{
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002425 struct sk_buff *segs;
2426
Herbert Xu576a30e2006-06-27 13:22:38 -07002427 segs = skb_gso_segment(skb, features);
2428
2429 /* Verifying header integrity only. */
2430 if (!segs)
2431 return 0;
2432
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07002433 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002434 return PTR_ERR(segs);
2435
2436 skb->next = segs;
2437 DEV_GSO_CB(skb)->destructor = skb->destructor;
2438 skb->destructor = dev_gso_skb_destructor;
2439
2440 return 0;
2441}
2442
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002443static netdev_features_t harmonize_features(struct sk_buff *skb,
2444 __be16 protocol, netdev_features_t features)
Jesse Grossf01a5232011-01-09 06:23:31 +00002445{
Ed Cashinc0d680e2012-09-19 15:49:00 +00002446 if (skb->ip_summed != CHECKSUM_NONE &&
2447 !can_checksum_protocol(features, protocol)) {
Jesse Grossf01a5232011-01-09 06:23:31 +00002448 features &= ~NETIF_F_ALL_CSUM;
Jesse Grossf01a5232011-01-09 06:23:31 +00002449 } else if (illegal_highdma(skb->dev, skb)) {
2450 features &= ~NETIF_F_SG;
2451 }
2452
2453 return features;
2454}
2455
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002456netdev_features_t netif_skb_features(struct sk_buff *skb)
Jesse Gross58e998c2010-10-29 12:14:55 +00002457{
2458 __be16 protocol = skb->protocol;
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002459 netdev_features_t features = skb->dev->features;
Jesse Gross58e998c2010-10-29 12:14:55 +00002460
Ben Hutchings30b678d2012-07-30 15:57:00 +00002461 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2462 features &= ~NETIF_F_GSO_MASK;
2463
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002464 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
Jesse Gross58e998c2010-10-29 12:14:55 +00002465 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2466 protocol = veh->h_vlan_encapsulated_proto;
Jesse Grossf01a5232011-01-09 06:23:31 +00002467 } else if (!vlan_tx_tag_present(skb)) {
2468 return harmonize_features(skb, protocol, features);
2469 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002470
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002471 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2472 NETIF_F_HW_VLAN_STAG_TX);
Jesse Grossf01a5232011-01-09 06:23:31 +00002473
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002474 if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
Jesse Grossf01a5232011-01-09 06:23:31 +00002475 return harmonize_features(skb, protocol, features);
2476 } else {
2477 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002478 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2479 NETIF_F_HW_VLAN_STAG_TX;
Jesse Grossf01a5232011-01-09 06:23:31 +00002480 return harmonize_features(skb, protocol, features);
2481 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002482}
Jesse Grossf01a5232011-01-09 06:23:31 +00002483EXPORT_SYMBOL(netif_skb_features);
Jesse Gross58e998c2010-10-29 12:14:55 +00002484
John Fastabend6afff0c2010-06-16 14:18:12 +00002485/*
2486 * Returns true if either:
2487 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
Rami Rosend1a53df2012-08-27 23:39:24 +00002488 * 2. skb is fragmented and the device does not support SG.
John Fastabend6afff0c2010-06-16 14:18:12 +00002489 */
2490static inline int skb_needs_linearize(struct sk_buff *skb,
Patrick McHardy6708c9e2013-05-01 22:36:49 +00002491 netdev_features_t features)
John Fastabend6afff0c2010-06-16 14:18:12 +00002492{
Jesse Gross02932ce2011-01-09 06:23:34 +00002493 return skb_is_nonlinear(skb) &&
2494 ((skb_has_frag_list(skb) &&
2495 !(features & NETIF_F_FRAGLIST)) ||
Jesse Grosse1e78db2010-10-29 12:14:53 +00002496 (skb_shinfo(skb)->nr_frags &&
Jesse Gross02932ce2011-01-09 06:23:34 +00002497 !(features & NETIF_F_SG)));
John Fastabend6afff0c2010-06-16 14:18:12 +00002498}
2499
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002500int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2501 struct netdev_queue *txq)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002502{
Stephen Hemminger00829822008-11-20 20:14:53 -08002503 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00002504 int rc = NETDEV_TX_OK;
Koki Sanagiec764bf2011-05-30 21:48:34 +00002505 unsigned int skb_len;
Stephen Hemminger00829822008-11-20 20:14:53 -08002506
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002507 if (likely(!skb->next)) {
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002508 netdev_features_t features;
Jesse Grossfc741212011-01-09 06:23:32 +00002509
Eric Dumazet93f154b2009-05-18 22:19:19 -07002510 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03002511		 * If the device doesn't need skb->dst, release it right now while
Eric Dumazet93f154b2009-05-18 22:19:19 -07002512		 * it's still hot in this cpu's cache
2513 */
Eric Dumazetadf30902009-06-02 05:19:30 +00002514 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2515 skb_dst_drop(skb);
2516
Jesse Grossfc741212011-01-09 06:23:32 +00002517 features = netif_skb_features(skb);
2518
Jesse Gross7b9c6092010-10-20 13:56:04 +00002519 if (vlan_tx_tag_present(skb) &&
Patrick McHardy86a9bad2013-04-19 02:04:30 +00002520 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2521 skb = __vlan_put_tag(skb, skb->vlan_proto,
2522 vlan_tx_tag_get(skb));
Jesse Gross7b9c6092010-10-20 13:56:04 +00002523 if (unlikely(!skb))
2524 goto out;
2525
2526 skb->vlan_tci = 0;
2527 }
2528
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002529		/* If this is an encapsulation offload request, verify that we are
2530		 * testing hardware encapsulation features instead of the standard
2531		 * features for the netdev
2532 */
2533 if (skb->encapsulation)
2534 features &= dev->hw_enc_features;
2535
Jesse Grossfc741212011-01-09 06:23:32 +00002536 if (netif_needs_gso(skb, features)) {
Jesse Gross91ecb632011-01-09 06:23:33 +00002537 if (unlikely(dev_gso_segment(skb, features)))
David S. Miller9ccb8972010-04-22 01:02:07 -07002538 goto out_kfree_skb;
2539 if (skb->next)
2540 goto gso;
John Fastabend6afff0c2010-06-16 14:18:12 +00002541 } else {
Jesse Gross02932ce2011-01-09 06:23:34 +00002542 if (skb_needs_linearize(skb, features) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002543 __skb_linearize(skb))
2544 goto out_kfree_skb;
2545
2546 /* If packet is not checksummed and device does not
2547 * support checksumming for this protocol, complete
2548 * checksumming here.
2549 */
2550 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002551 if (skb->encapsulation)
2552 skb_set_inner_transport_header(skb,
2553 skb_checksum_start_offset(skb));
2554 else
2555 skb_set_transport_header(skb,
2556 skb_checksum_start_offset(skb));
Jesse Gross03634662011-01-09 06:23:35 +00002557 if (!(features & NETIF_F_ALL_CSUM) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002558 skb_checksum_help(skb))
2559 goto out_kfree_skb;
2560 }
David S. Miller9ccb8972010-04-22 01:02:07 -07002561 }
2562
Eric Dumazetb40863c2012-09-18 20:44:49 +00002563 if (!list_empty(&ptype_all))
2564 dev_queue_xmit_nit(skb, dev);
2565
Koki Sanagiec764bf2011-05-30 21:48:34 +00002566 skb_len = skb->len;
Patrick Ohlyac45f602009-02-12 05:03:37 +00002567 rc = ops->ndo_start_xmit(skb, dev);
Koki Sanagiec764bf2011-05-30 21:48:34 +00002568 trace_net_dev_xmit(skb, rc, dev, skb_len);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002569 if (rc == NETDEV_TX_OK)
Eric Dumazet08baf562009-05-25 22:58:01 -07002570 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00002571 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002572 }
2573
Herbert Xu576a30e2006-06-27 13:22:38 -07002574gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002575 do {
2576 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002577
2578 skb->next = nskb->next;
2579 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00002580
Eric Dumazetb40863c2012-09-18 20:44:49 +00002581 if (!list_empty(&ptype_all))
2582 dev_queue_xmit_nit(nskb, dev);
2583
Koki Sanagiec764bf2011-05-30 21:48:34 +00002584 skb_len = nskb->len;
Stephen Hemminger00829822008-11-20 20:14:53 -08002585 rc = ops->ndo_start_xmit(nskb, dev);
Koki Sanagiec764bf2011-05-30 21:48:34 +00002586 trace_net_dev_xmit(nskb, rc, dev, skb_len);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002587 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002588 if (rc & ~NETDEV_TX_MASK)
2589 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07002590 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002591 skb->next = nskb;
2592 return rc;
2593 }
Eric Dumazet08baf562009-05-25 22:58:01 -07002594 txq_trans_update(txq);
Tom Herbert734664982011-11-28 16:32:44 +00002595 if (unlikely(netif_xmit_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07002596 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002597 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002598
Patrick McHardy572a9d72009-11-10 06:14:14 +00002599out_kfree_gso_skb:
Sridhar Samudrala0c772152013-04-29 13:02:42 +00002600 if (likely(skb->next == NULL)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002601 skb->destructor = DEV_GSO_CB(skb)->destructor;
Sridhar Samudrala0c772152013-04-29 13:02:42 +00002602 consume_skb(skb);
2603 return rc;
2604 }
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002605out_kfree_skb:
2606 kfree_skb(skb);
Jesse Gross7b9c6092010-10-20 13:56:04 +00002607out:
Patrick McHardy572a9d72009-11-10 06:14:14 +00002608 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002609}
2610
Eric Dumazet1def9232013-01-10 12:36:42 +00002611static void qdisc_pkt_len_init(struct sk_buff *skb)
2612{
2613 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2614
2615 qdisc_skb_cb(skb)->pkt_len = skb->len;
2616
	2617	 /* To get a more precise estimate of bytes sent on the wire,
	2618	 * we add to pkt_len the header size of all segments
2619 */
2620 if (shinfo->gso_size) {
Eric Dumazet757b8b12013-01-15 21:14:21 -08002621 unsigned int hdr_len;
Jason Wang15e5a032013-03-25 20:19:59 +00002622 u16 gso_segs = shinfo->gso_segs;
Eric Dumazet1def9232013-01-10 12:36:42 +00002623
Eric Dumazet757b8b12013-01-15 21:14:21 -08002624 /* mac layer + network layer */
2625 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2626
2627 /* + transport layer */
Eric Dumazet1def9232013-01-10 12:36:42 +00002628 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2629 hdr_len += tcp_hdrlen(skb);
2630 else
2631 hdr_len += sizeof(struct udphdr);
Jason Wang15e5a032013-03-25 20:19:59 +00002632
2633 if (shinfo->gso_type & SKB_GSO_DODGY)
2634 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2635 shinfo->gso_size);
2636
2637 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
Eric Dumazet1def9232013-01-10 12:36:42 +00002638 }
2639}
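/*
 * Worked example for the estimate above (illustrative only, not part of
 * the original file): a TCP GSO skb carrying 2 x 1448 bytes of payload
 * behind a 14-byte Ethernet + 20-byte IPv4 + 32-byte TCP header has
 * skb->len = 2962, gso_segs = 2 and hdr_len = 66.  qdisc_pkt_len_init()
 * then reports pkt_len = 2962 + (2 - 1) * 66 = 3028, i.e. two on-wire
 * frames of 1514 bytes each, which is what the shaper should account for.
 */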
2640
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002641static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2642 struct net_device *dev,
2643 struct netdev_queue *txq)
2644{
2645 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002646 bool contended;
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002647 int rc;
2648
Eric Dumazet1def9232013-01-10 12:36:42 +00002649 qdisc_pkt_len_init(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002650 qdisc_calculate_pkt_len(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002651 /*
2652 * Heuristic to force contended enqueues to serialize on a
	2653	 * separate lock before trying to get the qdisc main lock.
	2654	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2655 * and dequeue packets faster.
2656 */
Eric Dumazeta2da5702011-01-20 03:48:19 +00002657 contended = qdisc_is_running(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002658 if (unlikely(contended))
2659 spin_lock(&q->busylock);
2660
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002661 spin_lock(root_lock);
2662 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2663 kfree_skb(skb);
2664 rc = NET_XMIT_DROP;
2665 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07002666 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002667 /*
2668 * This is a work-conserving queue; there are no old skbs
2669 * waiting to be sent out; and the qdisc is not running -
2670 * xmit the skb directly.
2671 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002672 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2673 skb_dst_force(skb);
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002674
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002675 qdisc_bstats_update(q, skb);
2676
Eric Dumazet79640a42010-06-02 05:09:29 -07002677 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2678 if (unlikely(contended)) {
2679 spin_unlock(&q->busylock);
2680 contended = false;
2681 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002682 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002683 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07002684 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002685
2686 rc = NET_XMIT_SUCCESS;
2687 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002688 skb_dst_force(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002689 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
Eric Dumazet79640a42010-06-02 05:09:29 -07002690 if (qdisc_run_begin(q)) {
2691 if (unlikely(contended)) {
2692 spin_unlock(&q->busylock);
2693 contended = false;
2694 }
2695 __qdisc_run(q);
2696 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002697 }
2698 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07002699 if (unlikely(contended))
2700 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002701 return rc;
2702}
2703
Neil Horman5bc14212011-11-22 05:10:51 +00002704#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2705static void skb_update_prio(struct sk_buff *skb)
2706{
Igor Maravic6977a792011-11-25 07:44:54 +00002707 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
Neil Horman5bc14212011-11-22 05:10:51 +00002708
Eric Dumazet91c68ce2012-07-08 21:45:10 +00002709 if (!skb->priority && skb->sk && map) {
2710 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2711
2712 if (prioidx < map->priomap_len)
2713 skb->priority = map->priomap[prioidx];
2714 }
Neil Horman5bc14212011-11-22 05:10:51 +00002715}
2716#else
2717#define skb_update_prio(skb)
2718#endif
2719
Eric Dumazet745e20f2010-09-29 13:23:09 -07002720static DEFINE_PER_CPU(int, xmit_recursion);
David S. Miller11a766c2010-10-25 12:51:55 -07002721#define RECURSION_LIMIT 10
Eric Dumazet745e20f2010-09-29 13:23:09 -07002722
Dave Jonesd29f7492008-07-22 14:09:06 -07002723/**
Michel Machado95603e22012-06-12 10:16:35 +00002724 * dev_loopback_xmit - loop back @skb
2725 * @skb: buffer to transmit
2726 */
2727int dev_loopback_xmit(struct sk_buff *skb)
2728{
2729 skb_reset_mac_header(skb);
2730 __skb_pull(skb, skb_network_offset(skb));
2731 skb->pkt_type = PACKET_LOOPBACK;
2732 skb->ip_summed = CHECKSUM_UNNECESSARY;
2733 WARN_ON(!skb_dst(skb));
2734 skb_dst_force(skb);
2735 netif_rx_ni(skb);
2736 return 0;
2737}
2738EXPORT_SYMBOL(dev_loopback_xmit);
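/*
 * Illustrative sketch (not part of dev.c): how a caller such as the
 * multicast routing code might feed a copy of an outgoing packet back
 * into the local receive path.  It assumes a dst is already attached
 * to the skb, since dev_loopback_xmit() warns otherwise.
 */
static void example_loop_back_copy(struct sk_buff *skb)
{
	struct sk_buff *copy = skb_clone(skb, GFP_ATOMIC);

	if (copy)
		dev_loopback_xmit(copy);	/* copy is consumed here */
}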
2739
2740/**
Dave Jonesd29f7492008-07-22 14:09:06 -07002741 * dev_queue_xmit - transmit a buffer
2742 * @skb: buffer to transmit
2743 *
2744 * Queue a buffer for transmission to a network device. The caller must
2745 * have set the device and priority and built the buffer before calling
2746 * this function. The function can be called from an interrupt.
2747 *
2748 * A negative errno code is returned on a failure. A success does not
2749 * guarantee the frame will be transmitted as it may be dropped due
2750 * to congestion or traffic shaping.
2751 *
2752 * -----------------------------------------------------------------------------------
2753 * I notice this method can also return errors from the queue disciplines,
2754 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2755 * be positive.
2756 *
2757 * Regardless of the return value, the skb is consumed, so it is currently
2758 * difficult to retry a send to this method. (You can bump the ref count
2759 * before sending to hold a reference for retry if you are careful.)
2760 *
2761 * When calling this method, interrupts MUST be enabled. This is because
2762 * the BH enable code must have IRQs enabled so that it will not deadlock.
2763 * --BLG
2764 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002765int dev_queue_xmit(struct sk_buff *skb)
2766{
2767 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002768 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002769 struct Qdisc *q;
2770 int rc = -ENOMEM;
2771
Eric Dumazet6d1ccff2013-02-05 20:22:20 +00002772 skb_reset_mac_header(skb);
2773
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002774 /* Disable soft irqs for various locks below. Also
2775 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002776 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002777 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002778
Neil Horman5bc14212011-11-22 05:10:51 +00002779 skb_update_prio(skb);
2780
Amerigo Wang8c4c49d2012-09-17 20:16:31 +00002781 txq = netdev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002782 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002783
Linus Torvalds1da177e2005-04-16 15:20:36 -07002784#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002785 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002786#endif
Koki Sanagicf66ba52010-08-23 18:45:02 +09002787 trace_net_dev_queue(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002788 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002789 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002790 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002791 }
2792
2793 /* The device has no queue. Common case for software devices:
2794 loopback, all the sorts of tunnels...
2795
Herbert Xu932ff272006-06-09 12:20:56 -07002796 Really, it is unlikely that netif_tx_lock protection is necessary
	2797	 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002798	 counters.)
	2799	 However, it is possible that they rely on the protection
	2800	 we provide here.
	2801	
	2802	 Check this and take the lock. It is not prone to deadlocks.
	2803	 Or take the noqueue qdisc path, which is even simpler 8)
2804 */
2805 if (dev->flags & IFF_UP) {
2806 int cpu = smp_processor_id(); /* ok because BHs are off */
2807
David S. Millerc773e842008-07-08 23:13:53 -07002808 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002809
Eric Dumazet745e20f2010-09-29 13:23:09 -07002810 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2811 goto recursion_alert;
2812
David S. Millerc773e842008-07-08 23:13:53 -07002813 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002814
Tom Herbert734664982011-11-28 16:32:44 +00002815 if (!netif_xmit_stopped(txq)) {
Eric Dumazet745e20f2010-09-29 13:23:09 -07002816 __this_cpu_inc(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002817 rc = dev_hard_start_xmit(skb, dev, txq);
Eric Dumazet745e20f2010-09-29 13:23:09 -07002818 __this_cpu_dec(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002819 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002820 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002821 goto out;
2822 }
2823 }
David S. Millerc773e842008-07-08 23:13:53 -07002824 HARD_TX_UNLOCK(dev, txq);
Joe Perchese87cc472012-05-13 21:56:26 +00002825 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2826 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002827 } else {
	2828	 /* Recursion has been detected! It is possible,
Eric Dumazet745e20f2010-09-29 13:23:09 -07002829 * unfortunately
2830 */
2831recursion_alert:
Joe Perchese87cc472012-05-13 21:56:26 +00002832 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2833 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002834 }
2835 }
2836
2837 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002838 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002839
Linus Torvalds1da177e2005-04-16 15:20:36 -07002840 kfree_skb(skb);
2841 return rc;
2842out:
Herbert Xud4828d82006-06-22 02:28:18 -07002843 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002844 return rc;
2845}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002846EXPORT_SYMBOL(dev_queue_xmit);
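/*
 * Illustrative sketch (not part of dev.c): a minimal caller handing a
 * prebuilt frame to dev_queue_xmit().  It assumes @frame already holds a
 * complete link-layer frame for @dev; real callers also build the
 * link-layer header (e.g. via dev_hard_header()) and pick skb->protocol
 * to match the payload.
 */
static int example_xmit_frame(struct net_device *dev,
			      const void *frame, unsigned int len)
{
	struct sk_buff *skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);

	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), frame, len);

	skb_reset_mac_header(skb);
	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);	/* assumed payload type */

	/* The skb is consumed whatever the outcome, as documented above. */
	return dev_queue_xmit(skb);
}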
Linus Torvalds1da177e2005-04-16 15:20:36 -07002847
2848
2849/*=======================================================================
2850 Receiver routines
2851 =======================================================================*/
2852
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002853int netdev_max_backlog __read_mostly = 1000;
Eric Dumazetc9e6bc62012-09-27 19:29:05 +00002854EXPORT_SYMBOL(netdev_max_backlog);
2855
Eric Dumazet3b098e22010-05-15 23:57:10 -07002856int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002857int netdev_budget __read_mostly = 300;
2858int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002859
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002860/* Called with irq disabled */
2861static inline void ____napi_schedule(struct softnet_data *sd,
2862 struct napi_struct *napi)
2863{
2864 list_add_tail(&napi->poll_list, &sd->poll_list);
2865 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2866}
2867
Eric Dumazetdf334542010-03-24 19:13:54 +00002868#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002869
2870/* One global table that all flow-based protocols share. */
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002871struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002872EXPORT_SYMBOL(rps_sock_flow_table);
2873
Ingo Molnarc5905af2012-02-24 08:31:31 +01002874struct static_key rps_needed __read_mostly;
Eric Dumazetadc93002011-11-17 03:13:26 +00002875
Ben Hutchingsc4454772011-01-19 11:03:53 +00002876static struct rps_dev_flow *
2877set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2878 struct rps_dev_flow *rflow, u16 next_cpu)
2879{
Ben Hutchings09994d12011-10-03 04:42:46 +00002880 if (next_cpu != RPS_NO_CPU) {
Ben Hutchingsc4454772011-01-19 11:03:53 +00002881#ifdef CONFIG_RFS_ACCEL
2882 struct netdev_rx_queue *rxqueue;
2883 struct rps_dev_flow_table *flow_table;
2884 struct rps_dev_flow *old_rflow;
2885 u32 flow_id;
2886 u16 rxq_index;
2887 int rc;
2888
2889 /* Should we steer this flow to a different hardware queue? */
Ben Hutchings69a19ee2011-02-15 20:32:04 +00002890 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2891 !(dev->features & NETIF_F_NTUPLE))
Ben Hutchingsc4454772011-01-19 11:03:53 +00002892 goto out;
2893 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2894 if (rxq_index == skb_get_rx_queue(skb))
2895 goto out;
2896
2897 rxqueue = dev->_rx + rxq_index;
2898 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2899 if (!flow_table)
2900 goto out;
2901 flow_id = skb->rxhash & flow_table->mask;
2902 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2903 rxq_index, flow_id);
2904 if (rc < 0)
2905 goto out;
2906 old_rflow = rflow;
2907 rflow = &flow_table->flows[flow_id];
Ben Hutchingsc4454772011-01-19 11:03:53 +00002908 rflow->filter = rc;
2909 if (old_rflow->filter == rflow->filter)
2910 old_rflow->filter = RPS_NO_FILTER;
2911 out:
2912#endif
2913 rflow->last_qtail =
Ben Hutchings09994d12011-10-03 04:42:46 +00002914 per_cpu(softnet_data, next_cpu).input_queue_head;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002915 }
2916
Ben Hutchings09994d12011-10-03 04:42:46 +00002917 rflow->cpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002918 return rflow;
2919}
2920
Tom Herbert0a9627f2010-03-16 08:03:29 +00002921/*
2922 * get_rps_cpu is called from netif_receive_skb and returns the target
2923 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002924 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002925 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002926static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2927 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002928{
Tom Herbert0a9627f2010-03-16 08:03:29 +00002929 struct netdev_rx_queue *rxqueue;
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002930 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002931 struct rps_dev_flow_table *flow_table;
2932 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002933 int cpu = -1;
Tom Herbertfec5e652010-04-16 16:01:27 -07002934 u16 tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002935
Tom Herbert0a9627f2010-03-16 08:03:29 +00002936 if (skb_rx_queue_recorded(skb)) {
2937 u16 index = skb_get_rx_queue(skb);
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002938 if (unlikely(index >= dev->real_num_rx_queues)) {
2939 WARN_ONCE(dev->real_num_rx_queues > 1,
2940 "%s received packet on queue %u, but number "
2941 "of RX queues is %u\n",
2942 dev->name, index, dev->real_num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002943 goto done;
2944 }
2945 rxqueue = dev->_rx + index;
2946 } else
2947 rxqueue = dev->_rx;
2948
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002949 map = rcu_dereference(rxqueue->rps_map);
2950 if (map) {
Tom Herbert85875232011-01-31 16:23:42 -08002951 if (map->len == 1 &&
Eric Dumazet33d480c2011-08-11 19:30:52 +00002952 !rcu_access_pointer(rxqueue->rps_flow_table)) {
Changli Gao6febfca2010-09-03 23:12:37 +00002953 tcpu = map->cpus[0];
2954 if (cpu_online(tcpu))
2955 cpu = tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002956 goto done;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002957 }
Eric Dumazet33d480c2011-08-11 19:30:52 +00002958 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002959 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002960 }
2961
Changli Gao2d47b452010-08-17 19:00:56 +00002962 skb_reset_network_header(skb);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002963 if (!skb_get_rxhash(skb))
Tom Herbert0a9627f2010-03-16 08:03:29 +00002964 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002965
Tom Herbertfec5e652010-04-16 16:01:27 -07002966 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2967 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2968 if (flow_table && sock_flow_table) {
2969 u16 next_cpu;
2970 struct rps_dev_flow *rflow;
2971
2972 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2973 tcpu = rflow->cpu;
2974
2975 next_cpu = sock_flow_table->ents[skb->rxhash &
2976 sock_flow_table->mask];
2977
2978 /*
2979 * If the desired CPU (where last recvmsg was done) is
2980 * different from current CPU (one in the rx-queue flow
2981 * table entry), switch if one of the following holds:
2982 * - Current CPU is unset (equal to RPS_NO_CPU).
2983 * - Current CPU is offline.
2984 * - The current CPU's queue tail has advanced beyond the
2985 * last packet that was enqueued using this table entry.
2986 * This guarantees that all previous packets for the flow
2987 * have been dequeued, thus preserving in order delivery.
2988 */
2989 if (unlikely(tcpu != next_cpu) &&
2990 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2991 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
Tom Herbertbaefa312012-11-16 09:04:15 +00002992 rflow->last_qtail)) >= 0)) {
2993 tcpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002994 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
Tom Herbertbaefa312012-11-16 09:04:15 +00002995 }
Ben Hutchingsc4454772011-01-19 11:03:53 +00002996
Tom Herbertfec5e652010-04-16 16:01:27 -07002997 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2998 *rflowp = rflow;
2999 cpu = tcpu;
3000 goto done;
3001 }
3002 }
3003
Tom Herbert0a9627f2010-03-16 08:03:29 +00003004 if (map) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003005 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00003006
3007 if (cpu_online(tcpu)) {
3008 cpu = tcpu;
3009 goto done;
3010 }
3011 }
3012
3013done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00003014 return cpu;
3015}
3016
Ben Hutchingsc4454772011-01-19 11:03:53 +00003017#ifdef CONFIG_RFS_ACCEL
3018
3019/**
3020 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3021 * @dev: Device on which the filter was set
3022 * @rxq_index: RX queue index
3023 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3024 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3025 *
3026 * Drivers that implement ndo_rx_flow_steer() should periodically call
3027 * this function for each installed filter and remove the filters for
3028 * which it returns %true.
3029 */
3030bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3031 u32 flow_id, u16 filter_id)
3032{
3033 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3034 struct rps_dev_flow_table *flow_table;
3035 struct rps_dev_flow *rflow;
3036 bool expire = true;
3037 int cpu;
3038
3039 rcu_read_lock();
3040 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3041 if (flow_table && flow_id <= flow_table->mask) {
3042 rflow = &flow_table->flows[flow_id];
3043 cpu = ACCESS_ONCE(rflow->cpu);
3044 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3045 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3046 rflow->last_qtail) <
3047 (int)(10 * flow_table->mask)))
3048 expire = false;
3049 }
3050 rcu_read_unlock();
3051 return expire;
3052}
3053EXPORT_SYMBOL(rps_may_expire_flow);
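/*
 * Illustrative sketch (not part of dev.c): how a driver that implements
 * ndo_rx_flow_steer() might age out its hardware filters from a periodic
 * worker.  struct example_adapter, struct example_filter and
 * example_remove_hw_filter() are hypothetical; rps_may_expire_flow() is
 * the only real API used.
 */
static void example_expire_rx_filters(struct example_adapter *adapter)
{
	struct example_filter *f, *tmp;

	list_for_each_entry_safe(f, tmp, &adapter->rx_filters, list) {
		if (rps_may_expire_flow(adapter->netdev, f->rxq_index,
					f->flow_id, f->filter_id)) {
			example_remove_hw_filter(adapter, f);
			list_del(&f->list);
			kfree(f);
		}
	}
}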
3054
3055#endif /* CONFIG_RFS_ACCEL */
3056
Tom Herbert0a9627f2010-03-16 08:03:29 +00003057/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003058static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003059{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003060 struct softnet_data *sd = data;
3061
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003062 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00003063 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003064}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003065
Tom Herbertfec5e652010-04-16 16:01:27 -07003066#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00003067
3068/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003069 * Check whether this softnet_data structure belongs to another CPU.
3070 * If yes, queue it to our IPI list and return 1
3071 * If no, return 0
3072 */
3073static int rps_ipi_queued(struct softnet_data *sd)
3074{
3075#ifdef CONFIG_RPS
3076 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3077
3078 if (sd != mysd) {
3079 sd->rps_ipi_next = mysd->rps_ipi_list;
3080 mysd->rps_ipi_list = sd;
3081
3082 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3083 return 1;
3084 }
3085#endif /* CONFIG_RPS */
3086 return 0;
3087}
3088
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003089#ifdef CONFIG_NET_FLOW_LIMIT
3090int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3091#endif
3092
3093static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3094{
3095#ifdef CONFIG_NET_FLOW_LIMIT
3096 struct sd_flow_limit *fl;
3097 struct softnet_data *sd;
3098 unsigned int old_flow, new_flow;
3099
3100 if (qlen < (netdev_max_backlog >> 1))
3101 return false;
3102
3103 sd = &__get_cpu_var(softnet_data);
3104
3105 rcu_read_lock();
3106 fl = rcu_dereference(sd->flow_limit);
3107 if (fl) {
3108 new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3109 old_flow = fl->history[fl->history_head];
3110 fl->history[fl->history_head] = new_flow;
3111
3112 fl->history_head++;
3113 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3114
3115 if (likely(fl->buckets[old_flow]))
3116 fl->buckets[old_flow]--;
3117
3118 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3119 fl->count++;
3120 rcu_read_unlock();
3121 return true;
3122 }
3123 }
3124 rcu_read_unlock();
3125#endif
3126 return false;
3127}
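/*
 * Illustrative note (not part of dev.c): assuming FLOW_LIMIT_HISTORY is
 * 128, the table above remembers which bucket each of the last 128
 * enqueued skbs hashed to.  The check only activates once the backlog is
 * at least half of netdev_max_backlog; a new skb is then dropped when
 * more than 64 of those recent packets fell into its own bucket, i.e.
 * when a single flow is responsible for the bulk of the queue.  Small
 * flows keep their bucket counts low and are still admitted.
 */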
3128
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003129/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00003130 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3131 * queue (may be a remote CPU queue).
3132 */
Tom Herbertfec5e652010-04-16 16:01:27 -07003133static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3134 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003135{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003136 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003137 unsigned long flags;
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003138 unsigned int qlen;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003139
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003140 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003141
3142 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003143
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003144 rps_lock(sd);
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003145 qlen = skb_queue_len(&sd->input_pkt_queue);
3146 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
Changli Gao6e7676c2010-04-27 15:07:33 -07003147 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00003148enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003149 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003150 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003151 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00003152 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003153 return NET_RX_SUCCESS;
3154 }
3155
Eric Dumazetebda37c22010-05-06 23:51:21 +00003156 /* Schedule NAPI for backlog device
	3157	 * We can use a non-atomic operation since we own the queue lock
3158 */
3159 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003160 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003161 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003162 }
3163 goto enqueue;
3164 }
3165
Changli Gaodee42872010-05-02 05:42:16 +00003166 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003167 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003168
Tom Herbert0a9627f2010-03-16 08:03:29 +00003169 local_irq_restore(flags);
3170
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003171 atomic_long_inc(&skb->dev->rx_dropped);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003172 kfree_skb(skb);
3173 return NET_RX_DROP;
3174}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003175
Linus Torvalds1da177e2005-04-16 15:20:36 -07003176/**
3177 * netif_rx - post buffer to the network code
3178 * @skb: buffer to post
3179 *
3180 * This function receives a packet from a device driver and queues it for
3181 * the upper (protocol) levels to process. It always succeeds. The buffer
3182 * may be dropped during processing for congestion control or by the
3183 * protocol layers.
3184 *
3185 * return values:
3186 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003187 * NET_RX_DROP (packet was dropped)
3188 *
3189 */
3190
3191int netif_rx(struct sk_buff *skb)
3192{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003193 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003194
3195 /* if netpoll wants it, pretend we never saw it */
3196 if (netpoll_rx(skb))
3197 return NET_RX_DROP;
3198
Eric Dumazet588f0332011-11-15 04:12:55 +00003199 net_timestamp_check(netdev_tstamp_prequeue, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003200
Koki Sanagicf66ba52010-08-23 18:45:02 +09003201 trace_netif_rx(skb);
Eric Dumazetdf334542010-03-24 19:13:54 +00003202#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003203 if (static_key_false(&rps_needed)) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003204 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003205 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003206
Changli Gaocece1942010-08-07 20:35:43 -07003207 preempt_disable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003208 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003209
3210 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003211 if (cpu < 0)
3212 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07003213
3214 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3215
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003216 rcu_read_unlock();
Changli Gaocece1942010-08-07 20:35:43 -07003217 preempt_enable();
Eric Dumazetadc93002011-11-17 03:13:26 +00003218 } else
3219#endif
Tom Herbertfec5e652010-04-16 16:01:27 -07003220 {
3221 unsigned int qtail;
3222 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3223 put_cpu();
3224 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003225 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003226}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003227EXPORT_SYMBOL(netif_rx);
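/*
 * Illustrative sketch (not part of dev.c): a non-NAPI driver receive
 * interrupt handing a freshly received frame to netif_rx().  The
 * example_read_frame() helper and @len are hypothetical stand-ins for
 * however the hardware exposes the packet.
 */
static void example_isr_rx(struct net_device *dev, unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb(dev, len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	example_read_frame(dev, skb_put(skb, len));	/* copy from hardware */
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */

	netif_rx(skb);	/* queue for the upper layers; always succeeds */
}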
Linus Torvalds1da177e2005-04-16 15:20:36 -07003228
3229int netif_rx_ni(struct sk_buff *skb)
3230{
3231 int err;
3232
3233 preempt_disable();
3234 err = netif_rx(skb);
3235 if (local_softirq_pending())
3236 do_softirq();
3237 preempt_enable();
3238
3239 return err;
3240}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003241EXPORT_SYMBOL(netif_rx_ni);
3242
Linus Torvalds1da177e2005-04-16 15:20:36 -07003243static void net_tx_action(struct softirq_action *h)
3244{
3245 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3246
3247 if (sd->completion_queue) {
3248 struct sk_buff *clist;
3249
3250 local_irq_disable();
3251 clist = sd->completion_queue;
3252 sd->completion_queue = NULL;
3253 local_irq_enable();
3254
3255 while (clist) {
3256 struct sk_buff *skb = clist;
3257 clist = clist->next;
3258
Ilpo Järvinen547b7922008-07-25 21:43:18 -07003259 WARN_ON(atomic_read(&skb->users));
Koki Sanagi07dc22e2010-08-23 18:46:12 +09003260 trace_kfree_skb(skb, net_tx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003261 __kfree_skb(skb);
3262 }
3263 }
3264
3265 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07003266 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003267
3268 local_irq_disable();
3269 head = sd->output_queue;
3270 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00003271 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003272 local_irq_enable();
3273
3274 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07003275 struct Qdisc *q = head;
3276 spinlock_t *root_lock;
3277
Linus Torvalds1da177e2005-04-16 15:20:36 -07003278 head = head->next_sched;
3279
David S. Miller5fb66222008-08-02 20:02:43 -07003280 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07003281 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07003282 smp_mb__before_clear_bit();
3283 clear_bit(__QDISC_STATE_SCHED,
3284 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07003285 qdisc_run(q);
3286 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003287 } else {
David S. Miller195648b2008-08-19 04:00:36 -07003288 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003289 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07003290 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003291 } else {
3292 smp_mb__before_clear_bit();
3293 clear_bit(__QDISC_STATE_SCHED,
3294 &q->state);
3295 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003296 }
3297 }
3298 }
3299}
3300
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003301#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3302 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
Michał Mirosławda678292009-06-05 05:35:28 +00003303/* This hook is defined here for ATM LANE */
3304int (*br_fdb_test_addr_hook)(struct net_device *dev,
3305 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07003306EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00003307#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003308
Linus Torvalds1da177e2005-04-16 15:20:36 -07003309#ifdef CONFIG_NET_CLS_ACT
	3310/* TODO: Maybe we should just force sch_ingress to be compiled in
	3311 * when CONFIG_NET_CLS_ACT is? Otherwise we eat some useless instructions
	3312 * (a compare and 2 extra stores) right now if we don't have it on
	3313 * but do have CONFIG_NET_CLS_ACT
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003314 * NOTE: This doesn't stop any functionality; if you don't have
	3315 * the ingress scheduler, you just can't add policies on ingress.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003316 *
3317 */
Eric Dumazet24824a02010-10-02 06:11:55 +00003318static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003319{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003320 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003321 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07003322 int result = TC_ACT_OK;
3323 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003324
Stephen Hemmingerde384832010-08-01 00:33:23 -07003325 if (unlikely(MAX_RED_LOOP < ttl++)) {
Joe Perchese87cc472012-05-13 21:56:26 +00003326 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3327 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07003328 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003329 }
3330
Herbert Xuf697c3e2007-10-14 00:38:47 -07003331 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3332 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3333
David S. Miller83874002008-07-17 00:53:03 -07003334 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07003335 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07003336 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07003337 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3338 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07003339 spin_unlock(qdisc_lock(q));
3340 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07003341
Linus Torvalds1da177e2005-04-16 15:20:36 -07003342 return result;
3343}
Herbert Xuf697c3e2007-10-14 00:38:47 -07003344
3345static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3346 struct packet_type **pt_prev,
3347 int *ret, struct net_device *orig_dev)
3348{
Eric Dumazet24824a02010-10-02 06:11:55 +00003349 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3350
3351 if (!rxq || rxq->qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07003352 goto out;
3353
3354 if (*pt_prev) {
3355 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3356 *pt_prev = NULL;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003357 }
3358
Eric Dumazet24824a02010-10-02 06:11:55 +00003359 switch (ing_filter(skb, rxq)) {
Herbert Xuf697c3e2007-10-14 00:38:47 -07003360 case TC_ACT_SHOT:
3361 case TC_ACT_STOLEN:
3362 kfree_skb(skb);
3363 return NULL;
3364 }
3365
3366out:
3367 skb->tc_verd = 0;
3368 return skb;
3369}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003370#endif
3371
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003372/**
3373 * netdev_rx_handler_register - register receive handler
3374 * @dev: device to register a handler for
3375 * @rx_handler: receive handler to register
Jiri Pirko93e2c322010-06-10 03:34:59 +00003376 * @rx_handler_data: data pointer that is used by rx handler
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003377 *
	3378	 * Register a receive handler for a device. This handler will then be
3379 * called from __netif_receive_skb. A negative errno code is returned
3380 * on a failure.
3381 *
3382 * The caller must hold the rtnl_mutex.
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003383 *
3384 * For a general description of rx_handler, see enum rx_handler_result.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003385 */
3386int netdev_rx_handler_register(struct net_device *dev,
Jiri Pirko93e2c322010-06-10 03:34:59 +00003387 rx_handler_func_t *rx_handler,
3388 void *rx_handler_data)
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003389{
3390 ASSERT_RTNL();
3391
3392 if (dev->rx_handler)
3393 return -EBUSY;
3394
Eric Dumazet00cfec32013-03-29 03:01:22 +00003395 /* Note: rx_handler_data must be set before rx_handler */
Jiri Pirko93e2c322010-06-10 03:34:59 +00003396 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003397 rcu_assign_pointer(dev->rx_handler, rx_handler);
3398
3399 return 0;
3400}
3401EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
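/*
 * Illustrative sketch (not part of dev.c): the shape of an rx_handler as
 * used by bridge-like drivers.  struct example_port and example_port_rx()
 * are hypothetical; the registration pattern and the RX_HANDLER_* return
 * values are the real contract described above.
 */
static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct example_port *port = rcu_dereference(skb->dev->rx_handler_data);

	if (example_port_rx(port, skb))
		return RX_HANDLER_CONSUMED;	/* we took ownership of skb */

	return RX_HANDLER_PASS;			/* let the stack keep going */
}

static int example_port_attach(struct example_port *port, struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, example_handle_frame, port);
	rtnl_unlock();
	return err;
}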
3402
3403/**
3404 * netdev_rx_handler_unregister - unregister receive handler
3405 * @dev: device to unregister a handler from
3406 *
Kusanagi Kouichi166ec362013-03-18 02:59:52 +00003407 * Unregister a receive handler from a device.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003408 *
3409 * The caller must hold the rtnl_mutex.
3410 */
3411void netdev_rx_handler_unregister(struct net_device *dev)
3412{
3413
3414 ASSERT_RTNL();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003415 RCU_INIT_POINTER(dev->rx_handler, NULL);
Eric Dumazet00cfec32013-03-29 03:01:22 +00003416	 /* a reader seeing a non-NULL rx_handler in a rcu_read_lock()
	3417	 * section is guaranteed to see a non-NULL rx_handler_data
3418 * as well.
3419 */
3420 synchronize_net();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003421 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003422}
3423EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3424
Mel Gormanb4b9e352012-07-31 16:44:26 -07003425/*
3426 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3427 * the special handling of PFMEMALLOC skbs.
3428 */
3429static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3430{
3431 switch (skb->protocol) {
3432 case __constant_htons(ETH_P_ARP):
3433 case __constant_htons(ETH_P_IP):
3434 case __constant_htons(ETH_P_IPV6):
3435 case __constant_htons(ETH_P_8021Q):
Patrick McHardy8ad227f2013-04-19 02:04:31 +00003436 case __constant_htons(ETH_P_8021AD):
Mel Gormanb4b9e352012-07-31 16:44:26 -07003437 return true;
3438 default:
3439 return false;
3440 }
3441}
3442
David S. Miller9754e292013-02-14 15:57:38 -05003443static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003444{
3445 struct packet_type *ptype, *pt_prev;
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003446 rx_handler_func_t *rx_handler;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003447 struct net_device *orig_dev;
David S. Miller63d8ea72011-02-28 10:48:59 -08003448 struct net_device *null_or_dev;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003449 bool deliver_exact = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003450 int ret = NET_RX_DROP;
Al Viro252e3342006-11-14 20:48:11 -08003451 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003452
Eric Dumazet588f0332011-11-15 04:12:55 +00003453 net_timestamp_check(!netdev_tstamp_prequeue, skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07003454
Koki Sanagicf66ba52010-08-23 18:45:02 +09003455 trace_netif_receive_skb(skb);
Patrick McHardy9b22ea52008-11-04 14:49:57 -08003456
Linus Torvalds1da177e2005-04-16 15:20:36 -07003457 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003458 if (netpoll_receive_skb(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003459 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003460
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07003461 orig_dev = skb->dev;
Jiri Pirko1765a572011-02-12 06:48:36 +00003462
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003463 skb_reset_network_header(skb);
Eric Dumazetfda55ec2013-01-07 09:28:21 +00003464 if (!skb_transport_header_was_set(skb))
3465 skb_reset_transport_header(skb);
Jiri Pirko0b5c9db2011-06-10 06:56:58 +00003466 skb_reset_mac_len(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003467
3468 pt_prev = NULL;
3469
3470 rcu_read_lock();
3471
David S. Miller63d8ea72011-02-28 10:48:59 -08003472another_round:
David S. Millerb6858172012-07-23 16:27:54 -07003473 skb->skb_iif = skb->dev->ifindex;
David S. Miller63d8ea72011-02-28 10:48:59 -08003474
3475 __this_cpu_inc(softnet_data.processed);
3476
Patrick McHardy8ad227f2013-04-19 02:04:31 +00003477 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3478 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003479 skb = vlan_untag(skb);
3480 if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003481 goto unlock;
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003482 }
3483
Linus Torvalds1da177e2005-04-16 15:20:36 -07003484#ifdef CONFIG_NET_CLS_ACT
3485 if (skb->tc_verd & TC_NCLS) {
3486 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3487 goto ncls;
3488 }
3489#endif
3490
David S. Miller9754e292013-02-14 15:57:38 -05003491 if (pfmemalloc)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003492 goto skip_taps;
3493
Linus Torvalds1da177e2005-04-16 15:20:36 -07003494 list_for_each_entry_rcu(ptype, &ptype_all, list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003495 if (!ptype->dev || ptype->dev == skb->dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003496 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003497 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003498 pt_prev = ptype;
3499 }
3500 }
3501
Mel Gormanb4b9e352012-07-31 16:44:26 -07003502skip_taps:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003503#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07003504 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3505 if (!skb)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003506 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003507ncls:
3508#endif
3509
David S. Miller9754e292013-02-14 15:57:38 -05003510 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003511 goto drop;
3512
John Fastabend24257172011-10-10 09:16:41 +00003513 if (vlan_tx_tag_present(skb)) {
3514 if (pt_prev) {
3515 ret = deliver_skb(skb, pt_prev, orig_dev);
3516 pt_prev = NULL;
3517 }
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003518 if (vlan_do_receive(&skb))
John Fastabend24257172011-10-10 09:16:41 +00003519 goto another_round;
3520 else if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003521 goto unlock;
John Fastabend24257172011-10-10 09:16:41 +00003522 }
3523
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003524 rx_handler = rcu_dereference(skb->dev->rx_handler);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003525 if (rx_handler) {
3526 if (pt_prev) {
3527 ret = deliver_skb(skb, pt_prev, orig_dev);
3528 pt_prev = NULL;
3529 }
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003530 switch (rx_handler(&skb)) {
3531 case RX_HANDLER_CONSUMED:
Cristian Bercaru3bc1b1a2013-03-08 07:03:38 +00003532 ret = NET_RX_SUCCESS;
Mel Gormanb4b9e352012-07-31 16:44:26 -07003533 goto unlock;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003534 case RX_HANDLER_ANOTHER:
David S. Miller63d8ea72011-02-28 10:48:59 -08003535 goto another_round;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003536 case RX_HANDLER_EXACT:
3537 deliver_exact = true;
3538 case RX_HANDLER_PASS:
3539 break;
3540 default:
3541 BUG();
3542 }
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003543 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003544
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003545 if (vlan_tx_nonzero_tag_present(skb))
3546 skb->pkt_type = PACKET_OTHERHOST;
3547
David S. Miller63d8ea72011-02-28 10:48:59 -08003548 /* deliver only exact match when indicated */
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003549 null_or_dev = deliver_exact ? skb->dev : NULL;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003550
Linus Torvalds1da177e2005-04-16 15:20:36 -07003551 type = skb->protocol;
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003552 list_for_each_entry_rcu(ptype,
3553 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003554 if (ptype->type == type &&
Jiri Pirkoe3f48d32011-02-28 20:26:31 +00003555 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3556 ptype->dev == orig_dev)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003557 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003558 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003559 pt_prev = ptype;
3560 }
3561 }
3562
3563 if (pt_prev) {
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003564 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
Michael S. Tsirkin0e698bf2012-09-15 22:44:16 +00003565 goto drop;
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003566 else
3567 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003568 } else {
Mel Gormanb4b9e352012-07-31 16:44:26 -07003569drop:
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003570 atomic_long_inc(&skb->dev->rx_dropped);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003571 kfree_skb(skb);
	3572	 /* Jamal, now you will not be able to escape explaining
	3573	 * to me how you were going to use this. :-)
3574 */
3575 ret = NET_RX_DROP;
3576 }
3577
Mel Gormanb4b9e352012-07-31 16:44:26 -07003578unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003579 rcu_read_unlock();
Mel Gormanb4b9e352012-07-31 16:44:26 -07003580out:
David S. Miller9754e292013-02-14 15:57:38 -05003581 return ret;
3582}
3583
3584static int __netif_receive_skb(struct sk_buff *skb)
3585{
3586 int ret;
3587
3588 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3589 unsigned long pflags = current->flags;
3590
3591 /*
3592 * PFMEMALLOC skbs are special, they should
3593 * - be delivered to SOCK_MEMALLOC sockets only
3594 * - stay away from userspace
3595 * - have bounded memory usage
3596 *
3597 * Use PF_MEMALLOC as this saves us from propagating the allocation
3598 * context down to all allocation sites.
3599 */
3600 current->flags |= PF_MEMALLOC;
3601 ret = __netif_receive_skb_core(skb, true);
3602 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3603 } else
3604 ret = __netif_receive_skb_core(skb, false);
3605
Linus Torvalds1da177e2005-04-16 15:20:36 -07003606 return ret;
3607}
Tom Herbert0a9627f2010-03-16 08:03:29 +00003608
3609/**
3610 * netif_receive_skb - process receive buffer from network
3611 * @skb: buffer to process
3612 *
3613 * netif_receive_skb() is the main receive data processing function.
3614 * It always succeeds. The buffer may be dropped during processing
3615 * for congestion control or by the protocol layers.
3616 *
3617 * This function may only be called from softirq context and interrupts
3618 * should be enabled.
3619 *
3620 * Return values (usually ignored):
3621 * NET_RX_SUCCESS: no congestion
3622 * NET_RX_DROP: packet was dropped
3623 */
3624int netif_receive_skb(struct sk_buff *skb)
3625{
Eric Dumazet588f0332011-11-15 04:12:55 +00003626 net_timestamp_check(netdev_tstamp_prequeue, skb);
Eric Dumazet3b098e22010-05-15 23:57:10 -07003627
Richard Cochranc1f19b52010-07-17 08:49:36 +00003628 if (skb_defer_rx_timestamp(skb))
3629 return NET_RX_SUCCESS;
3630
Eric Dumazetdf334542010-03-24 19:13:54 +00003631#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003632 if (static_key_false(&rps_needed)) {
Eric Dumazet3b098e22010-05-15 23:57:10 -07003633 struct rps_dev_flow voidflow, *rflow = &voidflow;
3634 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003635
Eric Dumazet3b098e22010-05-15 23:57:10 -07003636 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00003637
Eric Dumazet3b098e22010-05-15 23:57:10 -07003638 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07003639
Eric Dumazet3b098e22010-05-15 23:57:10 -07003640 if (cpu >= 0) {
3641 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3642 rcu_read_unlock();
Eric Dumazetadc93002011-11-17 03:13:26 +00003643 return ret;
Eric Dumazet3b098e22010-05-15 23:57:10 -07003644 }
Eric Dumazetadc93002011-11-17 03:13:26 +00003645 rcu_read_unlock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003646 }
Tom Herbert1e94d722010-03-18 17:45:44 -07003647#endif
Eric Dumazetadc93002011-11-17 03:13:26 +00003648 return __netif_receive_skb(skb);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003649}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003650EXPORT_SYMBOL(netif_receive_skb);
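/*
 * Illustrative sketch (not part of dev.c): a NAPI poll routine feeding
 * received frames to netif_receive_skb() from softirq context, as the
 * comment above requires.  example_next_rx_skb() is a hypothetical
 * stand-in for the driver's descriptor ring handling.
 */
static int example_napi_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;

	while (work_done < budget) {
		struct sk_buff *skb = example_next_rx_skb(napi->dev);

		if (!skb)
			break;

		skb->protocol = eth_type_trans(skb, napi->dev);
		netif_receive_skb(skb);
		work_done++;
	}

	if (work_done < budget) {
		napi_complete(napi);
		/* re-enable the device's RX interrupt here */
	}

	return work_done;
}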
Linus Torvalds1da177e2005-04-16 15:20:36 -07003651
Eric Dumazet88751272010-04-19 05:07:33 +00003652/* Network device is going away, flush any packets still pending
3653 * Called with irqs disabled.
3654 */
Changli Gao152102c2010-03-30 20:16:22 +00003655static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003656{
Changli Gao152102c2010-03-30 20:16:22 +00003657 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003658 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003659 struct sk_buff *skb, *tmp;
3660
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003661 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003662 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003663 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003664 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003665 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003666 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003667 }
Changli Gao6e7676c2010-04-27 15:07:33 -07003668 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003669 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003670
3671 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3672 if (skb->dev == dev) {
3673 __skb_unlink(skb, &sd->process_queue);
3674 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003675 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003676 }
3677 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003678}
3679
Herbert Xud565b0a2008-12-15 23:38:52 -08003680static int napi_gro_complete(struct sk_buff *skb)
3681{
Vlad Yasevich22061d82012-11-15 08:49:11 +00003682 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003683 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003684 struct list_head *head = &offload_base;
Herbert Xud565b0a2008-12-15 23:38:52 -08003685 int err = -ENOENT;
3686
Eric Dumazetc3c7c252012-12-06 13:54:59 +00003687 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3688
Herbert Xufc59f9a2009-04-14 15:11:06 -07003689 if (NAPI_GRO_CB(skb)->count == 1) {
3690 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003691 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07003692 }
Herbert Xud565b0a2008-12-15 23:38:52 -08003693
3694 rcu_read_lock();
3695 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003696 if (ptype->type != type || !ptype->callbacks.gro_complete)
Herbert Xud565b0a2008-12-15 23:38:52 -08003697 continue;
3698
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003699 err = ptype->callbacks.gro_complete(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003700 break;
3701 }
3702 rcu_read_unlock();
3703
3704 if (err) {
3705 WARN_ON(&ptype->list == head);
3706 kfree_skb(skb);
3707 return NET_RX_SUCCESS;
3708 }
3709
3710out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003711 return netif_receive_skb(skb);
3712}
3713
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003714/* napi->gro_list contains packets ordered by age.
	3715	 * The youngest packets are at its head.
3716 * Complete skbs in reverse order to reduce latencies.
3717 */
3718void napi_gro_flush(struct napi_struct *napi, bool flush_old)
Herbert Xud565b0a2008-12-15 23:38:52 -08003719{
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003720 struct sk_buff *skb, *prev = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003721
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003722 /* scan list and build reverse chain */
3723 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3724 skb->prev = prev;
3725 prev = skb;
Herbert Xud565b0a2008-12-15 23:38:52 -08003726 }
3727
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003728 for (skb = prev; skb; skb = prev) {
3729 skb->next = NULL;
3730
3731 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3732 return;
3733
3734 prev = skb->prev;
3735 napi_gro_complete(skb);
3736 napi->gro_count--;
3737 }
3738
Herbert Xud565b0a2008-12-15 23:38:52 -08003739 napi->gro_list = NULL;
3740}
Eric Dumazet86cac582010-08-31 18:25:32 +00003741EXPORT_SYMBOL(napi_gro_flush);
Herbert Xud565b0a2008-12-15 23:38:52 -08003742
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003743static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3744{
3745 struct sk_buff *p;
3746 unsigned int maclen = skb->dev->hard_header_len;
3747
3748 for (p = napi->gro_list; p; p = p->next) {
3749 unsigned long diffs;
3750
3751 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3752 diffs |= p->vlan_tci ^ skb->vlan_tci;
3753 if (maclen == ETH_HLEN)
3754 diffs |= compare_ether_header(skb_mac_header(p),
3755 skb_gro_mac_header(skb));
3756 else if (!diffs)
3757 diffs = memcmp(skb_mac_header(p),
3758 skb_gro_mac_header(skb),
3759 maclen);
3760 NAPI_GRO_CB(p)->same_flow = !diffs;
3761 NAPI_GRO_CB(p)->flush = 0;
3762 }
3763}
3764
Rami Rosenbb728822012-11-28 21:55:25 +00003765static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003766{
3767 struct sk_buff **pp = NULL;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003768 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003769 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003770 struct list_head *head = &offload_base;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003771 int same_flow;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003772 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003773
Jarek Poplawskice9e76c2010-08-05 01:19:11 +00003774 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
Herbert Xud565b0a2008-12-15 23:38:52 -08003775 goto normal;
3776
David S. Miller21dc3302010-08-23 00:13:46 -07003777 if (skb_is_gso(skb) || skb_has_frag_list(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003778 goto normal;
3779
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003780 gro_list_prepare(napi, skb);
3781
Herbert Xud565b0a2008-12-15 23:38:52 -08003782 rcu_read_lock();
3783 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003784 if (ptype->type != type || !ptype->callbacks.gro_receive)
Herbert Xud565b0a2008-12-15 23:38:52 -08003785 continue;
3786
Herbert Xu86911732009-01-29 14:19:50 +00003787 skb_set_network_header(skb, skb_gro_offset(skb));
Eric Dumazetefd94502013-02-14 17:31:48 +00003788 skb_reset_mac_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003789 NAPI_GRO_CB(skb)->same_flow = 0;
3790 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003791 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003792
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003793 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003794 break;
3795 }
3796 rcu_read_unlock();
3797
3798 if (&ptype->list == head)
3799 goto normal;
3800
Herbert Xu0da2afd52008-12-26 14:57:42 -08003801 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003802 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003803
Herbert Xud565b0a2008-12-15 23:38:52 -08003804 if (pp) {
3805 struct sk_buff *nskb = *pp;
3806
3807 *pp = nskb->next;
3808 nskb->next = NULL;
3809 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003810 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003811 }
3812
Herbert Xu0da2afd52008-12-26 14:57:42 -08003813 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003814 goto ok;
3815
Herbert Xu4ae55442009-02-08 18:00:36 +00003816 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003817 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003818
Herbert Xu4ae55442009-02-08 18:00:36 +00003819 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003820 NAPI_GRO_CB(skb)->count = 1;
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003821 NAPI_GRO_CB(skb)->age = jiffies;
Herbert Xu86911732009-01-29 14:19:50 +00003822 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003823 skb->next = napi->gro_list;
3824 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003825 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003826
Herbert Xuad0f9902009-02-01 01:24:55 -08003827pull:
Herbert Xucb189782009-05-26 18:50:31 +00003828 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3829 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3830
3831 BUG_ON(skb->end - skb->tail < grow);
3832
3833 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3834
3835 skb->tail += grow;
3836 skb->data_len -= grow;
3837
3838 skb_shinfo(skb)->frags[0].page_offset += grow;
Eric Dumazet9e903e02011-10-18 21:00:24 +00003839 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
Herbert Xucb189782009-05-26 18:50:31 +00003840
Eric Dumazet9e903e02011-10-18 21:00:24 +00003841 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
Ian Campbellea2ab692011-08-22 23:44:58 +00003842 skb_frag_unref(skb, 0);
Herbert Xucb189782009-05-26 18:50:31 +00003843 memmove(skb_shinfo(skb)->frags,
3844 skb_shinfo(skb)->frags + 1,
Jarek Poplawskie5093ae2010-08-11 02:02:10 +00003845 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
Herbert Xucb189782009-05-26 18:50:31 +00003846 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003847 }
3848
Herbert Xud565b0a2008-12-15 23:38:52 -08003849ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003850 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003851
3852normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003853 ret = GRO_NORMAL;
3854 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003855}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003856
Herbert Xu96e93ea2009-01-06 10:49:34 -08003857
Rami Rosenbb728822012-11-28 21:55:25 +00003858static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003859{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003860 switch (ret) {
3861 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003862 if (netif_receive_skb(skb))
3863 ret = GRO_DROP;
3864 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003865
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003866 case GRO_DROP:
Herbert Xu5d38a072009-01-04 16:13:40 -08003867 kfree_skb(skb);
3868 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003869
Eric Dumazetdaa86542012-04-19 07:07:40 +00003870 case GRO_MERGED_FREE:
Eric Dumazetd7e88832012-04-30 08:10:34 +00003871 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3872 kmem_cache_free(skbuff_head_cache, skb);
3873 else
3874 __kfree_skb(skb);
Eric Dumazetdaa86542012-04-19 07:07:40 +00003875 break;
3876
Ben Hutchings5b252f02009-10-29 07:17:09 +00003877 case GRO_HELD:
3878 case GRO_MERGED:
3879 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003880 }
3881
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003882 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003883}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003884
Eric Dumazetca07e432012-10-06 22:28:06 +00003885static void skb_gro_reset_offset(struct sk_buff *skb)
Herbert Xu78a478d2009-05-26 18:50:21 +00003886{
Eric Dumazetca07e432012-10-06 22:28:06 +00003887 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3888 const skb_frag_t *frag0 = &pinfo->frags[0];
3889
Herbert Xu78a478d2009-05-26 18:50:21 +00003890 NAPI_GRO_CB(skb)->data_offset = 0;
3891 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003892 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003893
Simon Hormanced14f62013-05-28 20:34:25 +00003894 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
Eric Dumazetca07e432012-10-06 22:28:06 +00003895 pinfo->nr_frags &&
3896 !PageHighMem(skb_frag_page(frag0))) {
3897 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3898 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
Herbert Xu74895942009-05-26 18:50:27 +00003899 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003900}
Herbert Xu78a478d2009-05-26 18:50:21 +00003901
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003902gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003903{
Herbert Xu86911732009-01-29 14:19:50 +00003904 skb_gro_reset_offset(skb);
3905
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003906 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003907}
3908EXPORT_SYMBOL(napi_gro_receive);
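/* Illustrative sketch (not part of this file): the usual way a NAPI driver
 * feeds received packets into GRO from its poll routine. "my_adapter",
 * "my_rx_next_skb" and "my_enable_rx_irq" are hypothetical driver helpers.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct my_adapter *priv = container_of(napi, struct my_adapter, napi);
 *		int done = 0;
 *
 *		while (done < budget) {
 *			struct sk_buff *skb = my_rx_next_skb(priv);
 *
 *			if (!skb)
 *				break;
 *			skb->protocol = eth_type_trans(skb, priv->netdev);
 *			napi_gro_receive(napi, skb);
 *			done++;
 *		}
 *		if (done < budget) {
 *			napi_complete(napi);
 *			my_enable_rx_irq(priv);
 *		}
 *		return done;
 *	}
 */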
3909
stephen hemmingerd0c2b0d2010-10-19 07:12:10 +00003910static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003911{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003912 __skb_pull(skb, skb_headlen(skb));
Eric Dumazet2a2a4592012-03-21 06:58:03 +00003913 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3914 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
Jesse Gross3701e512010-10-20 13:56:06 +00003915 skb->vlan_tci = 0;
Herbert Xu66c46d72011-01-29 20:44:54 -08003916 skb->dev = napi->dev;
Andy Gospodarek6d152e22011-02-02 14:53:25 -08003917 skb->skb_iif = 0;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003918
3919 napi->skb = skb;
3920}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003921
Herbert Xu76620aa2009-04-16 02:02:07 -07003922struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003923{
Herbert Xu5d38a072009-01-04 16:13:40 -08003924 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003925
3926 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003927 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3928 if (skb)
3929 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003930 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003931 return skb;
3932}
Herbert Xu76620aa2009-04-16 02:02:07 -07003933EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003934
Rami Rosenbb728822012-11-28 21:55:25 +00003935static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003936 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003937{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003938 switch (ret) {
3939 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003940 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003941 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003942
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003943 if (ret == GRO_HELD)
3944 skb_gro_pull(skb, -ETH_HLEN);
3945 else if (netif_receive_skb(skb))
3946 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003947 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003948
3949 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003950 case GRO_MERGED_FREE:
3951 napi_reuse_skb(napi, skb);
3952 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003953
3954 case GRO_MERGED:
3955 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003956 }
3957
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003958 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003959}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003960
Eric Dumazet4adb9c42012-05-18 20:49:06 +00003961static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003962{
Herbert Xu76620aa2009-04-16 02:02:07 -07003963 struct sk_buff *skb = napi->skb;
3964 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00003965 unsigned int hlen;
3966 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07003967
3968 napi->skb = NULL;
3969
3970 skb_reset_mac_header(skb);
3971 skb_gro_reset_offset(skb);
3972
Herbert Xua5b1cf22009-05-26 18:50:28 +00003973 off = skb_gro_offset(skb);
3974 hlen = off + sizeof(*eth);
3975 eth = skb_gro_header_fast(skb, off);
3976 if (skb_gro_header_hard(skb, hlen)) {
3977 eth = skb_gro_header_slow(skb, hlen, off);
3978 if (unlikely(!eth)) {
3979 napi_reuse_skb(napi, skb);
3980 skb = NULL;
3981 goto out;
3982 }
Herbert Xu76620aa2009-04-16 02:02:07 -07003983 }
3984
3985 skb_gro_pull(skb, sizeof(*eth));
3986
3987 /*
3988 * This works because the only protocols we care about don't require
3989 * special handling. We'll fix it up properly at the end.
3990 */
3991 skb->protocol = eth->h_proto;
3992
3993out:
3994 return skb;
3995}
Herbert Xu76620aa2009-04-16 02:02:07 -07003996
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003997gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07003998{
3999 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08004000
4001 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004002 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08004003
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004004 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08004005}
4006EXPORT_SYMBOL(napi_gro_frags);
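/* Illustrative sketch (not part of this file): the page-based receive path.
 * A driver that DMAs into pages grabs the preallocated skb with
 * napi_get_frags(), attaches the page and then hands it to napi_gro_frags(),
 * which pulls the Ethernet header and runs GRO. "page", "off" and "len"
 * describe a hypothetical received fragment.
 *
 *	static void my_rx_frag(struct napi_struct *napi, struct page *page,
 *			       unsigned int off, unsigned int len)
 *	{
 *		struct sk_buff *skb = napi_get_frags(napi);
 *
 *		if (unlikely(!skb))
 *			return;		// allocation failed, drop the fragment
 *
 *		skb_add_rx_frag(skb, 0, page, off, len, PAGE_SIZE);
 *		napi_gro_frags(napi);	// consumes or recycles napi->skb
 *	}
 */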
4007
Eric Dumazete326bed2010-04-22 00:22:45 -07004008/*
4009 * net_rps_action sends any pending IPI's for rps.
4010 * Note: called with local irq disabled, but exits with local irq enabled.
4011 */
4012static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4013{
4014#ifdef CONFIG_RPS
4015 struct softnet_data *remsd = sd->rps_ipi_list;
4016
4017 if (remsd) {
4018 sd->rps_ipi_list = NULL;
4019
4020 local_irq_enable();
4021
4022 /* Send pending IPI's to kick RPS processing on remote cpus. */
4023 while (remsd) {
4024 struct softnet_data *next = remsd->rps_ipi_next;
4025
4026 if (cpu_online(remsd->cpu))
4027 __smp_call_function_single(remsd->cpu,
4028 &remsd->csd, 0);
4029 remsd = next;
4030 }
4031 } else
4032#endif
4033 local_irq_enable();
4034}
4035
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004036static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004037{
4038 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004039 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004040
Eric Dumazete326bed2010-04-22 00:22:45 -07004041#ifdef CONFIG_RPS
4042	/* Check if we have pending IPIs; it's better to send them now
4043	 * than to wait until net_rx_action() ends.
4044 */
4045 if (sd->rps_ipi_list) {
4046 local_irq_disable();
4047 net_rps_action_and_irq_enable(sd);
4048 }
4049#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004050 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07004051 local_irq_disable();
4052 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004053 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07004054 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004055
Changli Gao6e7676c2010-04-27 15:07:33 -07004056 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07004057 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07004058 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07004059 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00004060 input_queue_head_incr(sd);
4061 if (++work >= quota) {
4062 local_irq_enable();
4063 return work;
4064 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004065 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004066
Changli Gao6e7676c2010-04-27 15:07:33 -07004067 rps_lock(sd);
4068 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004069 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07004070 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4071 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004072
Changli Gao6e7676c2010-04-27 15:07:33 -07004073 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004074 /*
4075 * Inline a custom version of __napi_complete().
4076		 * Only the current cpu owns and manipulates this napi,
4077		 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4078		 * We can use a plain write instead of clear_bit(),
4079		 * and we don't need an smp_mb() memory barrier.
4080 */
4081 list_del(&napi->poll_list);
4082 napi->state = 0;
4083
Changli Gao6e7676c2010-04-27 15:07:33 -07004084 quota = work + qlen;
4085 }
4086 rps_unlock(sd);
4087 }
4088 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004089
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004090 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004091}
4092
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004093/**
4094 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004095 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004096 *
4097 * The entry's receive function will be scheduled to run
4098 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08004099void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004100{
4101 unsigned long flags;
4102
4103 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004104 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004105 local_irq_restore(flags);
4106}
4107EXPORT_SYMBOL(__napi_schedule);
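/* Illustrative sketch (not part of this file): a device interrupt handler
 * scheduling its NAPI instance. napi_schedule_prep() only succeeds for the
 * caller that wins NAPI_STATE_SCHED, so the poll routine is scheduled at
 * most once. "my_adapter" and "my_disable_rx_irq" are hypothetical helpers.
 *
 *	static irqreturn_t my_intr(int irq, void *dev_id)
 *	{
 *		struct my_adapter *priv = dev_id;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			my_disable_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */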
4108
Herbert Xud565b0a2008-12-15 23:38:52 -08004109void __napi_complete(struct napi_struct *n)
4110{
4111 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4112 BUG_ON(n->gro_list);
4113
4114 list_del(&n->poll_list);
4115 smp_mb__before_clear_bit();
4116 clear_bit(NAPI_STATE_SCHED, &n->state);
4117}
4118EXPORT_SYMBOL(__napi_complete);
4119
4120void napi_complete(struct napi_struct *n)
4121{
4122 unsigned long flags;
4123
4124 /*
4125 * don't let napi dequeue from the cpu poll list
4126	 * just in case it's running on a different cpu
4127 */
4128 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4129 return;
4130
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004131 napi_gro_flush(n, false);
Herbert Xud565b0a2008-12-15 23:38:52 -08004132 local_irq_save(flags);
4133 __napi_complete(n);
4134 local_irq_restore(flags);
4135}
4136EXPORT_SYMBOL(napi_complete);
4137
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004138/* must be called under rcu_read_lock(), as we don't take a reference */
4139struct napi_struct *napi_by_id(unsigned int napi_id)
4140{
4141 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4142 struct napi_struct *napi;
4143
4144 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4145 if (napi->napi_id == napi_id)
4146 return napi;
4147
4148 return NULL;
4149}
4150EXPORT_SYMBOL_GPL(napi_by_id);
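/* Illustrative sketch (not part of this file): looking up a napi instance by
 * id. Because no reference is taken, the returned pointer is only valid
 * inside the RCU read-side critical section.
 *
 *	rcu_read_lock();
 *	napi = napi_by_id(napi_id);
 *	if (napi)
 *		...	// use napi here; do not keep the pointer after unlock
 *	rcu_read_unlock();
 */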
4151
4152void napi_hash_add(struct napi_struct *napi)
4153{
4154 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4155
4156 spin_lock(&napi_hash_lock);
4157
4158		/* 0 is not a valid id and we also skip an id that is already taken;
4159		 * we expect both events to be extremely rare.
4160 */
4161 napi->napi_id = 0;
4162 while (!napi->napi_id) {
4163 napi->napi_id = ++napi_gen_id;
4164 if (napi_by_id(napi->napi_id))
4165 napi->napi_id = 0;
4166 }
4167
4168 hlist_add_head_rcu(&napi->napi_hash_node,
4169 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4170
4171 spin_unlock(&napi_hash_lock);
4172 }
4173}
4174EXPORT_SYMBOL_GPL(napi_hash_add);
4175
4176/* Warning: the caller is responsible for making sure an rcu grace period
4177 * is respected before freeing the memory containing @napi
4178 */
4179void napi_hash_del(struct napi_struct *napi)
4180{
4181 spin_lock(&napi_hash_lock);
4182
4183 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4184 hlist_del_rcu(&napi->napi_hash_node);
4185
4186 spin_unlock(&napi_hash_lock);
4187}
4188EXPORT_SYMBOL_GPL(napi_hash_del);
4189
Herbert Xud565b0a2008-12-15 23:38:52 -08004190void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4191 int (*poll)(struct napi_struct *, int), int weight)
4192{
4193 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00004194 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004195 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08004196 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08004197 napi->poll = poll;
Eric Dumazet82dc3c62013-03-05 15:57:22 +00004198 if (weight > NAPI_POLL_WEIGHT)
4199 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4200 weight, dev->name);
Herbert Xud565b0a2008-12-15 23:38:52 -08004201 napi->weight = weight;
4202 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08004203 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08004204#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08004205 spin_lock_init(&napi->poll_lock);
4206 napi->poll_owner = -1;
4207#endif
4208 set_bit(NAPI_STATE_SCHED, &napi->state);
4209}
4210EXPORT_SYMBOL(netif_napi_add);
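/* Illustrative sketch (not part of this file): typical NAPI lifetime in a
 * driver. "priv" and "my_poll" are hypothetical; the weight should not
 * exceed NAPI_POLL_WEIGHT.
 *
 *	netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);	// probe
 *	napi_enable(&priv->napi);		// ndo_open
 *	...
 *	napi_disable(&priv->napi);		// ndo_stop
 *	netif_napi_del(&priv->napi);		// before free_netdev()
 */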
4211
4212void netif_napi_del(struct napi_struct *napi)
4213{
4214 struct sk_buff *skb, *next;
4215
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08004216 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07004217 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08004218
4219 for (skb = napi->gro_list; skb; skb = next) {
4220 next = skb->next;
4221 skb->next = NULL;
4222 kfree_skb(skb);
4223 }
4224
4225 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00004226 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004227}
4228EXPORT_SYMBOL(netif_napi_del);
4229
Linus Torvalds1da177e2005-04-16 15:20:36 -07004230static void net_rx_action(struct softirq_action *h)
4231{
Eric Dumazete326bed2010-04-22 00:22:45 -07004232 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004233 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07004234 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07004235 void *have;
4236
Linus Torvalds1da177e2005-04-16 15:20:36 -07004237 local_irq_disable();
4238
Eric Dumazete326bed2010-04-22 00:22:45 -07004239 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004240 struct napi_struct *n;
4241 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004242
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004243		/* If the softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004244		 * Allow this to run for 2 jiffies, which allows
4245 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004246 */
Eric Dumazetd1f41b62013-03-05 07:15:13 +00004247 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004248 goto softnet_break;
4249
4250 local_irq_enable();
4251
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004252 /* Even though interrupts have been re-enabled, this
4253 * access is safe because interrupts can only add new
4254 * entries to the tail of this list, and only ->poll()
4255 * calls can remove this head entry from the list.
4256 */
Eric Dumazete326bed2010-04-22 00:22:45 -07004257 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004258
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004259 have = netpoll_poll_lock(n);
4260
4261 weight = n->weight;
4262
David S. Miller0a7606c2007-10-29 21:28:47 -07004263 /* This NAPI_STATE_SCHED test is for avoiding a race
4264 * with netpoll's poll_napi(). Only the entity which
4265 * obtains the lock and sees NAPI_STATE_SCHED set will
4266 * actually make the ->poll() call. Therefore we avoid
Lucas De Marchi25985ed2011-03-30 22:57:33 -03004267 * accidentally calling ->poll() when NAPI is not scheduled.
David S. Miller0a7606c2007-10-29 21:28:47 -07004268 */
4269 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00004270 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07004271 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00004272 trace_napi_poll(n);
4273 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004274
4275 WARN_ON_ONCE(work > weight);
4276
4277 budget -= work;
4278
4279 local_irq_disable();
4280
4281 /* Drivers must not modify the NAPI state if they
4282 * consume the entire weight. In such cases this code
4283 * still "owns" the NAPI instance and therefore can
4284 * move the instance around on the list at-will.
4285 */
David S. Millerfed17f32008-01-07 21:00:40 -08004286 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07004287 if (unlikely(napi_disable_pending(n))) {
4288 local_irq_enable();
4289 napi_complete(n);
4290 local_irq_disable();
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004291 } else {
4292 if (n->gro_list) {
4293				/* flush packets that are too old.
4294 * If HZ < 1000, flush all packets.
4295 */
4296 local_irq_enable();
4297 napi_gro_flush(n, HZ >= 1000);
4298 local_irq_disable();
4299 }
Eric Dumazete326bed2010-04-22 00:22:45 -07004300 list_move_tail(&n->poll_list, &sd->poll_list);
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004301 }
David S. Millerfed17f32008-01-07 21:00:40 -08004302 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004303
4304 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004305 }
4306out:
Eric Dumazete326bed2010-04-22 00:22:45 -07004307 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00004308
Chris Leechdb217332006-06-17 21:24:58 -07004309#ifdef CONFIG_NET_DMA
4310 /*
4311 * There may not be any more sk_buffs coming right now, so push
4312 * any pending DMA copies to hardware
4313 */
Dan Williams2ba05622009-01-06 11:38:14 -07004314 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07004315#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004316
Linus Torvalds1da177e2005-04-16 15:20:36 -07004317 return;
4318
4319softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00004320 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004321 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4322 goto out;
4323}
4324
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004325struct netdev_upper {
4326 struct net_device *dev;
4327 bool master;
4328 struct list_head list;
4329 struct rcu_head rcu;
4330 struct list_head search_list;
4331};
4332
4333static void __append_search_uppers(struct list_head *search_list,
4334 struct net_device *dev)
4335{
4336 struct netdev_upper *upper;
4337
4338 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4339 /* check if this upper is not already in search list */
4340 if (list_empty(&upper->search_list))
4341 list_add_tail(&upper->search_list, search_list);
4342 }
4343}
4344
4345static bool __netdev_search_upper_dev(struct net_device *dev,
4346 struct net_device *upper_dev)
4347{
4348 LIST_HEAD(search_list);
4349 struct netdev_upper *upper;
4350 struct netdev_upper *tmp;
4351 bool ret = false;
4352
4353 __append_search_uppers(&search_list, dev);
4354 list_for_each_entry(upper, &search_list, search_list) {
4355 if (upper->dev == upper_dev) {
4356 ret = true;
4357 break;
4358 }
4359 __append_search_uppers(&search_list, upper->dev);
4360 }
4361 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4362 INIT_LIST_HEAD(&upper->search_list);
4363 return ret;
4364}
4365
4366static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4367 struct net_device *upper_dev)
4368{
4369 struct netdev_upper *upper;
4370
4371 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4372 if (upper->dev == upper_dev)
4373 return upper;
4374 }
4375 return NULL;
4376}
4377
4378/**
4379 * netdev_has_upper_dev - Check if device is linked to an upper device
4380 * @dev: device
4381 * @upper_dev: upper device to check
4382 *
4383 * Find out if a device is linked to specified upper device and return true
4384 * in case it is. Note that this checks only immediate upper device,
4385 * not through a complete stack of devices. The caller must hold the RTNL lock.
4386 */
4387bool netdev_has_upper_dev(struct net_device *dev,
4388 struct net_device *upper_dev)
4389{
4390 ASSERT_RTNL();
4391
4392 return __netdev_find_upper(dev, upper_dev);
4393}
4394EXPORT_SYMBOL(netdev_has_upper_dev);
4395
4396/**
4397 * netdev_has_any_upper_dev - Check if device is linked to some device
4398 * @dev: device
4399 *
4400 * Find out if a device is linked to an upper device and return true in case
4401 * it is. The caller must hold the RTNL lock.
4402 */
4403bool netdev_has_any_upper_dev(struct net_device *dev)
4404{
4405 ASSERT_RTNL();
4406
4407 return !list_empty(&dev->upper_dev_list);
4408}
4409EXPORT_SYMBOL(netdev_has_any_upper_dev);
4410
4411/**
4412 * netdev_master_upper_dev_get - Get master upper device
4413 * @dev: device
4414 *
4415 * Find a master upper device and return pointer to it or NULL in case
4416 * it's not there. The caller must hold the RTNL lock.
4417 */
4418struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4419{
4420 struct netdev_upper *upper;
4421
4422 ASSERT_RTNL();
4423
4424 if (list_empty(&dev->upper_dev_list))
4425 return NULL;
4426
4427 upper = list_first_entry(&dev->upper_dev_list,
4428 struct netdev_upper, list);
4429 if (likely(upper->master))
4430 return upper->dev;
4431 return NULL;
4432}
4433EXPORT_SYMBOL(netdev_master_upper_dev_get);
4434
4435/**
4436 * netdev_master_upper_dev_get_rcu - Get master upper device
4437 * @dev: device
4438 *
4439 * Find a master upper device and return pointer to it or NULL in case
4440 * it's not there. The caller must hold the RCU read lock.
4441 */
4442struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4443{
4444 struct netdev_upper *upper;
4445
4446 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4447 struct netdev_upper, list);
4448 if (upper && likely(upper->master))
4449 return upper->dev;
4450 return NULL;
4451}
4452EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4453
4454static int __netdev_upper_dev_link(struct net_device *dev,
4455 struct net_device *upper_dev, bool master)
4456{
4457 struct netdev_upper *upper;
4458
4459 ASSERT_RTNL();
4460
4461 if (dev == upper_dev)
4462 return -EBUSY;
4463
4464	/* To prevent loops, check that dev is not an upper device of upper_dev. */
4465 if (__netdev_search_upper_dev(upper_dev, dev))
4466 return -EBUSY;
4467
4468 if (__netdev_find_upper(dev, upper_dev))
4469 return -EEXIST;
4470
4471 if (master && netdev_master_upper_dev_get(dev))
4472 return -EBUSY;
4473
4474 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4475 if (!upper)
4476 return -ENOMEM;
4477
4478 upper->dev = upper_dev;
4479 upper->master = master;
4480 INIT_LIST_HEAD(&upper->search_list);
4481
4482 /* Ensure that master upper link is always the first item in list. */
4483 if (master)
4484 list_add_rcu(&upper->list, &dev->upper_dev_list);
4485 else
4486 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4487 dev_hold(upper_dev);
Jiri Pirko42e52bf2013-05-25 04:12:10 +00004488 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004489 return 0;
4490}
4491
4492/**
4493 * netdev_upper_dev_link - Add a link to the upper device
4494 * @dev: device
4495 * @upper_dev: new upper device
4496 *
4497 * Adds a link to device which is upper to this one. The caller must hold
4498 * the RTNL lock. On a failure a negative errno code is returned.
4499 * On success the reference counts are adjusted and the function
4500 * returns zero.
4501 */
4502int netdev_upper_dev_link(struct net_device *dev,
4503 struct net_device *upper_dev)
4504{
4505 return __netdev_upper_dev_link(dev, upper_dev, false);
4506}
4507EXPORT_SYMBOL(netdev_upper_dev_link);
4508
4509/**
4510 * netdev_master_upper_dev_link - Add a master link to the upper device
4511 * @dev: device
4512 * @upper_dev: new upper device
4513 *
4514 * Adds a link to device which is upper to this one. In this case, only
4515 * one master upper device can be linked, although other non-master devices
4516 * might be linked as well. The caller must hold the RTNL lock.
4517 * On a failure a negative errno code is returned. On success the reference
4518 * counts are adjusted and the function returns zero.
4519 */
4520int netdev_master_upper_dev_link(struct net_device *dev,
4521 struct net_device *upper_dev)
4522{
4523 return __netdev_upper_dev_link(dev, upper_dev, true);
4524}
4525EXPORT_SYMBOL(netdev_master_upper_dev_link);
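/* Illustrative sketch (not part of this file): how a master device such as a
 * bond or bridge would enslave a lower device, typically from its
 * ndo_add_slave()/ndo_del_slave() handlers, which already run under RTNL.
 * "master_dev" and "slave_dev" are hypothetical.
 *
 *	ASSERT_RTNL();
 *	err = netdev_master_upper_dev_link(slave_dev, master_dev);
 *	if (err)
 *		return err;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, master_dev);		// on teardown
 */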
4526
4527/**
4528 * netdev_upper_dev_unlink - Removes a link to upper device
4529 * @dev: device
4530 * @upper_dev: new upper device
4531 *
4532 * Removes a link to device which is upper to this one. The caller must hold
4533 * the RTNL lock.
4534 */
4535void netdev_upper_dev_unlink(struct net_device *dev,
4536 struct net_device *upper_dev)
4537{
4538 struct netdev_upper *upper;
4539
4540 ASSERT_RTNL();
4541
4542 upper = __netdev_find_upper(dev, upper_dev);
4543 if (!upper)
4544 return;
4545 list_del_rcu(&upper->list);
4546 dev_put(upper_dev);
4547 kfree_rcu(upper, rcu);
Jiri Pirko42e52bf2013-05-25 04:12:10 +00004548 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004549}
4550EXPORT_SYMBOL(netdev_upper_dev_unlink);
4551
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004552static void dev_change_rx_flags(struct net_device *dev, int flags)
4553{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004554 const struct net_device_ops *ops = dev->netdev_ops;
4555
4556 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4557 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004558}
4559
Wang Chendad9b332008-06-18 01:48:28 -07004560static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07004561{
Eric Dumazetb536db92011-11-30 21:42:26 +00004562 unsigned int old_flags = dev->flags;
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06004563 kuid_t uid;
4564 kgid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07004565
Patrick McHardy24023452007-07-14 18:51:31 -07004566 ASSERT_RTNL();
4567
Wang Chendad9b332008-06-18 01:48:28 -07004568 dev->flags |= IFF_PROMISC;
4569 dev->promiscuity += inc;
4570 if (dev->promiscuity == 0) {
4571 /*
4572 * Avoid overflow.
4573 * If inc causes overflow, untouch promisc and return error.
4574 */
4575 if (inc < 0)
4576 dev->flags &= ~IFF_PROMISC;
4577 else {
4578 dev->promiscuity -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004579 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4580 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07004581 return -EOVERFLOW;
4582 }
4583 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004584 if (dev->flags != old_flags) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004585 pr_info("device %s %s promiscuous mode\n",
4586 dev->name,
4587 dev->flags & IFF_PROMISC ? "entered" : "left");
David Howells8192b0c2008-11-14 10:39:10 +11004588 if (audit_enabled) {
4589 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004590 audit_log(current->audit_context, GFP_ATOMIC,
4591 AUDIT_ANOM_PROMISCUOUS,
4592 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4593 dev->name, (dev->flags & IFF_PROMISC),
4594 (old_flags & IFF_PROMISC),
Eric W. Biedermane1760bd2012-09-10 22:39:43 -07004595 from_kuid(&init_user_ns, audit_get_loginuid(current)),
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06004596 from_kuid(&init_user_ns, uid),
4597 from_kgid(&init_user_ns, gid),
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004598 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11004599 }
Patrick McHardy24023452007-07-14 18:51:31 -07004600
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004601 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07004602 }
Wang Chendad9b332008-06-18 01:48:28 -07004603 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004604}
4605
Linus Torvalds1da177e2005-04-16 15:20:36 -07004606/**
4607 * dev_set_promiscuity - update promiscuity count on a device
4608 * @dev: device
4609 * @inc: modifier
4610 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07004611 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004612 * remains above zero the interface remains promiscuous. Once it hits zero
4613 * the device reverts back to normal filtering operation. A negative inc
4614 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07004615 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004616 */
Wang Chendad9b332008-06-18 01:48:28 -07004617int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004618{
Eric Dumazetb536db92011-11-30 21:42:26 +00004619 unsigned int old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07004620 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004621
Wang Chendad9b332008-06-18 01:48:28 -07004622 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07004623 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07004624 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07004625 if (dev->flags != old_flags)
4626 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07004627 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004628}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004629EXPORT_SYMBOL(dev_set_promiscuity);
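/* Illustrative sketch (not part of this file): a kernel user (e.g. a packet
 * capture socket) taking and later releasing one promiscuity reference.
 * The RTNL lock must be held around each call.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// enter promiscuous mode
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		// drop our reference
 *	rtnl_unlock();
 */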
Linus Torvalds1da177e2005-04-16 15:20:36 -07004630
4631/**
4632 * dev_set_allmulti - update allmulti count on a device
4633 * @dev: device
4634 * @inc: modifier
4635 *
4636 * Add or remove reception of all multicast frames to a device. While the
4637 * count in the device remains above zero the interface keeps listening
4638 * to all multicast frames. Once it hits zero the device reverts back to normal
4639 * filtering operation. A negative @inc value is used to drop the counter
4640 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07004641 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004642 */
4643
Wang Chendad9b332008-06-18 01:48:28 -07004644int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004645{
Eric Dumazetb536db92011-11-30 21:42:26 +00004646 unsigned int old_flags = dev->flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004647
Patrick McHardy24023452007-07-14 18:51:31 -07004648 ASSERT_RTNL();
4649
Linus Torvalds1da177e2005-04-16 15:20:36 -07004650 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07004651 dev->allmulti += inc;
4652 if (dev->allmulti == 0) {
4653 /*
4654 * Avoid overflow.
4655 * If inc causes overflow, untouch allmulti and return error.
4656 */
4657 if (inc < 0)
4658 dev->flags &= ~IFF_ALLMULTI;
4659 else {
4660 dev->allmulti -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004661 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4662 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07004663 return -EOVERFLOW;
4664 }
4665 }
Patrick McHardy24023452007-07-14 18:51:31 -07004666 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004667 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07004668 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07004669 }
Wang Chendad9b332008-06-18 01:48:28 -07004670 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004671}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004672EXPORT_SYMBOL(dev_set_allmulti);
Patrick McHardy4417da62007-06-27 01:28:10 -07004673
4674/*
4675 * Upload unicast and multicast address lists to device and
4676 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08004677 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07004678 * are present.
4679 */
4680void __dev_set_rx_mode(struct net_device *dev)
4681{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004682 const struct net_device_ops *ops = dev->netdev_ops;
4683
Patrick McHardy4417da62007-06-27 01:28:10 -07004684 /* dev_open will call this function so the list will stay sane. */
4685 if (!(dev->flags&IFF_UP))
4686 return;
4687
4688 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09004689 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07004690
Jiri Pirko01789342011-08-16 06:29:00 +00004691 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004692		/* Unicast address changes may only happen under the rtnl,
4693 * therefore calling __dev_set_promiscuity here is safe.
4694 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004695 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004696 __dev_set_promiscuity(dev, 1);
Joe Perches2d348d12011-07-25 16:17:35 -07004697 dev->uc_promisc = true;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004698 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004699 __dev_set_promiscuity(dev, -1);
Joe Perches2d348d12011-07-25 16:17:35 -07004700 dev->uc_promisc = false;
Patrick McHardy4417da62007-06-27 01:28:10 -07004701 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004702 }
Jiri Pirko01789342011-08-16 06:29:00 +00004703
4704 if (ops->ndo_set_rx_mode)
4705 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004706}
4707
4708void dev_set_rx_mode(struct net_device *dev)
4709{
David S. Millerb9e40852008-07-15 00:15:08 -07004710 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004711 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07004712 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004713}
4714
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004715/**
4716 * dev_get_flags - get flags reported to userspace
4717 * @dev: device
4718 *
4719 * Get the combination of flag bits exported through APIs to userspace.
4720 */
Eric Dumazet95c96172012-04-15 05:58:06 +00004721unsigned int dev_get_flags(const struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004722{
Eric Dumazet95c96172012-04-15 05:58:06 +00004723 unsigned int flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004724
4725 flags = (dev->flags & ~(IFF_PROMISC |
4726 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08004727 IFF_RUNNING |
4728 IFF_LOWER_UP |
4729 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07004730 (dev->gflags & (IFF_PROMISC |
4731 IFF_ALLMULTI));
4732
Stefan Rompfb00055a2006-03-20 17:09:11 -08004733 if (netif_running(dev)) {
4734 if (netif_oper_up(dev))
4735 flags |= IFF_RUNNING;
4736 if (netif_carrier_ok(dev))
4737 flags |= IFF_LOWER_UP;
4738 if (netif_dormant(dev))
4739 flags |= IFF_DORMANT;
4740 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004741
4742 return flags;
4743}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004744EXPORT_SYMBOL(dev_get_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004745
Patrick McHardybd380812010-02-26 06:34:53 +00004746int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004747{
Eric Dumazetb536db92011-11-30 21:42:26 +00004748 unsigned int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004749 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004750
Patrick McHardy24023452007-07-14 18:51:31 -07004751 ASSERT_RTNL();
4752
Linus Torvalds1da177e2005-04-16 15:20:36 -07004753 /*
4754 * Set the flags on our device.
4755 */
4756
4757 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4758 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4759 IFF_AUTOMEDIA)) |
4760 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4761 IFF_ALLMULTI));
4762
4763 /*
4764 * Load in the correct multicast list now the flags have changed.
4765 */
4766
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004767 if ((old_flags ^ flags) & IFF_MULTICAST)
4768 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07004769
Patrick McHardy4417da62007-06-27 01:28:10 -07004770 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004771
4772 /*
4773	 * Have we downed the interface? We handle IFF_UP ourselves
4774 * according to user attempts to set it, rather than blindly
4775 * setting it.
4776 */
4777
4778 ret = 0;
4779 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00004780 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004781
4782 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07004783 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004784 }
4785
Linus Torvalds1da177e2005-04-16 15:20:36 -07004786 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004787 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4788
Linus Torvalds1da177e2005-04-16 15:20:36 -07004789 dev->gflags ^= IFF_PROMISC;
4790 dev_set_promiscuity(dev, inc);
4791 }
4792
4793 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4794	   is important. Some (broken) drivers set IFF_PROMISC when
4795	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4796 */
4797 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004798 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4799
Linus Torvalds1da177e2005-04-16 15:20:36 -07004800 dev->gflags ^= IFF_ALLMULTI;
4801 dev_set_allmulti(dev, inc);
4802 }
4803
Patrick McHardybd380812010-02-26 06:34:53 +00004804 return ret;
4805}
4806
4807void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4808{
4809 unsigned int changes = dev->flags ^ old_flags;
4810
4811 if (changes & IFF_UP) {
4812 if (dev->flags & IFF_UP)
4813 call_netdevice_notifiers(NETDEV_UP, dev);
4814 else
4815 call_netdevice_notifiers(NETDEV_DOWN, dev);
4816 }
4817
4818 if (dev->flags & IFF_UP &&
Jiri Pirkobe9efd32013-05-28 01:30:22 +00004819 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
4820 struct netdev_notifier_change_info change_info;
4821
4822 change_info.flags_changed = changes;
4823 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
4824 &change_info.info);
4825 }
Patrick McHardybd380812010-02-26 06:34:53 +00004826}
4827
4828/**
4829 * dev_change_flags - change device settings
4830 * @dev: device
4831 * @flags: device state flags
4832 *
4833 *	Change settings on a device based on the given state flags. The flags are
4834 * in the userspace exported format.
4835 */
Eric Dumazetb536db92011-11-30 21:42:26 +00004836int dev_change_flags(struct net_device *dev, unsigned int flags)
Patrick McHardybd380812010-02-26 06:34:53 +00004837{
Eric Dumazetb536db92011-11-30 21:42:26 +00004838 int ret;
4839 unsigned int changes, old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004840
4841 ret = __dev_change_flags(dev, flags);
4842 if (ret < 0)
4843 return ret;
4844
4845 changes = old_flags ^ dev->flags;
Thomas Graf7c355f52007-06-05 16:03:03 -07004846 if (changes)
4847 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004848
Patrick McHardybd380812010-02-26 06:34:53 +00004849 __dev_notify_flags(dev, old_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004850 return ret;
4851}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004852EXPORT_SYMBOL(dev_change_flags);
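/* Illustrative sketch (not part of this file): bringing an interface up from
 * kernel code by setting IFF_UP through dev_change_flags() under RTNL.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */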
Linus Torvalds1da177e2005-04-16 15:20:36 -07004853
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004854/**
4855 * dev_set_mtu - Change maximum transfer unit
4856 * @dev: device
4857 * @new_mtu: new transfer unit
4858 *
4859 * Change the maximum transfer size of the network device.
4860 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004861int dev_set_mtu(struct net_device *dev, int new_mtu)
4862{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004863 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004864 int err;
4865
4866 if (new_mtu == dev->mtu)
4867 return 0;
4868
4869 /* MTU must be positive. */
4870 if (new_mtu < 0)
4871 return -EINVAL;
4872
4873 if (!netif_device_present(dev))
4874 return -ENODEV;
4875
4876 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004877 if (ops->ndo_change_mtu)
4878 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004879 else
4880 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004881
Jiri Pirkoe3d8fab2012-12-03 01:16:32 +00004882 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004883 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004884 return err;
4885}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004886EXPORT_SYMBOL(dev_set_mtu);
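/* Illustrative sketch (not part of this file): changing the MTU from kernel
 * code. Callers are expected to hold RTNL; the 9000-byte value is just an
 * example jumbo-frame size.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 */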
Linus Torvalds1da177e2005-04-16 15:20:36 -07004887
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004888/**
Vlad Dogarucbda10f2011-01-13 23:38:30 +00004889 * dev_set_group - Change group this device belongs to
4890 * @dev: device
4891 * @new_group: group this device should belong to
4892 */
4893void dev_set_group(struct net_device *dev, int new_group)
4894{
4895 dev->group = new_group;
4896}
4897EXPORT_SYMBOL(dev_set_group);
4898
4899/**
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004900 * dev_set_mac_address - Change Media Access Control Address
4901 * @dev: device
4902 * @sa: new address
4903 *
4904 * Change the hardware (MAC) address of the device
4905 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004906int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4907{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004908 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004909 int err;
4910
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004911 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004912 return -EOPNOTSUPP;
4913 if (sa->sa_family != dev->type)
4914 return -EINVAL;
4915 if (!netif_device_present(dev))
4916 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004917 err = ops->ndo_set_mac_address(dev, sa);
Jiri Pirkof6521512013-01-01 03:30:14 +00004918 if (err)
4919 return err;
Jiri Pirkofbdeca22013-01-01 03:30:16 +00004920 dev->addr_assign_type = NET_ADDR_SET;
Jiri Pirkof6521512013-01-01 03:30:14 +00004921 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04004922 add_device_randomness(dev->dev_addr, dev->addr_len);
Jiri Pirkof6521512013-01-01 03:30:14 +00004923 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004924}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004925EXPORT_SYMBOL(dev_set_mac_address);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004926
Jiri Pirko4bf84c32012-12-27 23:49:37 +00004927/**
4928 * dev_change_carrier - Change device carrier
4929 * @dev: device
Randy Dunlap691b3b72013-03-04 12:32:43 +00004930 * @new_carrier: new value
Jiri Pirko4bf84c32012-12-27 23:49:37 +00004931 *
4932 * Change device carrier
4933 */
4934int dev_change_carrier(struct net_device *dev, bool new_carrier)
4935{
4936 const struct net_device_ops *ops = dev->netdev_ops;
4937
4938 if (!ops->ndo_change_carrier)
4939 return -EOPNOTSUPP;
4940 if (!netif_device_present(dev))
4941 return -ENODEV;
4942 return ops->ndo_change_carrier(dev, new_carrier);
4943}
4944EXPORT_SYMBOL(dev_change_carrier);
4945
Linus Torvalds1da177e2005-04-16 15:20:36 -07004946/**
4947 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004948 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004949 *
4950 * Returns a suitable unique value for a new device interface
4951 * number. The caller must hold the rtnl semaphore or the
4952 * dev_base_lock to be sure it remains unique.
4953 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07004954static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004955{
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00004956 int ifindex = net->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004957 for (;;) {
4958 if (++ifindex <= 0)
4959 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004960 if (!__dev_get_by_index(net, ifindex))
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00004961 return net->ifindex = ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004962 }
4963}
4964
Linus Torvalds1da177e2005-04-16 15:20:36 -07004965/* Delayed registration/unregisteration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08004966static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004967
Stephen Hemminger6f05f622007-03-08 20:46:03 -08004968static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004969{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004970 list_add_tail(&dev->todo_list, &net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004971}
4972
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004973static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004974{
Krishna Kumare93737b2009-12-08 22:26:02 +00004975 struct net_device *dev, *tmp;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004976
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004977 BUG_ON(dev_boot_phase);
4978 ASSERT_RTNL();
4979
Krishna Kumare93737b2009-12-08 22:26:02 +00004980 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004981		/* Some devices call this without having registered,
Krishna Kumare93737b2009-12-08 22:26:02 +00004982		 * in order to unwind a failed initialization. Remove those
4983		 * devices and proceed with the remaining ones.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004984 */
4985 if (dev->reg_state == NETREG_UNINITIALIZED) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004986 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4987 dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004988
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004989 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00004990 list_del(&dev->unreg_list);
4991 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004992 }
Eric Dumazet449f4542011-05-19 12:24:16 +00004993 dev->dismantle = true;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004994 BUG_ON(dev->reg_state != NETREG_REGISTERED);
Octavian Purdila44345722010-12-13 12:44:07 +00004995 }
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004996
Octavian Purdila44345722010-12-13 12:44:07 +00004997 /* If device is running, close it first. */
4998 dev_close_many(head);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004999
Octavian Purdila44345722010-12-13 12:44:07 +00005000 list_for_each_entry(dev, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005001 /* And unlink it from device chain. */
5002 unlist_netdevice(dev);
5003
5004 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005005 }
5006
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005007 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005008
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005009 list_for_each_entry(dev, head, unreg_list) {
5010 /* Shutdown queueing discipline. */
5011 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005012
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005013
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005014		/* Notify protocols that we are about to destroy
5015		   this device. They should clean up all of their state.
5016 */
5017 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5018
Patrick McHardya2835762010-02-26 06:34:51 +00005019 if (!dev->rtnl_link_ops ||
5020 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5021 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5022
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005023 /*
5024 * Flush the unicast and multicast chains
5025 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005026 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005027 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005028
5029 if (dev->netdev_ops->ndo_uninit)
5030 dev->netdev_ops->ndo_uninit(dev);
5031
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005032 /* Notifier chain MUST detach us all upper devices. */
5033 WARN_ON(netdev_has_any_upper_dev(dev));
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005034
5035 /* Remove entries from kobject tree */
5036 netdev_unregister_kobject(dev);
Alexander Duyck024e9672013-01-10 08:57:46 +00005037#ifdef CONFIG_XPS
5038 /* Remove XPS queueing entries */
5039 netif_reset_xps_queues_gt(dev, 0);
5040#endif
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005041 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005042
Eric W. Biederman850a5452011-10-13 22:25:23 +00005043 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005044
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005045 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005046 dev_put(dev);
5047}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005048
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005049static void rollback_registered(struct net_device *dev)
5050{
5051 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005052
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005053 list_add(&dev->unreg_list, &single);
5054 rollback_registered_many(&single);
Eric Dumazetceaaec92011-02-17 22:59:19 +00005055 list_del(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005056}
5057
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005058static netdev_features_t netdev_fix_features(struct net_device *dev,
5059 netdev_features_t features)
Herbert Xub63365a2008-10-23 01:11:29 -07005060{
Michał Mirosław57422dc2011-01-22 12:14:12 +00005061 /* Fix illegal checksum combinations */
5062 if ((features & NETIF_F_HW_CSUM) &&
5063 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005064 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
Michał Mirosław57422dc2011-01-22 12:14:12 +00005065 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5066 }
5067
Herbert Xub63365a2008-10-23 01:11:29 -07005068 /* TSO requires that SG is present as well. */
Ben Hutchingsea2d3682011-04-12 14:38:37 +00005069 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005070 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
Ben Hutchingsea2d3682011-04-12 14:38:37 +00005071 features &= ~NETIF_F_ALL_TSO;
Herbert Xub63365a2008-10-23 01:11:29 -07005072 }
5073
Pravin B Shelarec5f0612013-03-07 09:28:01 +00005074 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5075 !(features & NETIF_F_IP_CSUM)) {
5076 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5077 features &= ~NETIF_F_TSO;
5078 features &= ~NETIF_F_TSO_ECN;
5079 }
5080
5081 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5082 !(features & NETIF_F_IPV6_CSUM)) {
5083 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5084 features &= ~NETIF_F_TSO6;
5085 }
5086
Ben Hutchings31d8b9e2011-04-12 14:47:15 +00005087 /* TSO ECN requires that TSO is present as well. */
5088 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5089 features &= ~NETIF_F_TSO_ECN;
5090
Michał Mirosław212b5732011-02-15 16:59:16 +00005091 /* Software GSO depends on SG. */
5092 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005093 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
Michał Mirosław212b5732011-02-15 16:59:16 +00005094 features &= ~NETIF_F_GSO;
5095 }
5096
Michał Mirosławacd11302011-01-24 15:45:15 -08005097 /* UFO needs SG and checksumming */
Herbert Xub63365a2008-10-23 01:11:29 -07005098 if (features & NETIF_F_UFO) {
Michał Mirosław79032642010-11-30 06:38:00 +00005099 /* maybe split UFO into V4 and V6? */
5100 if (!((features & NETIF_F_GEN_CSUM) ||
5101 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5102 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005103 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08005104 "Dropping NETIF_F_UFO since no checksum offload features.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07005105 features &= ~NETIF_F_UFO;
5106 }
5107
5108 if (!(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005109 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08005110 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07005111 features &= ~NETIF_F_UFO;
5112 }
5113 }
5114
5115 return features;
5116}
Herbert Xub63365a2008-10-23 01:11:29 -07005117
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005118int __netdev_update_features(struct net_device *dev)
Michał Mirosław5455c692011-02-15 16:59:17 +00005119{
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005120 netdev_features_t features;
Michał Mirosław5455c692011-02-15 16:59:17 +00005121 int err = 0;
5122
Michał Mirosław87267482011-04-12 09:56:38 +00005123 ASSERT_RTNL();
5124
Michał Mirosław5455c692011-02-15 16:59:17 +00005125 features = netdev_get_wanted_features(dev);
5126
5127 if (dev->netdev_ops->ndo_fix_features)
5128 features = dev->netdev_ops->ndo_fix_features(dev, features);
5129
5130 /* driver might be less strict about feature dependencies */
5131 features = netdev_fix_features(dev, features);
5132
5133 if (dev->features == features)
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005134 return 0;
Michał Mirosław5455c692011-02-15 16:59:17 +00005135
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005136 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5137 &dev->features, &features);
Michał Mirosław5455c692011-02-15 16:59:17 +00005138
5139 if (dev->netdev_ops->ndo_set_features)
5140 err = dev->netdev_ops->ndo_set_features(dev, features);
5141
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005142 if (unlikely(err < 0)) {
Michał Mirosław5455c692011-02-15 16:59:17 +00005143 netdev_err(dev,
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005144 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5145 err, &features, &dev->features);
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005146 return -1;
5147 }
5148
5149 if (!err)
5150 dev->features = features;
5151
5152 return 1;
5153}
5154
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005155/**
5156 * netdev_update_features - recalculate device features
5157 * @dev: the device to check
5158 *
5159 * Recalculate dev->features set and send notifications if it
5160 * has changed. Should be called after driver or hardware dependent
5161 * conditions might have changed that influence the features.
5162 */
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005163void netdev_update_features(struct net_device *dev)
5164{
5165 if (__netdev_update_features(dev))
5166 netdev_features_change(dev);
Michał Mirosław5455c692011-02-15 16:59:17 +00005167}
5168EXPORT_SYMBOL(netdev_update_features);
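/*
 * Example (illustrative sketch, not part of this file): a driver that has
 * just lost a hardware capability withdraws it from hw_features and lets
 * the core re-run the fix_features/set_features sequence above.  The
 * function name foo_fw_reloaded is hypothetical; only
 * netdev_update_features() and the RTNL requirement come from the code
 * above.
 *
 *	static void foo_fw_reloaded(struct net_device *dev)
 *	{
 *		rtnl_lock();
 *		dev->hw_features &= ~NETIF_F_ALL_TSO;	// TSO no longer offered
 *		netdev_update_features(dev);
 *		rtnl_unlock();
 *	}
 */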
5169
Linus Torvalds1da177e2005-04-16 15:20:36 -07005170/**
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005171 * netdev_change_features - recalculate device features
5172 * @dev: the device to check
5173 *
5174 * Recalculate dev->features set and send notifications even
5175 * if they have not changed. Should be called instead of
5176 * netdev_update_features() if dev->vlan_features might also have
5177 * changed, so that the changes are propagated to stacked
5178 * VLAN devices.
5179 */
5180void netdev_change_features(struct net_device *dev)
5181{
5182 __netdev_update_features(dev);
5183 netdev_features_change(dev);
5184}
5185EXPORT_SYMBOL(netdev_change_features);
5186
5187/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005188 * netif_stacked_transfer_operstate - transfer operstate
5189 * @rootdev: the root or lower level device to transfer state from
5190 * @dev: the device to transfer operstate to
5191 *
5192 * Transfer operational state from root to device. This is normally
5193 * called when a stacking relationship exists between the root
5194 * device and the device (a leaf device).
5195 */
5196void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5197 struct net_device *dev)
5198{
5199 if (rootdev->operstate == IF_OPER_DORMANT)
5200 netif_dormant_on(dev);
5201 else
5202 netif_dormant_off(dev);
5203
5204 if (netif_carrier_ok(rootdev)) {
5205 if (!netif_carrier_ok(dev))
5206 netif_carrier_on(dev);
5207 } else {
5208 if (netif_carrier_ok(dev))
5209 netif_carrier_off(dev);
5210 }
5211}
5212EXPORT_SYMBOL(netif_stacked_transfer_operstate);
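/*
 * Example (illustrative sketch): a stacked driver mirroring its lower
 * device's state, the way VLAN-like uppers do when they learn (e.g. from a
 * netdevice notifier) that the lower device changed.  foo_lower_changed is
 * a hypothetical helper; netif_stacked_transfer_operstate() is the
 * function documented above.
 *
 *	static void foo_lower_changed(struct net_device *lower,
 *				      struct net_device *upper)
 *	{
 *		// copy dormant/carrier state from the lower device
 *		netif_stacked_transfer_operstate(lower, upper);
 *	}
 */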
5213
Tom Herbertbf264142010-11-26 08:36:09 +00005214#ifdef CONFIG_RPS
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005215static int netif_alloc_rx_queues(struct net_device *dev)
5216{
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005217 unsigned int i, count = dev->num_rx_queues;
Tom Herbertbd25fa72010-10-18 18:00:16 +00005218 struct netdev_rx_queue *rx;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005219
Tom Herbertbd25fa72010-10-18 18:00:16 +00005220 BUG_ON(count < 1);
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005221
Tom Herbertbd25fa72010-10-18 18:00:16 +00005222 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005223 if (!rx)
Tom Herbertbd25fa72010-10-18 18:00:16 +00005224 return -ENOMEM;
Joe Perches62b59422013-02-04 16:48:16 +00005225
Tom Herbertbd25fa72010-10-18 18:00:16 +00005226 dev->_rx = rx;
5227
Tom Herbertbd25fa72010-10-18 18:00:16 +00005228 for (i = 0; i < count; i++)
Tom Herbertfe822242010-11-09 10:47:38 +00005229 rx[i].dev = dev;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005230 return 0;
5231}
Tom Herbertbf264142010-11-26 08:36:09 +00005232#endif
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005233
Changli Gaoaa942102010-12-04 02:31:41 +00005234static void netdev_init_one_queue(struct net_device *dev,
5235 struct netdev_queue *queue, void *_unused)
5236{
5237 /* Initialize queue lock */
5238 spin_lock_init(&queue->_xmit_lock);
5239 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5240 queue->xmit_lock_owner = -1;
Changli Gaob236da62010-12-14 03:09:15 +00005241 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
Changli Gaoaa942102010-12-04 02:31:41 +00005242 queue->dev = dev;
Tom Herbert114cf582011-11-28 16:33:09 +00005243#ifdef CONFIG_BQL
5244 dql_init(&queue->dql, HZ);
5245#endif
Changli Gaoaa942102010-12-04 02:31:41 +00005246}
5247
Eric Dumazet60877a32013-06-20 01:15:51 -07005248static void netif_free_tx_queues(struct net_device *dev)
5249{
5250 if (is_vmalloc_addr(dev->_tx))
5251 vfree(dev->_tx);
5252 else
5253 kfree(dev->_tx);
5254}
5255
Tom Herberte6484932010-10-18 18:04:39 +00005256static int netif_alloc_netdev_queues(struct net_device *dev)
5257{
5258 unsigned int count = dev->num_tx_queues;
5259 struct netdev_queue *tx;
Eric Dumazet60877a32013-06-20 01:15:51 -07005260 size_t sz = count * sizeof(*tx);
Tom Herberte6484932010-10-18 18:04:39 +00005261
Eric Dumazet60877a32013-06-20 01:15:51 -07005262 BUG_ON(count < 1 || count > 0xffff);
Tom Herberte6484932010-10-18 18:04:39 +00005263
Eric Dumazet60877a32013-06-20 01:15:51 -07005264 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5265 if (!tx) {
5266 tx = vzalloc(sz);
5267 if (!tx)
5268 return -ENOMEM;
5269 }
Tom Herberte6484932010-10-18 18:04:39 +00005270 dev->_tx = tx;
Tom Herbert1d24eb42010-11-21 13:17:27 +00005271
Tom Herberte6484932010-10-18 18:04:39 +00005272 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5273 spin_lock_init(&dev->tx_global_lock);
Changli Gaoaa942102010-12-04 02:31:41 +00005274
5275 return 0;
Tom Herberte6484932010-10-18 18:04:39 +00005276}
5277
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005278/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005279 * register_netdevice - register a network device
5280 * @dev: device to register
5281 *
5282 * Take a completed network device structure and add it to the kernel
5283 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5284 * chain. 0 is returned on success. A negative errno code is returned
5285 * on a failure to set up the device, or if the name is a duplicate.
5286 *
5287 * Callers must hold the rtnl semaphore. You may want
5288 * register_netdev() instead of this.
5289 *
5290 * BUGS:
5291 * The locking appears insufficient to guarantee two parallel registers
5292 * will not get the same name.
5293 */
5294
5295int register_netdevice(struct net_device *dev)
5296{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005297 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005298 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005299
5300 BUG_ON(dev_boot_phase);
5301 ASSERT_RTNL();
5302
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005303 might_sleep();
5304
Linus Torvalds1da177e2005-04-16 15:20:36 -07005305 /* When net_device's are persistent, this will be fatal. */
5306 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005307 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005308
David S. Millerf1f28aa2008-07-15 00:08:33 -07005309 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07005310 netdev_set_addr_lockdep_class(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005311
Linus Torvalds1da177e2005-04-16 15:20:36 -07005312 dev->iflink = -1;
5313
Gao feng828de4f2012-09-13 20:58:27 +00005314 ret = dev_get_valid_name(net, dev, dev->name);
Peter Pan(潘卫平)0696c3a2011-05-12 15:46:56 +00005315 if (ret < 0)
5316 goto out;
5317
Linus Torvalds1da177e2005-04-16 15:20:36 -07005318 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005319 if (dev->netdev_ops->ndo_init) {
5320 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005321 if (ret) {
5322 if (ret > 0)
5323 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08005324 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005325 }
5326 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005327
Patrick McHardyf6469682013-04-19 02:04:27 +00005328 if (((dev->hw_features | dev->features) &
5329 NETIF_F_HW_VLAN_CTAG_FILTER) &&
Michał Mirosławd2ed2732013-01-29 15:14:16 +00005330 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5331 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5332 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5333 ret = -EINVAL;
5334 goto err_uninit;
5335 }
5336
Pavel Emelyanov9c7dafb2012-08-08 21:52:46 +00005337 ret = -EBUSY;
5338 if (!dev->ifindex)
5339 dev->ifindex = dev_new_index(net);
5340 else if (__dev_get_by_index(net, dev->ifindex))
5341 goto err_uninit;
5342
Linus Torvalds1da177e2005-04-16 15:20:36 -07005343 if (dev->iflink == -1)
5344 dev->iflink = dev->ifindex;
5345
Michał Mirosław5455c692011-02-15 16:59:17 +00005346 /* Transfer changeable features to wanted_features and enable
5347 * software offloads (GSO and GRO).
5348 */
5349 dev->hw_features |= NETIF_F_SOFT_FEATURES;
Michał Mirosław14d12322011-02-22 16:52:28 +00005350 dev->features |= NETIF_F_SOFT_FEATURES;
5351 dev->wanted_features = dev->features & dev->hw_features;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005352
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005353 /* Turn on no cache copy if HW is doing checksum */
Michał Mirosław34324dc2011-11-15 15:29:55 +00005354 if (!(dev->flags & IFF_LOOPBACK)) {
5355 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5356 if (dev->features & NETIF_F_ALL_CSUM) {
5357 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5358 dev->features |= NETIF_F_NOCACHE_COPY;
5359 }
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005360 }
5361
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005362 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
Brandon Philips16c3ea72010-09-15 09:24:24 +00005363 */
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005364 dev->vlan_features |= NETIF_F_HIGHDMA;
Brandon Philips16c3ea72010-09-15 09:24:24 +00005365
Pravin B Shelaree579672013-03-07 09:28:08 +00005366 /* Make NETIF_F_SG inheritable to tunnel devices.
5367 */
5368 dev->hw_enc_features |= NETIF_F_SG;
5369
Simon Horman0d89d202013-05-23 21:02:52 +00005370 /* Make NETIF_F_SG inheritable to MPLS.
5371 */
5372 dev->mpls_features |= NETIF_F_SG;
5373
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005374 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5375 ret = notifier_to_errno(ret);
5376 if (ret)
5377 goto err_uninit;
5378
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005379 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005380 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005381 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005382 dev->reg_state = NETREG_REGISTERED;
5383
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005384 __netdev_update_features(dev);
Michał Mirosław8e9b59b2011-02-22 16:52:28 +00005385
Linus Torvalds1da177e2005-04-16 15:20:36 -07005386 /*
5387 * Default initial state at registry is that the
5388 * device is present.
5389 */
5390
5391 set_bit(__LINK_STATE_PRESENT, &dev->state);
5392
Ben Hutchings8f4cccb2012-08-20 22:16:51 +01005393 linkwatch_init_dev(dev);
5394
Linus Torvalds1da177e2005-04-16 15:20:36 -07005395 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005396 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005397 list_netdevice(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04005398 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005399
Jiri Pirko948b3372013-01-08 01:38:25 +00005400 /* If the device has a permanent device address, the driver should
5401 * set dev_addr and also addr_assign_type should be set to
5402 * NET_ADDR_PERM (default value).
5403 */
5404 if (dev->addr_assign_type == NET_ADDR_PERM)
5405 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5406
Linus Torvalds1da177e2005-04-16 15:20:36 -07005407 /* Notify protocols that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005408 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005409 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005410 if (ret) {
5411 rollback_registered(dev);
5412 dev->reg_state = NETREG_UNREGISTERED;
5413 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005414 /*
5415 * Prevent userspace races by waiting until the network
5416 * device is fully setup before sending notifications.
5417 */
Patrick McHardya2835762010-02-26 06:34:51 +00005418 if (!dev->rtnl_link_ops ||
5419 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5420 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005421
5422out:
5423 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005424
5425err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005426 if (dev->netdev_ops->ndo_uninit)
5427 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005428 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005429}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005430EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005431
5432/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005433 * init_dummy_netdev - init a dummy network device for NAPI
5434 * @dev: device to init
5435 *
5436 * This takes a network device structure and initializes the minimum
5437 * number of fields so it can be used to schedule NAPI polls without
5438 * registering a full blown interface. This is to be used by drivers
5439 * that need to tie several hardware interfaces to a single NAPI
5440 * poll scheduler due to HW limitations.
5441 */
5442int init_dummy_netdev(struct net_device *dev)
5443{
5444 /* Clear everything. Note we don't initialize spinlocks
5445 * as they aren't supposed to be taken by any of the
5446 * NAPI code and this dummy netdev is supposed to be
5447 * only ever used for NAPI polls
5448 */
5449 memset(dev, 0, sizeof(struct net_device));
5450
5451 /* make sure we BUG if trying to hit standard
5452 * register/unregister code path
5453 */
5454 dev->reg_state = NETREG_DUMMY;
5455
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005456 /* NAPI wants this */
5457 INIT_LIST_HEAD(&dev->napi_list);
5458
5459 /* a dummy interface is started by default */
5460 set_bit(__LINK_STATE_PRESENT, &dev->state);
5461 set_bit(__LINK_STATE_START, &dev->state);
5462
Eric Dumazet29b44332010-10-11 10:22:12 +00005463 /* Note : We don't allocate pcpu_refcnt for dummy devices,
5464 * because users of this 'device' don't need to change
5465 * its refcount.
5466 */
5467
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005468 return 0;
5469}
5470EXPORT_SYMBOL_GPL(init_dummy_netdev);
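/*
 * Example (illustrative sketch): a driver whose hardware funnels several
 * interfaces through one interrupt can hang its NAPI context off a dummy
 * netdev.  The foo_* names and foo_poll() are hypothetical.
 *
 *	struct foo_adapter {
 *		struct net_device napi_dev;	// dummy, never registered
 *		struct napi_struct napi;
 *	};
 *
 *	static void foo_setup_napi(struct foo_adapter *fa)
 *	{
 *		init_dummy_netdev(&fa->napi_dev);
 *		netif_napi_add(&fa->napi_dev, &fa->napi, foo_poll, 64);
 *		napi_enable(&fa->napi);
 *	}
 */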
5471
5472
5473/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005474 * register_netdev - register a network device
5475 * @dev: device to register
5476 *
5477 * Take a completed network device structure and add it to the kernel
5478 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5479 * chain. 0 is returned on success. A negative errno code is returned
5480 * on a failure to set up the device, or if the name is a duplicate.
5481 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005482 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005483 * and expands the device name if you passed a format string to
5484 * alloc_netdev.
5485 */
5486int register_netdev(struct net_device *dev)
5487{
5488 int err;
5489
5490 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005491 err = register_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005492 rtnl_unlock();
5493 return err;
5494}
5495EXPORT_SYMBOL(register_netdev);
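/*
 * Example (illustrative sketch): typical probe-time usage of the
 * allocation/registration pair.  foo_setup, foo_netdev_ops and struct
 * foo_priv are hypothetical; alloc_netdev(), register_netdev() and
 * free_netdev() are the interfaces documented in this file.
 *
 *	static int foo_probe(void)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_netdev(sizeof(struct foo_priv), "foo%d", foo_setup);
 *		if (!dev)
 *			return -ENOMEM;
 *		dev->netdev_ops = &foo_netdev_ops;
 *
 *		err = register_netdev(dev);	// takes rtnl, expands "foo%d"
 *		if (err) {
 *			free_netdev(dev);
 *			return err;
 *		}
 *		return 0;
 *	}
 */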
5496
Eric Dumazet29b44332010-10-11 10:22:12 +00005497int netdev_refcnt_read(const struct net_device *dev)
5498{
5499 int i, refcnt = 0;
5500
5501 for_each_possible_cpu(i)
5502 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5503 return refcnt;
5504}
5505EXPORT_SYMBOL(netdev_refcnt_read);
5506
Ben Hutchings2c530402012-07-10 10:55:09 +00005507/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005508 * netdev_wait_allrefs - wait until all references are gone.
Randy Dunlap3de7a372012-08-18 14:36:44 +00005509 * @dev: target net_device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005510 *
5511 * This is called when unregistering network devices.
5512 *
5513 * Any protocol or device that holds a reference should register
5514 * for netdevice notification, and cleanup and put back the
5515 * reference if they receive an UNREGISTER event.
5516 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005517 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005518 */
5519static void netdev_wait_allrefs(struct net_device *dev)
5520{
5521 unsigned long rebroadcast_time, warning_time;
Eric Dumazet29b44332010-10-11 10:22:12 +00005522 int refcnt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005523
Eric Dumazete014deb2009-11-17 05:59:21 +00005524 linkwatch_forget_dev(dev);
5525
Linus Torvalds1da177e2005-04-16 15:20:36 -07005526 rebroadcast_time = warning_time = jiffies;
Eric Dumazet29b44332010-10-11 10:22:12 +00005527 refcnt = netdev_refcnt_read(dev);
5528
5529 while (refcnt != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005530 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005531 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005532
5533 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005534 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005535
Eric Dumazet748e2d92012-08-22 21:50:59 +00005536 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005537 rcu_barrier();
Eric Dumazet748e2d92012-08-22 21:50:59 +00005538 rtnl_lock();
5539
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005540 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005541 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5542 &dev->state)) {
5543 /* We must not have linkwatch events
5544 * pending on unregister. If this
5545 * happens, we simply run the queue
5546 * unscheduled, resulting in a noop
5547 * for this device.
5548 */
5549 linkwatch_run_queue();
5550 }
5551
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005552 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005553
5554 rebroadcast_time = jiffies;
5555 }
5556
5557 msleep(250);
5558
Eric Dumazet29b44332010-10-11 10:22:12 +00005559 refcnt = netdev_refcnt_read(dev);
5560
Linus Torvalds1da177e2005-04-16 15:20:36 -07005561 if (time_after(jiffies, warning_time + 10 * HZ)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005562 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5563 dev->name, refcnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005564 warning_time = jiffies;
5565 }
5566 }
5567}
5568
5569/* The sequence is:
5570 *
5571 * rtnl_lock();
5572 * ...
5573 * register_netdevice(x1);
5574 * register_netdevice(x2);
5575 * ...
5576 * unregister_netdevice(y1);
5577 * unregister_netdevice(y2);
5578 * ...
5579 * rtnl_unlock();
5580 * free_netdev(y1);
5581 * free_netdev(y2);
5582 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07005583 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07005584 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005585 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07005586 * without deadlocking with linkwatch via keventd.
5587 * 2) Since we run with the RTNL semaphore not held, we can sleep
5588 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07005589 *
5590 * We must not return until all unregister events added during
5591 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005592 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005593void netdev_run_todo(void)
5594{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005595 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005596
Linus Torvalds1da177e2005-04-16 15:20:36 -07005597 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005598 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07005599
5600 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005601
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005602
5603 /* Wait for rcu callbacks to finish before next phase */
Eric W. Biederman850a5452011-10-13 22:25:23 +00005604 if (!list_empty(&list))
5605 rcu_barrier();
5606
Linus Torvalds1da177e2005-04-16 15:20:36 -07005607 while (!list_empty(&list)) {
5608 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00005609 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005610 list_del(&dev->todo_list);
5611
Eric Dumazet748e2d92012-08-22 21:50:59 +00005612 rtnl_lock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005613 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric Dumazet748e2d92012-08-22 21:50:59 +00005614 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005615
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005616 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005617 pr_err("network todo '%s' but state %d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07005618 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005619 dump_stack();
5620 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005621 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005622
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005623 dev->reg_state = NETREG_UNREGISTERED;
5624
Changli Gao152102c2010-03-30 20:16:22 +00005625 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07005626
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005627 netdev_wait_allrefs(dev);
5628
5629 /* paranoia */
Eric Dumazet29b44332010-10-11 10:22:12 +00005630 BUG_ON(netdev_refcnt_read(dev));
Eric Dumazet33d480c2011-08-11 19:30:52 +00005631 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5632 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07005633 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005634
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005635 if (dev->destructor)
5636 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07005637
5638 /* Free network device */
5639 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005640 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005641}
5642
Ben Hutchings3cfde792010-07-09 09:11:52 +00005643/* Convert net_device_stats to rtnl_link_stats64. They have the same
5644 * fields in the same order, with only the type differing.
5645 */
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005646void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5647 const struct net_device_stats *netdev_stats)
Ben Hutchings3cfde792010-07-09 09:11:52 +00005648{
5649#if BITS_PER_LONG == 64
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005650 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5651 memcpy(stats64, netdev_stats, sizeof(*stats64));
Ben Hutchings3cfde792010-07-09 09:11:52 +00005652#else
5653 size_t i, n = sizeof(*stats64) / sizeof(u64);
5654 const unsigned long *src = (const unsigned long *)netdev_stats;
5655 u64 *dst = (u64 *)stats64;
5656
5657 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5658 sizeof(*stats64) / sizeof(u64));
5659 for (i = 0; i < n; i++)
5660 dst[i] = src[i];
5661#endif
5662}
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005663EXPORT_SYMBOL(netdev_stats_to_stats64);
Ben Hutchings3cfde792010-07-09 09:11:52 +00005664
Eric Dumazetd83345a2009-11-16 03:36:51 +00005665/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005666 * dev_get_stats - get network device statistics
5667 * @dev: device to get statistics from
Eric Dumazet28172732010-07-07 14:58:56 -07005668 * @storage: place to store stats
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005669 *
Ben Hutchingsd7753512010-07-09 09:12:41 +00005670 * Get network statistics from device. Return @storage.
5671 * The device driver may provide its own method by setting
5672 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5673 * otherwise the internal statistics structure is used.
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005674 */
Ben Hutchingsd7753512010-07-09 09:12:41 +00005675struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5676 struct rtnl_link_stats64 *storage)
Eric Dumazet7004bf22009-05-18 00:34:33 +00005677{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005678 const struct net_device_ops *ops = dev->netdev_ops;
5679
Eric Dumazet28172732010-07-07 14:58:56 -07005680 if (ops->ndo_get_stats64) {
5681 memset(storage, 0, sizeof(*storage));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005682 ops->ndo_get_stats64(dev, storage);
5683 } else if (ops->ndo_get_stats) {
Ben Hutchings3cfde792010-07-09 09:11:52 +00005684 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005685 } else {
5686 netdev_stats_to_stats64(storage, &dev->stats);
Eric Dumazet28172732010-07-07 14:58:56 -07005687 }
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005688 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
Eric Dumazet28172732010-07-07 14:58:56 -07005689 return storage;
Rusty Russellc45d2862007-03-28 14:29:08 -07005690}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005691EXPORT_SYMBOL(dev_get_stats);
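/*
 * Example (illustrative sketch): reading a device's counters from other
 * kernel code.  dev_get_stats() fills the caller-supplied buffer from
 * whichever of the three sources above the driver provides.
 * foo_rx_packets is a hypothetical helper.
 *
 *	static u64 foo_rx_packets(struct net_device *dev)
 *	{
 *		struct rtnl_link_stats64 stats;
 *
 *		dev_get_stats(dev, &stats);
 *		return stats.rx_packets;
 *	}
 */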
Rusty Russellc45d2862007-03-28 14:29:08 -07005692
Eric Dumazet24824a02010-10-02 06:11:55 +00005693struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
David S. Millerdc2b4842008-07-08 17:18:23 -07005694{
Eric Dumazet24824a02010-10-02 06:11:55 +00005695 struct netdev_queue *queue = dev_ingress_queue(dev);
David S. Millerdc2b4842008-07-08 17:18:23 -07005696
Eric Dumazet24824a02010-10-02 06:11:55 +00005697#ifdef CONFIG_NET_CLS_ACT
5698 if (queue)
5699 return queue;
5700 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5701 if (!queue)
5702 return NULL;
5703 netdev_init_one_queue(dev, queue, NULL);
Eric Dumazet24824a02010-10-02 06:11:55 +00005704 queue->qdisc = &noop_qdisc;
5705 queue->qdisc_sleeping = &noop_qdisc;
5706 rcu_assign_pointer(dev->ingress_queue, queue);
5707#endif
5708 return queue;
David S. Millerbb949fb2008-07-08 16:55:56 -07005709}
5710
Eric Dumazet2c60db02012-09-16 09:17:26 +00005711static const struct ethtool_ops default_ethtool_ops;
5712
Stanislaw Gruszkad07d7502013-01-10 23:19:10 +00005713void netdev_set_default_ethtool_ops(struct net_device *dev,
5714 const struct ethtool_ops *ops)
5715{
5716 if (dev->ethtool_ops == &default_ethtool_ops)
5717 dev->ethtool_ops = ops;
5718}
5719EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
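/*
 * Example (illustrative sketch): a bus-level framework can install
 * fallback ethtool ops without overriding ops a sub-driver has already
 * chosen, which is what this helper exists for.
 * foo_default_ethtool_ops is hypothetical.
 *
 *	netdev_set_default_ethtool_ops(dev, &foo_default_ethtool_ops);
 */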
5720
Linus Torvalds1da177e2005-04-16 15:20:36 -07005721/**
Tom Herbert36909ea2011-01-09 19:36:31 +00005722 * alloc_netdev_mqs - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005723 * @sizeof_priv: size of private data to allocate space for
5724 * @name: device name format string
5725 * @setup: callback to initialize device
Tom Herbert36909ea2011-01-09 19:36:31 +00005726 * @txqs: the number of TX subqueues to allocate
5727 * @rxqs: the number of RX subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005728 *
5729 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005730 * and performs basic initialization. Also allocates subqueue structs
Tom Herbert36909ea2011-01-09 19:36:31 +00005731 * for each queue on the device.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005732 */
Tom Herbert36909ea2011-01-09 19:36:31 +00005733struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5734 void (*setup)(struct net_device *),
5735 unsigned int txqs, unsigned int rxqs)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005736{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005737 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07005738 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005739 struct net_device *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005740
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005741 BUG_ON(strlen(name) >= sizeof(dev->name));
5742
Tom Herbert36909ea2011-01-09 19:36:31 +00005743 if (txqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005744 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
Tom Herbert55513fb2010-10-18 17:55:58 +00005745 return NULL;
5746 }
5747
Tom Herbert36909ea2011-01-09 19:36:31 +00005748#ifdef CONFIG_RPS
5749 if (rxqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005750 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
Tom Herbert36909ea2011-01-09 19:36:31 +00005751 return NULL;
5752 }
5753#endif
5754
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005755 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005756 if (sizeof_priv) {
5757 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005758 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005759 alloc_size += sizeof_priv;
5760 }
5761 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005762 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005763
Paolo 'Blaisorblade' Giarrusso31380de2006-04-06 22:38:28 -07005764 p = kzalloc(alloc_size, GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005765 if (!p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005766 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005767
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005768 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005769 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005770
Eric Dumazet29b44332010-10-11 10:22:12 +00005771 dev->pcpu_refcnt = alloc_percpu(int);
5772 if (!dev->pcpu_refcnt)
Tom Herberte6484932010-10-18 18:04:39 +00005773 goto free_p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005774
Linus Torvalds1da177e2005-04-16 15:20:36 -07005775 if (dev_addr_init(dev))
Eric Dumazet29b44332010-10-11 10:22:12 +00005776 goto free_pcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005777
Jiri Pirko22bedad32010-04-01 21:22:57 +00005778 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005779 dev_uc_init(dev);
Jiri Pirkoccffad252009-05-22 23:22:17 +00005780
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005781 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005782
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07005783 dev->gso_max_size = GSO_MAX_SIZE;
Ben Hutchings30b678d2012-07-30 15:57:00 +00005784 dev->gso_max_segs = GSO_MAX_SEGS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005785
Herbert Xud565b0a2008-12-15 23:38:52 -08005786 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005787 INIT_LIST_HEAD(&dev->unreg_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00005788 INIT_LIST_HEAD(&dev->link_watch_list);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005789 INIT_LIST_HEAD(&dev->upper_dev_list);
Eric Dumazet93f154b2009-05-18 22:19:19 -07005790 dev->priv_flags = IFF_XMIT_DST_RELEASE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005791 setup(dev);
David S. Miller8d3bdbd2011-02-08 15:02:50 -08005792
5793 dev->num_tx_queues = txqs;
5794 dev->real_num_tx_queues = txqs;
5795 if (netif_alloc_netdev_queues(dev))
5796 goto free_all;
5797
5798#ifdef CONFIG_RPS
5799 dev->num_rx_queues = rxqs;
5800 dev->real_num_rx_queues = rxqs;
5801 if (netif_alloc_rx_queues(dev))
5802 goto free_all;
5803#endif
5804
Linus Torvalds1da177e2005-04-16 15:20:36 -07005805 strcpy(dev->name, name);
Vlad Dogarucbda10f2011-01-13 23:38:30 +00005806 dev->group = INIT_NETDEV_GROUP;
Eric Dumazet2c60db02012-09-16 09:17:26 +00005807 if (!dev->ethtool_ops)
5808 dev->ethtool_ops = &default_ethtool_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005809 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005810
David S. Miller8d3bdbd2011-02-08 15:02:50 -08005811free_all:
5812 free_netdev(dev);
5813 return NULL;
5814
Eric Dumazet29b44332010-10-11 10:22:12 +00005815free_pcpu:
5816 free_percpu(dev->pcpu_refcnt);
Eric Dumazet60877a32013-06-20 01:15:51 -07005817 netif_free_tx_queues(dev);
Tom Herbertfe822242010-11-09 10:47:38 +00005818#ifdef CONFIG_RPS
5819 kfree(dev->_rx);
5820#endif
5821
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005822free_p:
5823 kfree(p);
5824 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005825}
Tom Herbert36909ea2011-01-09 19:36:31 +00005826EXPORT_SYMBOL(alloc_netdev_mqs);
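/*
 * Example (illustrative sketch): allocating a multiqueue device with eight
 * TX and eight RX queues.  struct foo_priv and foo_setup are hypothetical;
 * ether_setup() would be the common setup callback for Ethernet-like
 * devices.
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       foo_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 *	priv = netdev_priv(dev);	// points at the sizeof_priv area
 */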
Linus Torvalds1da177e2005-04-16 15:20:36 -07005827
5828/**
5829 * free_netdev - free network device
5830 * @dev: device
5831 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005832 * This function does the last stage of destroying an allocated device
5833 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005834 * If this is the last reference then it will be freed.
5835 */
5836void free_netdev(struct net_device *dev)
5837{
Herbert Xud565b0a2008-12-15 23:38:52 -08005838 struct napi_struct *p, *n;
5839
Denis V. Lunevf3005d72008-04-16 02:02:18 -07005840 release_net(dev_net(dev));
5841
Eric Dumazet60877a32013-06-20 01:15:51 -07005842 netif_free_tx_queues(dev);
Tom Herbertfe822242010-11-09 10:47:38 +00005843#ifdef CONFIG_RPS
5844 kfree(dev->_rx);
5845#endif
David S. Millere8a04642008-07-17 00:34:19 -07005846
Eric Dumazet33d480c2011-08-11 19:30:52 +00005847 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
Eric Dumazet24824a02010-10-02 06:11:55 +00005848
Jiri Pirkof001fde2009-05-05 02:48:28 +00005849 /* Flush device addresses */
5850 dev_addr_flush(dev);
5851
Herbert Xud565b0a2008-12-15 23:38:52 -08005852 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5853 netif_napi_del(p);
5854
Eric Dumazet29b44332010-10-11 10:22:12 +00005855 free_percpu(dev->pcpu_refcnt);
5856 dev->pcpu_refcnt = NULL;
5857
Stephen Hemminger3041a062006-05-26 13:25:24 -07005858 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005859 if (dev->reg_state == NETREG_UNINITIALIZED) {
5860 kfree((char *)dev - dev->padded);
5861 return;
5862 }
5863
5864 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5865 dev->reg_state = NETREG_RELEASED;
5866
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07005867 /* will free via device release */
5868 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005869}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005870EXPORT_SYMBOL(free_netdev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005871
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005872/**
5873 * synchronize_net - Synchronize with packet receive processing
5874 *
5875 * Wait for packets currently being received to be done.
5876 * Does not block later packets from starting.
5877 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005878void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005879{
5880 might_sleep();
Eric Dumazetbe3fc412011-05-23 23:07:32 +00005881 if (rtnl_is_locked())
5882 synchronize_rcu_expedited();
5883 else
5884 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005885}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005886EXPORT_SYMBOL(synchronize_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005887
5888/**
Eric Dumazet44a08732009-10-27 07:03:04 +00005889 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07005890 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00005891 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08005892 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07005893 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005894 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00005895 * If head is not NULL, the device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005896 *
5897 * Callers must hold the rtnl semaphore. You may want
5898 * unregister_netdev() instead of this.
5899 */
5900
Eric Dumazet44a08732009-10-27 07:03:04 +00005901void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005902{
Herbert Xua6620712007-12-12 19:21:56 -08005903 ASSERT_RTNL();
5904
Eric Dumazet44a08732009-10-27 07:03:04 +00005905 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005906 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00005907 } else {
5908 rollback_registered(dev);
5909 /* Finish processing unregister after unlock */
5910 net_set_todo(dev);
5911 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005912}
Eric Dumazet44a08732009-10-27 07:03:04 +00005913EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005914
5915/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005916 * unregister_netdevice_many - unregister many devices
5917 * @head: list of devices
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005918 */
5919void unregister_netdevice_many(struct list_head *head)
5920{
5921 struct net_device *dev;
5922
5923 if (!list_empty(head)) {
5924 rollback_registered_many(head);
5925 list_for_each_entry(dev, head, unreg_list)
5926 net_set_todo(dev);
5927 }
5928}
Eric Dumazet63c80992009-10-27 07:06:49 +00005929EXPORT_SYMBOL(unregister_netdevice_many);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005930
5931/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005932 * unregister_netdev - remove device from the kernel
5933 * @dev: device
5934 *
5935 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005936 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005937 *
5938 * This is just a wrapper for unregister_netdevice that takes
5939 * the rtnl semaphore. In general you want to use this and not
5940 * unregister_netdevice.
5941 */
5942void unregister_netdev(struct net_device *dev)
5943{
5944 rtnl_lock();
5945 unregister_netdevice(dev);
5946 rtnl_unlock();
5947}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005948EXPORT_SYMBOL(unregister_netdev);
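/*
 * Example (illustrative sketch): teardown mirrors the probe sequence shown
 * near register_netdev() above.  foo_remove is hypothetical.
 *
 *	static void foo_remove(struct net_device *dev)
 *	{
 *		unregister_netdev(dev);	// takes and releases rtnl itself
 *		free_netdev(dev);	// release the last reference
 *	}
 */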
5949
Eric W. Biedermance286d32007-09-12 13:53:49 +02005950/**
5951 * dev_change_net_namespace - move device to different network namespace
5952 * @dev: device
5953 * @net: network namespace
5954 * @pat: If not NULL name pattern to try if the current device name
5955 * is already taken in the destination network namespace.
5956 *
5957 * This function shuts down a device interface and moves it
5958 * to a new network namespace. On success 0 is returned, on
5959 * a failure a negative errno code is returned.
5960 *
5961 * Callers must hold the rtnl semaphore.
5962 */
5963
5964int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5965{
Eric W. Biedermance286d32007-09-12 13:53:49 +02005966 int err;
5967
5968 ASSERT_RTNL();
5969
5970 /* Don't allow namespace local devices to be moved. */
5971 err = -EINVAL;
5972 if (dev->features & NETIF_F_NETNS_LOCAL)
5973 goto out;
5974
5975 /* Ensure the device has been registered */
Eric W. Biedermance286d32007-09-12 13:53:49 +02005976 if (dev->reg_state != NETREG_REGISTERED)
5977 goto out;
5978
5979 /* Get out if there is nothing to do */
5980 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09005981 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005982 goto out;
5983
5984 /* Pick the destination device name, and ensure
5985 * we can use it in the destination network namespace.
5986 */
5987 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00005988 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005989 /* We get here if we can't use the current device name */
5990 if (!pat)
5991 goto out;
Gao feng828de4f2012-09-13 20:58:27 +00005992 if (dev_get_valid_name(net, dev, pat) < 0)
Eric W. Biedermance286d32007-09-12 13:53:49 +02005993 goto out;
5994 }
5995
5996 /*
5997 * And now a mini version of register_netdevice and unregister_netdevice.
5998 */
5999
6000 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07006001 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006002
6003 /* And unlink it from device chain */
6004 err = -ENODEV;
6005 unlist_netdevice(dev);
6006
6007 synchronize_net();
6008
6009 /* Shutdown queueing discipline. */
6010 dev_shutdown(dev);
6011
6012 /* Notify protocols that we are about to destroy
6013 this device. They should clean all the things.
David Lamparter3b27e102010-09-17 03:22:19 +00006014
6015 Note that dev->reg_state stays at NETREG_REGISTERED.
6016 This is wanted because this way 8021q and macvlan know
6017 the device is just moving and can keep their slaves up.
Eric W. Biedermance286d32007-09-12 13:53:49 +02006018 */
6019 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Gao feng6549dd42012-08-23 15:36:55 +00006020 rcu_barrier();
6021 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric W. Biedermand2237d32011-10-21 06:24:20 +00006022 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006023
6024 /*
6025 * Flush the unicast and multicast chains
6026 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00006027 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00006028 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006029
Serge Hallyn4e66ae22012-12-03 16:17:12 +00006030 /* Send a netdev-removed uevent to the old namespace */
6031 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6032
Eric W. Biedermance286d32007-09-12 13:53:49 +02006033 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09006034 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006035
Eric W. Biedermance286d32007-09-12 13:53:49 +02006036 /* If there is an ifindex conflict assign a new one */
6037 if (__dev_get_by_index(net, dev->ifindex)) {
6038 int iflink = (dev->iflink == dev->ifindex);
6039 dev->ifindex = dev_new_index(net);
6040 if (iflink)
6041 dev->iflink = dev->ifindex;
6042 }
6043
Serge Hallyn4e66ae22012-12-03 16:17:12 +00006044 /* Send a netdev-add uevent to the new namespace */
6045 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6046
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006047 /* Fixup kobjects */
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07006048 err = device_rename(&dev->dev, dev->name);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006049 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006050
6051 /* Add the device back in the hashes */
6052 list_netdevice(dev);
6053
6054 /* Notify protocols that a new device appeared. */
6055 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6056
Eric W. Biedermand90a9092009-12-12 22:11:15 +00006057 /*
6058 * Prevent userspace races by waiting until the network
6059 * device is fully setup before sending notifications.
6060 */
6061 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6062
Eric W. Biedermance286d32007-09-12 13:53:49 +02006063 synchronize_net();
6064 err = 0;
6065out:
6066 return err;
6067}
Johannes Berg463d0182009-07-14 00:33:35 +02006068EXPORT_SYMBOL_GPL(dev_change_net_namespace);
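/*
 * Example (illustrative sketch): moving a device into another namespace
 * under RTNL, with a "dev%d" fallback pattern in case its name is already
 * taken there (the same idea default_device_exit() below relies on).
 * target_net is a hypothetical struct net pointer held by the caller.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "dev%d");
 *	rtnl_unlock();
 */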
Eric W. Biedermance286d32007-09-12 13:53:49 +02006069
Linus Torvalds1da177e2005-04-16 15:20:36 -07006070static int dev_cpu_callback(struct notifier_block *nfb,
6071 unsigned long action,
6072 void *ocpu)
6073{
6074 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006075 struct sk_buff *skb;
6076 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6077 struct softnet_data *sd, *oldsd;
6078
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006079 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006080 return NOTIFY_OK;
6081
6082 local_irq_disable();
6083 cpu = smp_processor_id();
6084 sd = &per_cpu(softnet_data, cpu);
6085 oldsd = &per_cpu(softnet_data, oldcpu);
6086
6087 /* Find end of our completion_queue. */
6088 list_skb = &sd->completion_queue;
6089 while (*list_skb)
6090 list_skb = &(*list_skb)->next;
6091 /* Append completion queue from offline CPU. */
6092 *list_skb = oldsd->completion_queue;
6093 oldsd->completion_queue = NULL;
6094
Linus Torvalds1da177e2005-04-16 15:20:36 -07006095 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00006096 if (oldsd->output_queue) {
6097 *sd->output_queue_tailp = oldsd->output_queue;
6098 sd->output_queue_tailp = oldsd->output_queue_tailp;
6099 oldsd->output_queue = NULL;
6100 oldsd->output_queue_tailp = &oldsd->output_queue;
6101 }
Heiko Carstens264524d2011-06-06 20:50:03 +00006102 /* Append NAPI poll list from offline CPU. */
6103 if (!list_empty(&oldsd->poll_list)) {
6104 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6105 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6106 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006107
6108 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6109 local_irq_enable();
6110
6111 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00006112 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6113 netif_rx(skb);
6114 input_queue_head_incr(oldsd);
6115 }
Tom Herbertfec5e652010-04-16 16:01:27 -07006116 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006117 netif_rx(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00006118 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07006119 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006120
6121 return NOTIFY_OK;
6122}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006123
6124
Herbert Xu7f353bf2007-08-10 15:47:58 -07006125/**
Herbert Xub63365a2008-10-23 01:11:29 -07006126 * netdev_increment_features - increment feature set by one
6127 * @all: current feature set
6128 * @one: new feature set
6129 * @mask: mask feature set
Herbert Xu7f353bf2007-08-10 15:47:58 -07006130 *
6131 * Computes a new feature set after adding a device with feature set
Herbert Xub63365a2008-10-23 01:11:29 -07006132 * @one to the master device with current feature set @all. Will not
6133 * enable anything that is off in @mask. Returns the new feature set.
Herbert Xu7f353bf2007-08-10 15:47:58 -07006134 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00006135netdev_features_t netdev_increment_features(netdev_features_t all,
6136 netdev_features_t one, netdev_features_t mask)
Herbert Xu7f353bf2007-08-10 15:47:58 -07006137{
Michał Mirosław1742f182011-04-22 06:31:16 +00006138 if (mask & NETIF_F_GEN_CSUM)
6139 mask |= NETIF_F_ALL_CSUM;
6140 mask |= NETIF_F_VLAN_CHALLENGED;
6141
6142 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6143 all &= one | ~NETIF_F_ALL_FOR_ALL;
6144
Michał Mirosław1742f182011-04-22 06:31:16 +00006145 /* If one device supports hw checksumming, set for all. */
6146 if (all & NETIF_F_GEN_CSUM)
6147 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
Herbert Xu7f353bf2007-08-10 15:47:58 -07006148
6149 return all;
6150}
Herbert Xub63365a2008-10-23 01:11:29 -07006151EXPORT_SYMBOL(netdev_increment_features);
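/*
 * Example (illustrative sketch): how a master/aggregating driver might
 * fold a newly enslaved device's features into its own, bonding-style.
 * foo_master_features and FOO_FEATURE_MASK are hypothetical.
 *
 *	static netdev_features_t foo_master_features(netdev_features_t cur,
 *						     struct net_device *slave)
 *	{
 *		return netdev_increment_features(cur, slave->features,
 *						 FOO_FEATURE_MASK);
 *	}
 */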
Herbert Xu7f353bf2007-08-10 15:47:58 -07006152
Baruch Siach430f03c2013-06-02 20:43:55 +00006153static struct hlist_head * __net_init netdev_create_hash(void)
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006154{
6155 int i;
6156 struct hlist_head *hash;
6157
6158 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6159 if (hash != NULL)
6160 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6161 INIT_HLIST_HEAD(&hash[i]);
6162
6163 return hash;
6164}
6165
Eric W. Biederman881d9662007-09-17 11:56:21 -07006166/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07006167static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006168{
Rustad, Mark D734b6542012-07-18 09:06:07 +00006169 if (net != &init_net)
6170 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07006171
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006172 net->dev_name_head = netdev_create_hash();
6173 if (net->dev_name_head == NULL)
6174 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006175
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006176 net->dev_index_head = netdev_create_hash();
6177 if (net->dev_index_head == NULL)
6178 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006179
6180 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006181
6182err_idx:
6183 kfree(net->dev_name_head);
6184err_name:
6185 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006186}
6187
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006188/**
6189 * netdev_drivername - network driver for the device
6190 * @dev: network device
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006191 *
6192 * Determine network driver for device.
6193 */
David S. Miller3019de12011-06-06 16:41:33 -07006194const char *netdev_drivername(const struct net_device *dev)
Arjan van de Ven6579e572008-07-21 13:31:48 -07006195{
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07006196 const struct device_driver *driver;
6197 const struct device *parent;
David S. Miller3019de12011-06-06 16:41:33 -07006198 const char *empty = "";
Arjan van de Ven6579e572008-07-21 13:31:48 -07006199
6200 parent = dev->dev.parent;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006201 if (!parent)
David S. Miller3019de12011-06-06 16:41:33 -07006202 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006203
6204 driver = parent->driver;
6205 if (driver && driver->name)
David S. Miller3019de12011-06-06 16:41:33 -07006206 return driver->name;
6207 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006208}
6209
Joe Perchesb004ff42012-09-12 20:12:19 -07006210static int __netdev_printk(const char *level, const struct net_device *dev,
Joe Perches256df2f2010-06-27 01:02:35 +00006211 struct va_format *vaf)
6212{
6213 int r;
6214
Joe Perchesb004ff42012-09-12 20:12:19 -07006215 if (dev && dev->dev.parent) {
Joe Perches666f3552012-09-12 20:14:11 -07006216 r = dev_printk_emit(level[1] - '0',
6217 dev->dev.parent,
6218 "%s %s %s: %pV",
6219 dev_driver_string(dev->dev.parent),
6220 dev_name(dev->dev.parent),
6221 netdev_name(dev), vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006222 } else if (dev) {
Joe Perches256df2f2010-06-27 01:02:35 +00006223 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006224 } else {
Joe Perches256df2f2010-06-27 01:02:35 +00006225 r = printk("%s(NULL net_device): %pV", level, vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006226 }
Joe Perches256df2f2010-06-27 01:02:35 +00006227
6228 return r;
6229}
6230
6231int netdev_printk(const char *level, const struct net_device *dev,
6232 const char *format, ...)
6233{
6234 struct va_format vaf;
6235 va_list args;
6236 int r;
6237
6238 va_start(args, format);
6239
6240 vaf.fmt = format;
6241 vaf.va = &args;
6242
6243 r = __netdev_printk(level, dev, &vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006244
Joe Perches256df2f2010-06-27 01:02:35 +00006245 va_end(args);
6246
6247 return r;
6248}
6249EXPORT_SYMBOL(netdev_printk);
6250
6251#define define_netdev_printk_level(func, level) \
6252int func(const struct net_device *dev, const char *fmt, ...) \
6253{ \
6254 int r; \
6255 struct va_format vaf; \
6256 va_list args; \
6257 \
6258 va_start(args, fmt); \
6259 \
6260 vaf.fmt = fmt; \
6261 vaf.va = &args; \
6262 \
6263 r = __netdev_printk(level, dev, &vaf); \
Joe Perchesb004ff42012-09-12 20:12:19 -07006264 \
Joe Perches256df2f2010-06-27 01:02:35 +00006265 va_end(args); \
6266 \
6267 return r; \
6268} \
6269EXPORT_SYMBOL(func);
6270
6271define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6272define_netdev_printk_level(netdev_alert, KERN_ALERT);
6273define_netdev_printk_level(netdev_crit, KERN_CRIT);
6274define_netdev_printk_level(netdev_err, KERN_ERR);
6275define_netdev_printk_level(netdev_warn, KERN_WARNING);
6276define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6277define_netdev_printk_level(netdev_info, KERN_INFO);
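/*
 * Example (illustrative sketch): drivers use these wrappers instead of raw
 * printk() so every message carries the parent bus device and interface
 * name.  The messages and variables are made up for illustration.
 *
 *	netdev_err(dev, "TX queue %d timed out, resetting\n", queue);
 *	netdev_info(dev, "link up, %u Mbps, full duplex\n", speed);
 */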
6278
Pavel Emelyanov46650792007-10-08 20:38:39 -07006279static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006280{
6281 kfree(net->dev_name_head);
6282 kfree(net->dev_index_head);
6283}
6284
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006285static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07006286 .init = netdev_init,
6287 .exit = netdev_exit,
6288};
6289
Pavel Emelyanov46650792007-10-08 20:38:39 -07006290static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02006291{
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006292 struct net_device *dev, *aux;
Eric W. Biedermance286d32007-09-12 13:53:49 +02006293 /*
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006294 * Push all migratable network devices back to the
Eric W. Biedermance286d32007-09-12 13:53:49 +02006295 * initial network namespace
6296 */
6297 rtnl_lock();
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006298 for_each_netdev_safe(net, dev, aux) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006299 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006300 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02006301
6302 /* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

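/*
 * Batched counterpart of default_device_exit(): when several namespaces go
 * away at once, collecting every doomed device on a single dev_kill_list
 * and tearing them down with one unregister_netdevice_many() call lets the
 * expensive RCU synchronization in device unregistration be shared across
 * the whole batch instead of being paid once per device.
 */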
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	list_del(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 * Initialize the DEV module: the protocol handler lists, the per-CPU
 * packet backlog queues, the per-namespace operations and the networking
 * softirqs that the rest of the stack builds on.
 */

/*
 * This is called single threaded during boot, so no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 * Initialise the packet receive queues.
	 */

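	/*
	 * Each possible CPU gets its own softnet_data: the input and
	 * process queues feeding the backlog NAPI instance, the completion
	 * queue of skbs waiting to be freed, the list of qdiscs scheduled
	 * for transmit, and (under CONFIG_RPS) the call_single_data used
	 * to raise the receive softirq on a remote CPU.
	 */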
	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->csd.flags = 0;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;

#ifdef CONFIG_NET_FLOW_LIMIT
		sd->flow_limit = NULL;
#endif
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must be
	 * present too.  Since we now dynamically allocate and free the
	 * loopback device, maintain this invariant by keeping the loopback
	 * device the first device on the list of network devices, so that
	 * it is the first device to appear and the last to disappear.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	rc = 0;
out:
	return rc;
}

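/*
 * Run at subsys_initcall time so that the core networking state set up
 * above is in place before device_initcall-level network drivers begin
 * registering their net_devices.
 */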
subsys_initcall(net_dev_init);