/*
 * NET3 Protocol independent device support routines.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Derived from the non IP parts of dev.c 1.0.19
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 * Additional Authors:
 *	Florian la Roche <rzsfl@rz.uni-sb.de>
 *	Alan Cox <gw4pts@gw4pts.ampr.org>
 *	David Hinds <dahinds@users.sourceforge.net>
 *	Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *	Adam Sulmicki <adam@cfar.umd.edu>
 *	Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 * Changes:
 *	D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *				to 2 if register_netdev gets called
 *				before net_dev_init & also removed a
 *				few lines of code in the process.
 *	Alan Cox	:	device private ioctl copies fields back.
 *	Alan Cox	:	Transmit queue code does relevant
 *				stunts to keep the queue safe.
 *	Alan Cox	:	Fixed double lock.
 *	Alan Cox	:	Fixed promisc NULL pointer trap
 *	????????	:	Support the full private ioctl range
 *	Alan Cox	:	Moved ioctl permission check into
 *				drivers
 *	Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *	Alan Cox	:	100 backlog just doesn't cut it when
 *				you start doing multicast video 8)
 *	Alan Cox	:	Rewrote net_bh and list manager.
 *	Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *	Alan Cox	:	Took out transmit every packet pass
 *				Saved a few bytes in the ioctl handler
 *	Alan Cox	:	Network driver sets packet type before
 *				calling netif_rx. Saves a function
 *				call a packet.
 *	Alan Cox	:	Hashed net_bh()
 *	Richard Kooijman:	Timestamp fixes.
 *	Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *	Alan Cox	:	Device lock protection.
 *	Alan Cox	:	Fixed nasty side effect of device close
 *				changes.
 *	Rudi Cilibrasi	:	Pass the right thing to
 *				set_mac_address()
 *	Dave Miller	:	32bit quantity for the device lock to
 *				make it work out on a Sparc.
 *	Bjorn Ekwall	:	Added KERNELD hack.
 *	Alan Cox	:	Cleaned up the backlog initialise.
 *	Craig Metz	:	SIOCGIFCONF fix if space for under
 *				1 device.
 *	Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *				is no device open function.
 *	Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *	Cyrus Durgin	:	Cleaned for KMOD
 *	Adam Sulmicki	:	Bug Fix : Network Device Unload
 *				A network device unload needs to purge
 *				the backlog queue.
 *	Paul Rusty Russell :	SIOCSIFNAME
 *	Pekka Riikonen	:	Netdev boot-time settings code
 *	Andrew Morton	:	Make unregister_netdevice wait
 *				indefinitely on dev->refcnt
 *	J Hadi Salim	:	- Backlog queue sampling
 *				- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates. This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
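
/*
 * Illustrative sketch (not part of the original file): a pure reader can
 * walk the device list under dev_base_lock, per the rules above;
 * do_something() is a hypothetical helper.
 *
 *	read_lock(&dev_base_lock);
 *	for_each_netdev(net, dev)
 *		do_something(dev);
 *	read_unlock(&dev_base_lock);
 *
 * An RCU reader would instead bracket for_each_netdev_rcu() with
 * rcu_read_lock()/rcu_read_unlock().
 */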

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 * Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 * Device drivers call our routines to queue packets here. We empty the
 * queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 * Add a protocol ID to the list. Now that the input handler is
 * smarter we can dispense with all the messy stuff that used to be
 * here.
 *
 * BEWARE!!! Protocol handlers, mangling input packets,
 * MUST BE last in hash buckets and checking protocol handlers
 * MUST start from promiscuous ptype_all chain in net_bh.
 * It is true now, do not change it.
 * Explanation follows: if a protocol handler that mangles packets were
 * first on the list, it could not sense that the packet is cloned and
 * should be copied-on-write, so it would change it and subsequent
 * readers would get a broken packet.
 * --ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 * dev_add_pack - add packet handler
 * @pt: packet type declaration
 *
 * Add a protocol handler to the networking stack. The passed &packet_type
 * is linked into kernel lists and may not be freed until it has been
 * removed from the kernel lists.
 *
 * This call does not sleep, therefore it cannot
 * guarantee that all CPUs that are in the middle of receiving packets
 * will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
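
/*
 * Illustrative sketch (not part of the original file): this is roughly how
 * a protocol registers a receive handler; ETH_P_FOO and foo_rcv() are
 * made-up names used only for the example.
 *
 *	static struct packet_type foo_packet_type __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_FOO),
 *		.func = foo_rcv,
 *	};
 *
 *	dev_add_pack(&foo_packet_type);
 */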

/**
 * __dev_remove_pack - remove packet handler
 * @pt: packet type declaration
 *
 * Remove a protocol handler that was previously added to the kernel
 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
 * from the kernel lists and can be freed or reused once this function
 * returns.
 *
 * The packet type might still be in use by receivers
 * and must not be freed until after all the CPUs have gone
 * through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 * dev_remove_pack - remove packet handler
 * @pt: packet type declaration
 *
 * Remove a protocol handler that was previously added to the kernel
 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
 * from the kernel lists and can be freed or reused once this function
 * returns.
 *
 * This call sleeps to guarantee that no CPU is looking at the packet
 * type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/**
 * dev_add_offload - register offload handlers
 * @po: protocol offload declaration
 *
 * Add protocol offload handlers to the networking stack. The passed
 * &proto_offload is linked into kernel lists and may not be freed until
 * it has been removed from the kernel lists.
 *
 * This call does not sleep, therefore it cannot
 * guarantee that all CPUs that are in the middle of receiving packets
 * will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;

	spin_lock(&offload_lock);
	list_add_rcu(&po->list, head);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
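
/*
 * Illustrative sketch (not part of the original file, field names assumed
 * from this kernel's struct packet_offload): registering GRO/GSO callbacks
 * for a hypothetical protocol "foo".
 *
 *	static struct packet_offload foo_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_FOO),
 *		.callbacks = {
 *			.gso_segment = foo_gso_segment,
 *			.gro_receive = foo_gro_receive,
 *			.gro_complete = foo_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&foo_offload);
 */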

/**
 * __dev_remove_offload - remove offload handler
 * @po: packet offload declaration
 *
 * Remove a protocol offload handler that was previously added to the
 * kernel offload handlers by dev_add_offload(). The passed &offload_type
 * is removed from the kernel lists and can be freed or reused once this
 * function returns.
 *
 * The packet type might still be in use by receivers
 * and must not be freed until after all the CPUs have gone
 * through a quiescent state.
 */
void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(__dev_remove_offload);

/**
 * dev_remove_offload - remove packet offload handler
 * @po: packet offload declaration
 *
 * Remove a packet offload handler that was previously added to the kernel
 * offload handlers by dev_add_offload(). The passed &offload_type is
 * removed from the kernel lists and can be freed or reused once this
 * function returns.
 *
 * This call sleeps to guarantee that no CPU is looking at the packet
 * type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 * netdev_boot_setup_add - add new setup entry
 * @name: name of the device
 * @map: configured settings for the device
 *
 * Adds new setup entry to the dev_boot_setup list. The function
 * returns 0 on error and 1 on success. This is a generic routine for
 * all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 * netdev_boot_setup_check - check boot time settings
 * @dev: the netdevice
 *
 * Check boot time settings for the device.
 * The found settings are set for the device to be used
 * later in the device probing.
 * Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 * netdev_boot_base - get address from boot time settings
 * @prefix: prefix for network device
 * @unit: id for network device
 *
 * Check boot time settings for the base address of device.
 * The found settings are set for the device to be used
 * later in the device probing.
 * Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
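
/*
 * Illustrative example (not part of the original file): given the parsing
 * above, a kernel command line entry such as
 *
 *	netdev=9,0x300,0xd0000,0xd4000,eth0
 *
 * records irq 9, I/O base 0x300 and the memory window for the device that
 * later probes as "eth0"; values that are not supplied stay zero.
 */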

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 * __dev_get_by_name - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name. Must be called under RTNL semaphore
 * or @dev_base_lock. If the name is found a pointer to the device
 * is returned. If the name is not found then %NULL is returned. The
 * reference counters are not incremented so the caller must be
 * careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 * dev_get_by_name_rcu - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name.
 * If the name is found a pointer to the device is returned.
 * If the name is not found then %NULL is returned.
 * The reference counters are not incremented so the caller must be
 * careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 * dev_get_by_name - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name. This can be called from any
 * context and does its own locking. The returned handle has
 * the usage count incremented and the caller must use dev_put() to
 * release it when it is no longer needed. %NULL is returned if no
 * matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
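
/*
 * Illustrative sketch (not part of the original file): the two lookup
 * flavours side by side; use() is a hypothetical helper.
 *
 *	dev = dev_get_by_name(net, "eth0");	// reference taken
 *	if (dev) {
 *		use(dev);
 *		dev_put(dev);
 *	}
 *
 *	rcu_read_lock();			// no reference taken
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		use(dev);	// only valid inside the RCU section
 *	rcu_read_unlock();
 */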

/**
 * __dev_get_by_index - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns %NULL if the device
 * is not found or a pointer to the device. The device has not
 * had its reference counter increased so the caller must be careful
 * about locking. The caller must hold either the RTNL semaphore
 * or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 * dev_get_by_index_rcu - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns %NULL if the device
 * is not found or a pointer to the device. The device has not
 * had its reference counter increased so the caller must be careful
 * about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 * dev_get_by_index - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns NULL if the device
 * is not found or a pointer to the device. The device returned has
 * had a reference added and the pointer is safe until the user calls
 * dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 * dev_getbyhwaddr_rcu - find a device by its hardware address
 * @net: the applicable net namespace
 * @type: media type of device
 * @ha: hardware address
 *
 * Search for an interface by MAC address. Returns NULL if the device
 * is not found or a pointer to the device.
 * The caller must hold RCU or RTNL.
 * The returned device has not had its ref count increased
 * and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
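
/*
 * Illustrative sketch (not part of the original file): looking up an
 * Ethernet device by MAC address; "addr" is an assumed 6-byte buffer.
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, addr);
 *	if (dev)
 *		use(dev);	// hypothetical, no reference is held
 *	rcu_read_unlock();
 */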

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 * dev_get_by_flags_rcu - find any device with given flags
 * @net: the applicable net namespace
 * @if_flags: IFF_* values
 * @mask: bitmask of bits in if_flags to check
 *
 * Search for any interface with the given flags. Returns NULL if a device
 * is not found or a pointer to the device. Must be called inside
 * rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
					unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 * dev_valid_name - check if name is okay for network device
 * @name: name string
 *
 * Network device names need to be valid file names to
 * allow sysfs to work. We also disallow any kind of
 * whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);
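
/*
 * Illustrative examples (not part of the original file): "eth0" and "wan%d"
 * pass the checks above, while "", ".", "..", "a/b", names containing
 * whitespace and names of IFNAMSIZ characters or more are rejected.
 */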

/**
 * __dev_alloc_name - allocate a name for a device
 * @net: network namespace to allocate the device name in
 * @name: name format string
 * @buf: scratch buffer and result name string
 *
 * Passed a format string - eg "lt%d" it will try and find a suitable
 * id. It scans list of devices to build up a free map, then chooses
 * the first empty slot. The caller must hold the dev_base or rtnl lock
 * while allocating the name and adding the device in order to avoid
 * duplicates.
 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 * Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user. There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 * dev_alloc_name - allocate a name for a device
 * @dev: device
 * @name: name format string
 *
 * Passed a format string - eg "lt%d" it will try and find a suitable
 * id. It scans list of devices to build up a free map, then chooses
 * the first empty slot. The caller must hold the dev_base or rtnl lock
 * while allocating the name and adding the device in order to avoid
 * duplicates.
 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 * Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
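
/*
 * Illustrative sketch (not part of the original file): a driver asking for
 * the next free "eth%d" slot before registration.
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto out;
 *	// dev->name now holds e.g. "eth2"
 */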

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 * dev_change_name - change name of a device
 * @dev: device
 * @newname: name (or format string) must be at least IFNAMSIZ
 *
 * Change name of a device, can pass format strings "eth%d"
 * for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 * dev_set_alias - change ifalias of a device
 * @dev: device
 * @alias: name up to IFALIASZ
 * @len: limit of bytes to copy from info
 *
 * Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 * netdev_features_change - device changes features
 * @dev: device to cause notification
 *
 * Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 * netdev_state_change - device changes state
 * @dev: device to cause notification
 *
 * Called to indicate a device has changed state. This function calls
 * the notifier chains for netdev_chain and sends a NEWLINK message
 * to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_rx_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_rx_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		net_dmaengine_get();
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 * dev_open - prepare an interface for use.
 * @dev: device to open
 *
 * Takes a device from down to up state. The device's private open
 * function is invoked and then the multicast lists are loaded. Finally
 * the device is moved into the up state and a %NETDEV_UP message is
 * sent to the netdev notifier chain.
 *
 * Calling this function on an active interface is a nop. On a failure
 * a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001267
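/*
 * Illustrative sketch, not from the original source: bouncing an interface
 * with dev_close()/dev_open().  Both must run under the RTNL; the function
 * name is hypothetical.
 */
static int example_bounce_interface(struct net_device *dev)
{
	int err;

	rtnl_lock();
	dev_close(dev);		/* a no-op if the device is already down */
	err = dev_open(dev);	/* 0 on success or a negative errno */
	rtnl_unlock();

	return err;
}
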
Octavian Purdila44345722010-12-13 12:44:07 +00001268static int __dev_close_many(struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001269{
Octavian Purdila44345722010-12-13 12:44:07 +00001270 struct net_device *dev;
Patrick McHardybd380812010-02-26 06:34:53 +00001271
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001272 ASSERT_RTNL();
David S. Miller9d5010d2007-09-12 14:33:25 +02001273 might_sleep();
1274
Octavian Purdila44345722010-12-13 12:44:07 +00001275 list_for_each_entry(dev, head, unreg_list) {
Octavian Purdila44345722010-12-13 12:44:07 +00001276 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001277
Octavian Purdila44345722010-12-13 12:44:07 +00001278 clear_bit(__LINK_STATE_START, &dev->state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001279
Octavian Purdila44345722010-12-13 12:44:07 +00001280 /* Synchronize to scheduled poll. We cannot touch poll list, it
1281 * can even be on a different CPU. So just clear netif_running().
1282 *
1283 * dev->stop() will invoke napi_disable() on all of its
1284 * napi_struct instances on this device.
1285 */
1286 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1287 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001288
Octavian Purdila44345722010-12-13 12:44:07 +00001289 dev_deactivate_many(head);
1290
1291 list_for_each_entry(dev, head, unreg_list) {
1292 const struct net_device_ops *ops = dev->netdev_ops;
1293
1294 /*
1295 * Call the device specific close. This cannot fail and is
1296 * only called if the device is UP.
1297 *
1298 * We allow it to be called even after a DETACH hot-plug
1299 * event.
1300 */
1301 if (ops->ndo_stop)
1302 ops->ndo_stop(dev);
1303
Octavian Purdila44345722010-12-13 12:44:07 +00001304 dev->flags &= ~IFF_UP;
Octavian Purdila44345722010-12-13 12:44:07 +00001305 net_dmaengine_put();
1306 }
1307
1308 return 0;
1309}
1310
1311static int __dev_close(struct net_device *dev)
1312{
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001313 int retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001314 LIST_HEAD(single);
1315
Neil Hormanca99ca12013-02-05 08:05:43 +00001316 /* Temporarily disable netpoll until the interface is down */
dingtianhongda6e3782013-05-27 19:53:31 +00001317 netpoll_rx_disable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001318
Octavian Purdila44345722010-12-13 12:44:07 +00001319 list_add(&dev->unreg_list, &single);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001320 retval = __dev_close_many(&single);
1321 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001322
1323 netpoll_rx_enable(dev);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001324 return retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001325}
1326
Eric Dumazet3fbd8752011-01-19 21:23:22 +00001327static int dev_close_many(struct list_head *head)
Octavian Purdila44345722010-12-13 12:44:07 +00001328{
1329 struct net_device *dev, *tmp;
1330 LIST_HEAD(tmp_list);
1331
1332 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1333 if (!(dev->flags & IFF_UP))
1334 list_move(&dev->unreg_list, &tmp_list);
1335
1336 __dev_close_many(head);
Matti Linnanvuorid8b2a4d2008-02-12 23:10:11 -08001337
Octavian Purdila44345722010-12-13 12:44:07 +00001338 list_for_each_entry(dev, head, unreg_list) {
1339 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1340 call_netdevice_notifiers(NETDEV_DOWN, dev);
1341 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001342
Octavian Purdila44345722010-12-13 12:44:07 +00001343 /* rollback_registered_many needs the complete original list */
1344 list_splice(&tmp_list, head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001345 return 0;
1346}
Patrick McHardybd380812010-02-26 06:34:53 +00001347
1348/**
1349 * dev_close - shutdown an interface.
1350 * @dev: device to shutdown
1351 *
1352 * This function moves an active device into down state. A
1353 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1354 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1355 * chain.
1356 */
1357int dev_close(struct net_device *dev)
1358{
Eric Dumazete14a5992011-05-10 12:26:06 -07001359 if (dev->flags & IFF_UP) {
1360 LIST_HEAD(single);
Patrick McHardybd380812010-02-26 06:34:53 +00001361
Neil Hormanca99ca12013-02-05 08:05:43 +00001362 /* Block netpoll rx while the interface is going down */
dingtianhongda6e3782013-05-27 19:53:31 +00001363 netpoll_rx_disable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001364
Eric Dumazete14a5992011-05-10 12:26:06 -07001365 list_add(&dev->unreg_list, &single);
1366 dev_close_many(&single);
1367 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001368
1369 netpoll_rx_enable(dev);
Eric Dumazete14a5992011-05-10 12:26:06 -07001370 }
dingtianhongda6e3782013-05-27 19:53:31 +00001371 return 0;
Patrick McHardybd380812010-02-26 06:34:53 +00001372}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001373EXPORT_SYMBOL(dev_close);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001374
1375
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001376/**
1377 * dev_disable_lro - disable Large Receive Offload on a device
1378 * @dev: device
1379 *
1380 * Disable Large Receive Offload (LRO) on a net device. Must be
1381 * called under RTNL. This is needed if received packets may be
1382 * forwarded to another interface.
1383 */
1384void dev_disable_lro(struct net_device *dev)
1385{
Neil Hormanf11970e2011-05-24 08:31:09 +00001386 /*
1387 * If we're trying to disable LRO on a vlan device,
1388 * use the underlying physical device instead
1389 */
1390 if (is_vlan_dev(dev))
1391 dev = vlan_dev_real_dev(dev);
1392
Michał Mirosławbc5787c62011-11-15 15:29:55 +00001393 dev->wanted_features &= ~NETIF_F_LRO;
1394 netdev_update_features(dev);
Michał Mirosław27660512011-03-18 16:56:34 +00001395
Michał Mirosław22d59692011-04-21 12:42:15 +00001396 if (unlikely(dev->features & NETIF_F_LRO))
1397 netdev_WARN(dev, "failed to disable LRO!\n");
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001398}
1399EXPORT_SYMBOL(dev_disable_lro);
1400
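/*
 * Illustrative sketch, not from the original source: a caller that is about
 * to forward packets received on @dev disables LRO first, as required by
 * the comment above.  The function name is hypothetical.
 */
static void example_prepare_for_forwarding(struct net_device *dev)
{
	ASSERT_RTNL();		/* dev_disable_lro() must run under RTNL */
	dev_disable_lro(dev);
}
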
Jiri Pirko351638e2013-05-28 01:30:21 +00001401static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1402 struct net_device *dev)
1403{
1404 struct netdev_notifier_info info;
1405
1406 netdev_notifier_info_init(&info, dev);
1407 return nb->notifier_call(nb, val, &info);
1408}
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001409
Eric W. Biederman881d9662007-09-17 11:56:21 -07001410static int dev_boot_phase = 1;
1411
Linus Torvalds1da177e2005-04-16 15:20:36 -07001412/**
1413 * register_netdevice_notifier - register a network notifier block
1414 * @nb: notifier
1415 *
1416 * Register a notifier to be called when network device events occur.
1417 * The notifier passed is linked into the kernel structures and must
1418 * not be reused until it has been unregistered. A negative errno code
1419 * is returned on a failure.
1420 *
1421 * When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001422 * to the new notifier to allow it to have a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001423 * view of the network device list.
1424 */
1425
1426int register_netdevice_notifier(struct notifier_block *nb)
1427{
1428 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001429 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001430 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001431 int err;
1432
1433 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001434 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001435 if (err)
1436 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001437 if (dev_boot_phase)
1438 goto unlock;
1439 for_each_net(net) {
1440 for_each_netdev(net, dev) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001441 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001442 err = notifier_to_errno(err);
1443 if (err)
1444 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001445
Eric W. Biederman881d9662007-09-17 11:56:21 -07001446 if (!(dev->flags & IFF_UP))
1447 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001448
Jiri Pirko351638e2013-05-28 01:30:21 +00001449 call_netdevice_notifier(nb, NETDEV_UP, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001450 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001451 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001452
1453unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454 rtnl_unlock();
1455 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001456
1457rollback:
1458 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001459 for_each_net(net) {
1460 for_each_netdev(net, dev) {
1461 if (dev == last)
RongQing.Li8f891482011-11-30 23:43:07 -05001462 goto outroll;
Herbert Xufcc5a032007-07-30 17:03:38 -07001463
Eric W. Biederman881d9662007-09-17 11:56:21 -07001464 if (dev->flags & IFF_UP) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001465 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1466 dev);
1467 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001468 }
Jiri Pirko351638e2013-05-28 01:30:21 +00001469 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001470 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001471 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001472
RongQing.Li8f891482011-11-30 23:43:07 -05001473outroll:
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001474 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001475 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001476}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001477EXPORT_SYMBOL(register_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001478
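/*
 * Illustrative sketch, not from the original source: a minimal netdevice
 * notifier.  It assumes the netdev_notifier_info_to_dev() helper that
 * accompanies struct netdev_notifier_info; the example_* names are
 * hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UP)
		pr_info("%s is up\n", dev->name);

	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/* registered with register_netdevice_notifier(&example_netdev_notifier) */
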
1479/**
1480 * unregister_netdevice_notifier - unregister a network notifier block
1481 * @nb: notifier
1482 *
1483 * Unregister a notifier previously registered by
1484 * register_netdevice_notifier(). The notifier is unlinked from the
1485 * kernel structures and may then be reused. A negative errno code
1486 * is returned on a failure.
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001487 *
1488 * After unregistering, unregister and down device events are synthesized
1489 * for all devices on the device list to the removed notifier to remove
1490 * the need for special case cleanup code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001491 */
1492
1493int unregister_netdevice_notifier(struct notifier_block *nb)
1494{
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001495 struct net_device *dev;
1496 struct net *net;
Herbert Xu9f514952006-03-25 01:24:25 -08001497 int err;
1498
1499 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001500 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001501 if (err)
1502 goto unlock;
1503
1504 for_each_net(net) {
1505 for_each_netdev(net, dev) {
1506 if (dev->flags & IFF_UP) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001507 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1508 dev);
1509 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001510 }
Jiri Pirko351638e2013-05-28 01:30:21 +00001511 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001512 }
1513 }
1514unlock:
Herbert Xu9f514952006-03-25 01:24:25 -08001515 rtnl_unlock();
1516 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001517}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001518EXPORT_SYMBOL(unregister_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001519
1520/**
Jiri Pirko351638e2013-05-28 01:30:21 +00001521 * call_netdevice_notifiers_info - call all network notifier blocks
1522 * @val: value passed unmodified to notifier function
1523 * @dev: net_device pointer passed unmodified to notifier function
1524 * @info: notifier information data
1525 *
1526 * Call all network notifier blocks. Parameters and return value
1527 * are as for raw_notifier_call_chain().
1528 */
1529
1530int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1531 struct netdev_notifier_info *info)
1532{
1533 ASSERT_RTNL();
1534 netdev_notifier_info_init(info, dev);
1535 return raw_notifier_call_chain(&netdev_chain, val, info);
1536}
1537EXPORT_SYMBOL(call_netdevice_notifiers_info);
1538
1539/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001540 * call_netdevice_notifiers - call all network notifier blocks
1541 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001542 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001543 *
1544 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001545 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001546 */
1547
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001548int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549{
Jiri Pirko351638e2013-05-28 01:30:21 +00001550 struct netdev_notifier_info info;
1551
1552 return call_netdevice_notifiers_info(val, dev, &info);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001553}
stephen hemmingeredf947f2011-03-24 13:24:01 +00001554EXPORT_SYMBOL(call_netdevice_notifiers);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001555
Ingo Molnarc5905af2012-02-24 08:31:31 +01001556static struct static_key netstamp_needed __read_mostly;
Eric Dumazetb90e5792011-11-28 11:16:50 +00001557#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01001558/* We are not allowed to call static_key_slow_dec() from irq context.
Eric Dumazetb90e5792011-11-28 11:16:50 +00001559 * If net_disable_timestamp() is called from irq context, defer the
Ingo Molnarc5905af2012-02-24 08:31:31 +01001560 * static_key_slow_dec() calls.
Eric Dumazetb90e5792011-11-28 11:16:50 +00001561 */
1562static atomic_t netstamp_needed_deferred;
1563#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001564
1565void net_enable_timestamp(void)
1566{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001567#ifdef HAVE_JUMP_LABEL
1568 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1569
1570 if (deferred) {
1571 while (--deferred)
Ingo Molnarc5905af2012-02-24 08:31:31 +01001572 static_key_slow_dec(&netstamp_needed);
Eric Dumazetb90e5792011-11-28 11:16:50 +00001573 return;
1574 }
1575#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001576 static_key_slow_inc(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001577}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001578EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001579
1580void net_disable_timestamp(void)
1581{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001582#ifdef HAVE_JUMP_LABEL
1583 if (in_interrupt()) {
1584 atomic_inc(&netstamp_needed_deferred);
1585 return;
1586 }
1587#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001588 static_key_slow_dec(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001590EXPORT_SYMBOL(net_disable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001591
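/*
 * Illustrative sketch, not from the original source: a feature that consumes
 * packet timestamps brackets its lifetime with these helpers so the static
 * key stays enabled only while needed.  The function names are hypothetical.
 */
static void example_capture_start(void)
{
	net_enable_timestamp();
}

static void example_capture_stop(void)
{
	net_disable_timestamp();	/* deferred if called from irq context */
}
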
Eric Dumazet3b098e22010-05-15 23:57:10 -07001592static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001593{
Eric Dumazet588f0332011-11-15 04:12:55 +00001594 skb->tstamp.tv64 = 0;
Ingo Molnarc5905af2012-02-24 08:31:31 +01001595 if (static_key_false(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001596 __net_timestamp(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001597}
1598
Eric Dumazet588f0332011-11-15 04:12:55 +00001599#define net_timestamp_check(COND, SKB) \
Ingo Molnarc5905af2012-02-24 08:31:31 +01001600 if (static_key_false(&netstamp_needed)) { \
Eric Dumazet588f0332011-11-15 04:12:55 +00001601 if ((COND) && !(SKB)->tstamp.tv64) \
1602 __net_timestamp(SKB); \
1603 } \
Eric Dumazet3b098e22010-05-15 23:57:10 -07001604
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001605static inline bool is_skb_forwardable(struct net_device *dev,
1606 struct sk_buff *skb)
1607{
1608 unsigned int len;
1609
1610 if (!(dev->flags & IFF_UP))
1611 return false;
1612
1613 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1614 if (skb->len <= len)
1615 return true;
1616
1617 /* if TSO is enabled, we don't care about the length as the packet
1618 * could be forwarded without having been segmented first
1619 */
1620 if (skb_is_gso(skb))
1621 return true;
1622
1623 return false;
1624}
1625
Arnd Bergmann44540962009-11-26 06:07:08 +00001626/**
1627 * dev_forward_skb - loopback an skb to another netif
1628 *
1629 * @dev: destination network device
1630 * @skb: buffer to forward
1631 *
1632 * return values:
1633 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001634 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001635 *
1636 * dev_forward_skb can be used for injecting an skb from the
1637 * start_xmit function of one device into the receive queue
1638 * of another device.
1639 *
1640 * The receiving device may be in another namespace, so
1641 * we have to clear all information in the skb that could
1642 * impact namespace isolation.
1643 */
1644int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1645{
Michael S. Tsirkin48c83012011-08-31 08:03:29 +00001646 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1647 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1648 atomic_long_inc(&dev->rx_dropped);
1649 kfree_skb(skb);
1650 return NET_RX_DROP;
1651 }
1652 }
1653
Arnd Bergmann44540962009-11-26 06:07:08 +00001654 skb_orphan(skb);
1655
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001656 if (unlikely(!is_skb_forwardable(dev, skb))) {
Eric Dumazetcaf586e2010-09-30 21:06:55 +00001657 atomic_long_inc(&dev->rx_dropped);
Eric Dumazet6ec82562010-05-06 00:53:53 -07001658 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001659 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001660 }
Benjamin LaHaise3b9785c2012-03-27 15:55:44 +00001661 skb->skb_iif = 0;
David S. Miller59b99972012-05-10 23:03:34 -04001662 skb_dst_drop(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001663 skb->tstamp.tv64 = 0;
1664 skb->pkt_type = PACKET_HOST;
1665 skb->protocol = eth_type_trans(skb, dev);
David S. Miller59b99972012-05-10 23:03:34 -04001666 skb->mark = 0;
1667 secpath_reset(skb);
1668 nf_reset(skb);
Patrick McHardy124dff02013-04-05 20:42:05 +02001669 nf_reset_trace(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001670 return netif_rx(skb);
1671}
1672EXPORT_SYMBOL_GPL(dev_forward_skb);
1673
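/*
 * Illustrative sketch, not from the original source: a pair device's
 * start_xmit handing the skb to its peer, as described above.  struct
 * example_priv and its ->peer member are hypothetical.
 */
struct example_priv {
	struct net_device *peer;
};

static netdev_tx_t example_start_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	/* dev_forward_skb() frees the skb itself on error */
	dev_forward_skb(priv->peer, skb);

	return NETDEV_TX_OK;
}
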
Changli Gao71d9dec2010-12-15 19:57:25 +00001674static inline int deliver_skb(struct sk_buff *skb,
1675 struct packet_type *pt_prev,
1676 struct net_device *orig_dev)
1677{
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00001678 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1679 return -ENOMEM;
Changli Gao71d9dec2010-12-15 19:57:25 +00001680 atomic_inc(&skb->users);
1681 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1682}
1683
Eric Leblondc0de08d2012-08-16 22:02:58 +00001684static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1685{
Eric Leblonda3d744e2012-11-06 02:10:10 +00001686 if (!ptype->af_packet_priv || !skb->sk)
Eric Leblondc0de08d2012-08-16 22:02:58 +00001687 return false;
1688
1689 if (ptype->id_match)
1690 return ptype->id_match(ptype, skb->sk);
1691 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1692 return true;
1693
1694 return false;
1695}
1696
Linus Torvalds1da177e2005-04-16 15:20:36 -07001697/*
1698 * Support routine. Sends outgoing frames to any network
1699 * taps currently in use.
1700 */
1701
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001702static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001703{
1704 struct packet_type *ptype;
Changli Gao71d9dec2010-12-15 19:57:25 +00001705 struct sk_buff *skb2 = NULL;
1706 struct packet_type *pt_prev = NULL;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001707
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708 rcu_read_lock();
1709 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1710 /* Never send packets back to the socket
1711 * they originated from - MvS (miquels@drinkel.ow.org)
1712 */
1713 if ((ptype->dev == dev || !ptype->dev) &&
Eric Leblondc0de08d2012-08-16 22:02:58 +00001714 (!skb_loop_sk(ptype, skb))) {
Changli Gao71d9dec2010-12-15 19:57:25 +00001715 if (pt_prev) {
1716 deliver_skb(skb2, pt_prev, skb->dev);
1717 pt_prev = ptype;
1718 continue;
1719 }
1720
1721 skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001722 if (!skb2)
1723 break;
1724
Eric Dumazet70978182010-12-20 21:22:51 +00001725 net_timestamp_set(skb2);
1726
Linus Torvalds1da177e2005-04-16 15:20:36 -07001727 /* skb->nh should be correctly
1728 set by sender, so that the second statement is
1729 just protection against buggy protocols.
1730 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001731 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001732
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001733 if (skb_network_header(skb2) < skb2->data ||
Simon Hormanced14f62013-05-28 20:34:25 +00001734 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
Joe Perchese87cc472012-05-13 21:56:26 +00001735 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1736 ntohs(skb2->protocol),
1737 dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001738 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001739 }
1740
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001741 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001742 skb2->pkt_type = PACKET_OUTGOING;
Changli Gao71d9dec2010-12-15 19:57:25 +00001743 pt_prev = ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001744 }
1745 }
Changli Gao71d9dec2010-12-15 19:57:25 +00001746 if (pt_prev)
1747 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001748 rcu_read_unlock();
1749}
1750
Ben Hutchings2c530402012-07-10 10:55:09 +00001751/**
1752 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
John Fastabend4f57c082011-01-17 08:06:04 +00001753 * @dev: Network device
1754 * @txq: number of queues available
1755 *
1756 * If real_num_tx_queues is changed, the tc mappings may no longer be
1757 * valid. To resolve this, verify the tc mapping remains valid and, if
1758 * not, NULL the mapping. With no priorities mapping to this
1759 * offset/count pair it will no longer be used. In the worst case, if TC0
1760 * is invalid, nothing can be done, so disable priority mappings. It is
1761 * expected that drivers will fix this mapping if they can before
1762 * calling netif_set_real_num_tx_queues.
1763 */
Eric Dumazetbb134d22011-01-20 19:18:08 +00001764static void netif_setup_tc(struct net_device *dev, unsigned int txq)
John Fastabend4f57c082011-01-17 08:06:04 +00001765{
1766 int i;
1767 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1768
1769 /* If TC0 is invalidated disable TC mapping */
1770 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001771 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
John Fastabend4f57c082011-01-17 08:06:04 +00001772 dev->num_tc = 0;
1773 return;
1774 }
1775
1776 /* Invalidated prio to tc mappings set to TC0 */
1777 for (i = 1; i < TC_BITMASK + 1; i++) {
1778 int q = netdev_get_prio_tc_map(dev, i);
1779
1780 tc = &dev->tc_to_txq[q];
1781 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001782 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1783 i, q);
John Fastabend4f57c082011-01-17 08:06:04 +00001784 netdev_set_prio_tc_map(dev, i, 0);
1785 }
1786 }
1787}
1788
Alexander Duyck537c00d2013-01-10 08:57:02 +00001789#ifdef CONFIG_XPS
1790static DEFINE_MUTEX(xps_map_mutex);
1791#define xmap_dereference(P) \
1792 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1793
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001794static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1795 int cpu, u16 index)
1796{
1797 struct xps_map *map = NULL;
1798 int pos;
1799
1800 if (dev_maps)
1801 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1802
1803 for (pos = 0; map && pos < map->len; pos++) {
1804 if (map->queues[pos] == index) {
1805 if (map->len > 1) {
1806 map->queues[pos] = map->queues[--map->len];
1807 } else {
1808 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1809 kfree_rcu(map, rcu);
1810 map = NULL;
1811 }
1812 break;
1813 }
1814 }
1815
1816 return map;
1817}
1818
Alexander Duyck024e9672013-01-10 08:57:46 +00001819static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
Alexander Duyck537c00d2013-01-10 08:57:02 +00001820{
1821 struct xps_dev_maps *dev_maps;
Alexander Duyck024e9672013-01-10 08:57:46 +00001822 int cpu, i;
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001823 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001824
1825 mutex_lock(&xps_map_mutex);
1826 dev_maps = xmap_dereference(dev->xps_maps);
1827
1828 if (!dev_maps)
1829 goto out_no_maps;
1830
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001831 for_each_possible_cpu(cpu) {
Alexander Duyck024e9672013-01-10 08:57:46 +00001832 for (i = index; i < dev->num_tx_queues; i++) {
1833 if (!remove_xps_queue(dev_maps, cpu, i))
1834 break;
1835 }
1836 if (i == dev->num_tx_queues)
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001837 active = true;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001838 }
1839
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001840 if (!active) {
Alexander Duyck537c00d2013-01-10 08:57:02 +00001841 RCU_INIT_POINTER(dev->xps_maps, NULL);
1842 kfree_rcu(dev_maps, rcu);
1843 }
1844
Alexander Duyck024e9672013-01-10 08:57:46 +00001845 for (i = index; i < dev->num_tx_queues; i++)
1846 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1847 NUMA_NO_NODE);
1848
Alexander Duyck537c00d2013-01-10 08:57:02 +00001849out_no_maps:
1850 mutex_unlock(&xps_map_mutex);
1851}
1852
Alexander Duyck01c5f862013-01-10 08:57:35 +00001853static struct xps_map *expand_xps_map(struct xps_map *map,
1854 int cpu, u16 index)
1855{
1856 struct xps_map *new_map;
1857 int alloc_len = XPS_MIN_MAP_ALLOC;
1858 int i, pos;
1859
1860 for (pos = 0; map && pos < map->len; pos++) {
1861 if (map->queues[pos] != index)
1862 continue;
1863 return map;
1864 }
1865
1866 /* Need to add queue to this CPU's existing map */
1867 if (map) {
1868 if (pos < map->alloc_len)
1869 return map;
1870
1871 alloc_len = map->alloc_len * 2;
1872 }
1873
1874 /* Need to allocate new map to store queue on this CPU's map */
1875 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1876 cpu_to_node(cpu));
1877 if (!new_map)
1878 return NULL;
1879
1880 for (i = 0; i < pos; i++)
1881 new_map->queues[i] = map->queues[i];
1882 new_map->alloc_len = alloc_len;
1883 new_map->len = pos;
1884
1885 return new_map;
1886}
1887
Alexander Duyck537c00d2013-01-10 08:57:02 +00001888int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1889{
Alexander Duyck01c5f862013-01-10 08:57:35 +00001890 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001891 struct xps_map *map, *new_map;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001892 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001893 int cpu, numa_node_id = -2;
1894 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001895
1896 mutex_lock(&xps_map_mutex);
1897
1898 dev_maps = xmap_dereference(dev->xps_maps);
1899
Alexander Duyck01c5f862013-01-10 08:57:35 +00001900 /* allocate memory for queue storage */
1901 for_each_online_cpu(cpu) {
1902 if (!cpumask_test_cpu(cpu, mask))
1903 continue;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001904
Alexander Duyck01c5f862013-01-10 08:57:35 +00001905 if (!new_dev_maps)
1906 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00001907 if (!new_dev_maps) {
1908 mutex_unlock(&xps_map_mutex);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001909 return -ENOMEM;
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00001910 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00001911
1912 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1913 NULL;
1914
1915 map = expand_xps_map(map, cpu, index);
1916 if (!map)
1917 goto error;
1918
1919 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1920 }
1921
1922 if (!new_dev_maps)
1923 goto out_no_new_maps;
1924
1925 for_each_possible_cpu(cpu) {
1926 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1927 /* add queue to CPU maps */
1928 int pos = 0;
1929
1930 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1931 while ((pos < map->len) && (map->queues[pos] != index))
1932 pos++;
1933
1934 if (pos == map->len)
1935 map->queues[map->len++] = index;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001936#ifdef CONFIG_NUMA
Alexander Duyck537c00d2013-01-10 08:57:02 +00001937 if (numa_node_id == -2)
1938 numa_node_id = cpu_to_node(cpu);
1939 else if (numa_node_id != cpu_to_node(cpu))
1940 numa_node_id = -1;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001941#endif
Alexander Duyck01c5f862013-01-10 08:57:35 +00001942 } else if (dev_maps) {
1943 /* fill in the new device map from the old device map */
1944 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1945 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
Alexander Duyck537c00d2013-01-10 08:57:02 +00001946 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00001947
Alexander Duyck537c00d2013-01-10 08:57:02 +00001948 }
1949
Alexander Duyck01c5f862013-01-10 08:57:35 +00001950 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1951
Alexander Duyck537c00d2013-01-10 08:57:02 +00001952 /* Cleanup old maps */
Alexander Duyck01c5f862013-01-10 08:57:35 +00001953 if (dev_maps) {
1954 for_each_possible_cpu(cpu) {
1955 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1956 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1957 if (map && map != new_map)
1958 kfree_rcu(map, rcu);
1959 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001960
Alexander Duyck537c00d2013-01-10 08:57:02 +00001961 kfree_rcu(dev_maps, rcu);
Alexander Duyck01c5f862013-01-10 08:57:35 +00001962 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00001963
Alexander Duyck01c5f862013-01-10 08:57:35 +00001964 dev_maps = new_dev_maps;
1965 active = true;
1966
1967out_no_new_maps:
1968 /* update Tx queue numa node */
Alexander Duyck537c00d2013-01-10 08:57:02 +00001969 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1970 (numa_node_id >= 0) ? numa_node_id :
1971 NUMA_NO_NODE);
1972
Alexander Duyck01c5f862013-01-10 08:57:35 +00001973 if (!dev_maps)
1974 goto out_no_maps;
1975
1976 /* removes queue from unused CPUs */
1977 for_each_possible_cpu(cpu) {
1978 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1979 continue;
1980
1981 if (remove_xps_queue(dev_maps, cpu, index))
1982 active = true;
1983 }
1984
1985 /* free map if not active */
1986 if (!active) {
1987 RCU_INIT_POINTER(dev->xps_maps, NULL);
1988 kfree_rcu(dev_maps, rcu);
1989 }
1990
1991out_no_maps:
Alexander Duyck537c00d2013-01-10 08:57:02 +00001992 mutex_unlock(&xps_map_mutex);
1993
1994 return 0;
1995error:
Alexander Duyck01c5f862013-01-10 08:57:35 +00001996 /* remove any maps that we added */
1997 for_each_possible_cpu(cpu) {
1998 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1999 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2000 NULL;
2001 if (new_map && new_map != map)
2002 kfree(new_map);
2003 }
2004
Alexander Duyck537c00d2013-01-10 08:57:02 +00002005 mutex_unlock(&xps_map_mutex);
2006
Alexander Duyck537c00d2013-01-10 08:57:02 +00002007 kfree(new_dev_maps);
2008 return -ENOMEM;
2009}
2010EXPORT_SYMBOL(netif_set_xps_queue);
2011
2012#endif
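/*
 * Illustrative sketch, not from the original source: pinning transmit queue
 * @qid of a device to a single CPU with netif_set_xps_queue().  The function
 * name is hypothetical.
 */
#ifdef CONFIG_XPS
static int example_pin_tx_queue_to_cpu(struct net_device *dev, u16 qid,
				       int cpu)
{
	cpumask_var_t mask;
	int err;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_set_cpu(cpu, mask);
	err = netif_set_xps_queue(dev, mask, qid);
	free_cpumask_var(mask);

	return err;
}
#endif
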
John Fastabendf0796d52010-07-01 13:21:57 +00002013/*
2014 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2015 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2016 */
Tom Herberte6484932010-10-18 18:04:39 +00002017int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
John Fastabendf0796d52010-07-01 13:21:57 +00002018{
Tom Herbert1d24eb42010-11-21 13:17:27 +00002019 int rc;
2020
Tom Herberte6484932010-10-18 18:04:39 +00002021 if (txq < 1 || txq > dev->num_tx_queues)
2022 return -EINVAL;
John Fastabendf0796d52010-07-01 13:21:57 +00002023
Ben Hutchings5c565802011-02-15 19:39:21 +00002024 if (dev->reg_state == NETREG_REGISTERED ||
2025 dev->reg_state == NETREG_UNREGISTERING) {
Tom Herberte6484932010-10-18 18:04:39 +00002026 ASSERT_RTNL();
2027
Tom Herbert1d24eb42010-11-21 13:17:27 +00002028 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2029 txq);
Tom Herbertbf264142010-11-26 08:36:09 +00002030 if (rc)
2031 return rc;
2032
John Fastabend4f57c082011-01-17 08:06:04 +00002033 if (dev->num_tc)
2034 netif_setup_tc(dev, txq);
2035
Alexander Duyck024e9672013-01-10 08:57:46 +00002036 if (txq < dev->real_num_tx_queues) {
Tom Herberte6484932010-10-18 18:04:39 +00002037 qdisc_reset_all_tx_gt(dev, txq);
Alexander Duyck024e9672013-01-10 08:57:46 +00002038#ifdef CONFIG_XPS
2039 netif_reset_xps_queues_gt(dev, txq);
2040#endif
2041 }
John Fastabendf0796d52010-07-01 13:21:57 +00002042 }
Tom Herberte6484932010-10-18 18:04:39 +00002043
2044 dev->real_num_tx_queues = txq;
2045 return 0;
John Fastabendf0796d52010-07-01 13:21:57 +00002046}
2047EXPORT_SYMBOL(netif_set_real_num_tx_queues);
Denis Vlasenko56079432006-03-29 15:57:29 -08002048
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002049#ifdef CONFIG_RPS
2050/**
2051 * netif_set_real_num_rx_queues - set actual number of RX queues used
2052 * @dev: Network device
2053 * @rxq: Actual number of RX queues
2054 *
2055 * This must be called either with the rtnl_lock held or before
2056 * registration of the net device. Returns 0 on success, or a
Ben Hutchings4e7f7952010-10-08 10:33:39 -07002057 * negative error code. If called before registration, it always
2058 * succeeds.
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002059 */
2060int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2061{
2062 int rc;
2063
Tom Herbertbd25fa72010-10-18 18:00:16 +00002064 if (rxq < 1 || rxq > dev->num_rx_queues)
2065 return -EINVAL;
2066
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002067 if (dev->reg_state == NETREG_REGISTERED) {
2068 ASSERT_RTNL();
2069
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002070 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2071 rxq);
2072 if (rc)
2073 return rc;
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002074 }
2075
2076 dev->real_num_rx_queues = rxq;
2077 return 0;
2078}
2079EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2080#endif
2081
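/*
 * Illustrative sketch, not from the original source: a driver resizing its
 * active queue counts, e.g. from an ethtool set_channels handler.  Must run
 * under the RTNL once the device is registered; the name is hypothetical.
 */
static int example_set_channels(struct net_device *dev, unsigned int count)
{
	int err;

	err = netif_set_real_num_tx_queues(dev, count);
	if (err)
		return err;

#ifdef CONFIG_RPS
	err = netif_set_real_num_rx_queues(dev, count);
#endif
	return err;
}
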
Ben Hutchings2c530402012-07-10 10:55:09 +00002082/**
2083 * netif_get_num_default_rss_queues - default number of RSS queues
Yuval Mintz16917b82012-07-01 03:18:50 +00002084 *
2085 * This routine should set an upper limit on the number of RSS queues
2086 * used by default by multiqueue devices.
2087 */
Ben Hutchingsa55b1382012-07-10 10:54:38 +00002088int netif_get_num_default_rss_queues(void)
Yuval Mintz16917b82012-07-01 03:18:50 +00002089{
2090 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2091}
2092EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2093
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002094static inline void __netif_reschedule(struct Qdisc *q)
2095{
2096 struct softnet_data *sd;
2097 unsigned long flags;
2098
2099 local_irq_save(flags);
2100 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00002101 q->next_sched = NULL;
2102 *sd->output_queue_tailp = q;
2103 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002104 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2105 local_irq_restore(flags);
2106}
2107
David S. Miller37437bb2008-07-16 02:15:04 -07002108void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08002109{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002110 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2111 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08002112}
2113EXPORT_SYMBOL(__netif_schedule);
2114
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002115void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08002116{
David S. Miller3578b0c2010-08-03 00:24:04 -07002117 if (atomic_dec_and_test(&skb->users)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002118 struct softnet_data *sd;
2119 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08002120
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002121 local_irq_save(flags);
2122 sd = &__get_cpu_var(softnet_data);
2123 skb->next = sd->completion_queue;
2124 sd->completion_queue = skb;
2125 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2126 local_irq_restore(flags);
2127 }
Denis Vlasenko56079432006-03-29 15:57:29 -08002128}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002129EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08002130
2131void dev_kfree_skb_any(struct sk_buff *skb)
2132{
2133 if (in_irq() || irqs_disabled())
2134 dev_kfree_skb_irq(skb);
2135 else
2136 dev_kfree_skb(skb);
2137}
2138EXPORT_SYMBOL(dev_kfree_skb_any);
2139
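/*
 * Illustrative sketch, not from the original source: a TX completion handler
 * that may run in hardirq or process context frees skbs with
 * dev_kfree_skb_any(), which picks the right variant at runtime.
 */
static void example_tx_complete(struct sk_buff *skb)
{
	dev_kfree_skb_any(skb);
}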
2140
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002141/**
2142 * netif_device_detach - mark device as removed
2143 * @dev: network device
2144 *
2145 * Mark device as removed from system and therefore no longer available.
2146 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002147void netif_device_detach(struct net_device *dev)
2148{
2149 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2150 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002151 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002152 }
2153}
2154EXPORT_SYMBOL(netif_device_detach);
2155
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002156/**
2157 * netif_device_attach - mark device as attached
2158 * @dev: network device
2159 *
2160 * Mark device as attached from system and restart if needed.
2161 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002162void netif_device_attach(struct net_device *dev)
2163{
2164 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2165 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002166 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002167 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002168 }
2169}
2170EXPORT_SYMBOL(netif_device_attach);
2171
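/*
 * Illustrative sketch, not from the original source: the usual suspend/resume
 * pairing of these helpers in a driver's PM hooks.  The function names are
 * hypothetical and error handling is omitted.
 */
static void example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops TX queues if the device runs */
}

static void example_resume(struct net_device *dev)
{
	netif_device_attach(dev);	/* restarts queues and the watchdog */
}
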
Ben Hutchings36c92472012-01-17 07:57:56 +00002172static void skb_warn_bad_offload(const struct sk_buff *skb)
2173{
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002174 static const netdev_features_t null_features = 0;
Ben Hutchings36c92472012-01-17 07:57:56 +00002175 struct net_device *dev = skb->dev;
2176 const char *driver = "";
2177
Ben Greearc846ad92013-04-19 10:45:52 +00002178 if (!net_ratelimit())
2179 return;
2180
Ben Hutchings36c92472012-01-17 07:57:56 +00002181 if (dev && dev->dev.parent)
2182 driver = dev_driver_string(dev->dev.parent);
2183
2184 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2185 "gso_type=%d ip_summed=%d\n",
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002186 driver, dev ? &dev->features : &null_features,
2187 skb->sk ? &skb->sk->sk_route_caps : &null_features,
Ben Hutchings36c92472012-01-17 07:57:56 +00002188 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2189 skb_shinfo(skb)->gso_type, skb->ip_summed);
2190}
2191
Linus Torvalds1da177e2005-04-16 15:20:36 -07002192/*
2193 * Invalidate hardware checksum when packet is to be mangled, and
2194 * complete checksum manually on outgoing path.
2195 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07002196int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002197{
Al Virod3bc23e2006-11-14 21:24:49 -08002198 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07002199 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002200
Patrick McHardy84fa7932006-08-29 16:44:56 -07002201 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07002202 goto out_set_summed;
2203
2204 if (unlikely(skb_shinfo(skb)->gso_size)) {
Ben Hutchings36c92472012-01-17 07:57:56 +00002205 skb_warn_bad_offload(skb);
2206 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207 }
2208
Eric Dumazetcef401d2013-01-25 20:34:37 +00002209 /* Before computing a checksum, we should make sure no frag could
2210 * be modified by an external entity : checksum could be wrong.
2211 */
2212 if (skb_has_shared_frag(skb)) {
2213 ret = __skb_linearize(skb);
2214 if (ret)
2215 goto out;
2216 }
2217
Michał Mirosław55508d62010-12-14 15:24:08 +00002218 offset = skb_checksum_start_offset(skb);
Herbert Xua0308472007-10-15 01:47:15 -07002219 BUG_ON(offset >= skb_headlen(skb));
2220 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2221
2222 offset += skb->csum_offset;
2223 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2224
2225 if (skb_cloned(skb) &&
2226 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002227 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2228 if (ret)
2229 goto out;
2230 }
2231
Herbert Xua0308472007-10-15 01:47:15 -07002232 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07002233out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002234 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002235out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002236 return ret;
2237}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002238EXPORT_SYMBOL(skb_checksum_help);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002239
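/*
 * Illustrative sketch, not from the original source: a transmit path that
 * cannot offload the checksum for this skb falls back to skb_checksum_help()
 * before handing the packet to hardware.  The function name is hypothetical.
 */
static int example_tx_checksum_fallback(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
		return -EIO;	/* caller should drop the packet */

	return 0;
}
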
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002240__be16 skb_network_protocol(struct sk_buff *skb)
2241{
2242 __be16 type = skb->protocol;
David S. Miller61816592013-03-20 12:46:26 -04002243 int vlan_depth = ETH_HLEN;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002244
Pravin B Shelar19acc322013-05-07 20:41:07 +00002245 /* Tunnel gso handlers can set protocol to ethernet. */
2246 if (type == htons(ETH_P_TEB)) {
2247 struct ethhdr *eth;
2248
2249 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2250 return 0;
2251
2252 eth = (struct ethhdr *)skb_mac_header(skb);
2253 type = eth->h_proto;
2254 }
2255
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002256 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002257 struct vlan_hdr *vh;
2258
2259 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2260 return 0;
2261
2262 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2263 type = vh->h_vlan_encapsulated_proto;
2264 vlan_depth += VLAN_HLEN;
2265 }
2266
2267 return type;
2268}
2269
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002270/**
2271 * skb_mac_gso_segment - mac layer segmentation handler.
2272 * @skb: buffer to segment
2273 * @features: features for the output path (see dev->features)
2274 */
2275struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2276 netdev_features_t features)
2277{
2278 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2279 struct packet_offload *ptype;
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002280 __be16 type = skb_network_protocol(skb);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002281
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002282 if (unlikely(!type))
2283 return ERR_PTR(-EINVAL);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002284
2285 __skb_pull(skb, skb->mac_len);
2286
2287 rcu_read_lock();
2288 list_for_each_entry_rcu(ptype, &offload_base, list) {
2289 if (ptype->type == type && ptype->callbacks.gso_segment) {
2290 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2291 int err;
2292
2293 err = ptype->callbacks.gso_send_check(skb);
2294 segs = ERR_PTR(err);
2295 if (err || skb_gso_ok(skb, features))
2296 break;
2297 __skb_push(skb, (skb->data -
2298 skb_network_header(skb)));
2299 }
2300 segs = ptype->callbacks.gso_segment(skb, features);
2301 break;
2302 }
2303 }
2304 rcu_read_unlock();
2305
2306 __skb_push(skb, skb->data - skb_mac_header(skb));
2307
2308 return segs;
2309}
2310EXPORT_SYMBOL(skb_mac_gso_segment);
2311
2312
Cong Wang12b00042013-02-05 16:36:38 +00002313/* openvswitch calls this on the rx path, so we need a different check.
2314 */
2315static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2316{
2317 if (tx_path)
2318 return skb->ip_summed != CHECKSUM_PARTIAL;
2319 else
2320 return skb->ip_summed == CHECKSUM_NONE;
2321}
2322
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002323/**
Cong Wang12b00042013-02-05 16:36:38 +00002324 * __skb_gso_segment - Perform segmentation on skb.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002325 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07002326 * @features: features for the output path (see dev->features)
Cong Wang12b00042013-02-05 16:36:38 +00002327 * @tx_path: whether it is called in TX path
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002328 *
2329 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07002330 *
2331 * It may return NULL if the skb requires no segmentation. This is
2332 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002333 */
Cong Wang12b00042013-02-05 16:36:38 +00002334struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2335 netdev_features_t features, bool tx_path)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002336{
Cong Wang12b00042013-02-05 16:36:38 +00002337 if (unlikely(skb_needs_check(skb, tx_path))) {
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002338 int err;
2339
Ben Hutchings36c92472012-01-17 07:57:56 +00002340 skb_warn_bad_offload(skb);
Herbert Xu67fd1a72009-01-19 16:26:44 -08002341
Herbert Xua430a432006-07-08 13:34:56 -07002342 if (skb_header_cloned(skb) &&
2343 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2344 return ERR_PTR(err);
2345 }
2346
Pravin B Shelar68c33162013-02-14 14:02:41 +00002347 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002348 skb_reset_mac_header(skb);
2349 skb_reset_mac_len(skb);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002350
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002351 return skb_mac_gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002352}
Cong Wang12b00042013-02-05 16:36:38 +00002353EXPORT_SYMBOL(__skb_gso_segment);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002354
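/*
 * Illustrative sketch, not from the original source: a software GSO fallback
 * on the TX path.  A real caller would transmit each segment; here the list
 * is simply walked and released to keep the sketch short.  The function name
 * is hypothetical.
 */
static int example_soft_segment(struct sk_buff *skb)
{
	struct sk_buff *segs;

	segs = __skb_gso_segment(skb, netif_skb_features(skb), true);
	if (IS_ERR(segs))
		return PTR_ERR(segs);

	if (segs) {
		consume_skb(skb);	/* segments replace the original skb */
		while (segs) {
			struct sk_buff *next = segs->next;

			segs->next = NULL;
			kfree_skb(segs);
			segs = next;
		}
	}

	return 0;
}
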
Herbert Xufb286bb2005-11-10 13:01:24 -08002355/* Take action when hardware reception checksum errors are detected. */
2356#ifdef CONFIG_BUG
2357void netdev_rx_csum_fault(struct net_device *dev)
2358{
2359 if (net_ratelimit()) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00002360 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08002361 dump_stack();
2362 }
2363}
2364EXPORT_SYMBOL(netdev_rx_csum_fault);
2365#endif
2366
Linus Torvalds1da177e2005-04-16 15:20:36 -07002367/* Actually, we should eliminate this check as soon as we know that:
2368 * 1. An IOMMU is present and can map all the memory.
2369 * 2. No high memory really exists on this machine.
2370 */
2371
Eric Dumazet9092c652010-04-02 13:34:49 -07002372static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002373{
Herbert Xu3d3a8532006-06-27 13:33:10 -07002374#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07002375 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002376 if (!(dev->features & NETIF_F_HIGHDMA)) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002377 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2378 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2379 if (PageHighMem(skb_frag_page(frag)))
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002380 return 1;
Ian Campbellea2ab692011-08-22 23:44:58 +00002381 }
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002382 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002383
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002384 if (PCI_DMA_BUS_IS_PHYS) {
2385 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002386
Eric Dumazet9092c652010-04-02 13:34:49 -07002387 if (!pdev)
2388 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002389 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002390 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2391 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002392 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2393 return 1;
2394 }
2395 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07002396#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002397 return 0;
2398}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002399
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002400struct dev_gso_cb {
2401 void (*destructor)(struct sk_buff *skb);
2402};
2403
2404#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2405
2406static void dev_gso_skb_destructor(struct sk_buff *skb)
2407{
2408 struct dev_gso_cb *cb;
2409
2410 do {
2411 struct sk_buff *nskb = skb->next;
2412
2413 skb->next = nskb->next;
2414 nskb->next = NULL;
2415 kfree_skb(nskb);
2416 } while (skb->next);
2417
2418 cb = DEV_GSO_CB(skb);
2419 if (cb->destructor)
2420 cb->destructor(skb);
2421}
2422
2423/**
2424 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2425 * @skb: buffer to segment
Jesse Gross91ecb632011-01-09 06:23:33 +00002426 * @features: device features as applicable to this skb
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002427 *
2428 * This function segments the given skb and stores the list of segments
2429 * in skb->next.
2430 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002431static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002432{
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002433 struct sk_buff *segs;
2434
Herbert Xu576a30e2006-06-27 13:22:38 -07002435 segs = skb_gso_segment(skb, features);
2436
2437 /* Verifying header integrity only. */
2438 if (!segs)
2439 return 0;
2440
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07002441 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002442 return PTR_ERR(segs);
2443
2444 skb->next = segs;
2445 DEV_GSO_CB(skb)->destructor = skb->destructor;
2446 skb->destructor = dev_gso_skb_destructor;
2447
2448 return 0;
2449}
2450
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002451static netdev_features_t harmonize_features(struct sk_buff *skb,
2452 __be16 protocol, netdev_features_t features)
Jesse Grossf01a5232011-01-09 06:23:31 +00002453{
Ed Cashinc0d680e2012-09-19 15:49:00 +00002454 if (skb->ip_summed != CHECKSUM_NONE &&
2455 !can_checksum_protocol(features, protocol)) {
Jesse Grossf01a5232011-01-09 06:23:31 +00002456 features &= ~NETIF_F_ALL_CSUM;
Jesse Grossf01a5232011-01-09 06:23:31 +00002457 } else if (illegal_highdma(skb->dev, skb)) {
2458 features &= ~NETIF_F_SG;
2459 }
2460
2461 return features;
2462}
2463
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002464netdev_features_t netif_skb_features(struct sk_buff *skb)
Jesse Gross58e998c2010-10-29 12:14:55 +00002465{
2466 __be16 protocol = skb->protocol;
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002467 netdev_features_t features = skb->dev->features;
Jesse Gross58e998c2010-10-29 12:14:55 +00002468
Ben Hutchings30b678d2012-07-30 15:57:00 +00002469 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2470 features &= ~NETIF_F_GSO_MASK;
2471
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002472 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
Jesse Gross58e998c2010-10-29 12:14:55 +00002473 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2474 protocol = veh->h_vlan_encapsulated_proto;
Jesse Grossf01a5232011-01-09 06:23:31 +00002475 } else if (!vlan_tx_tag_present(skb)) {
2476 return harmonize_features(skb, protocol, features);
2477 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002478
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002479 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2480 NETIF_F_HW_VLAN_STAG_TX);
Jesse Grossf01a5232011-01-09 06:23:31 +00002481
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002482 if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
Jesse Grossf01a5232011-01-09 06:23:31 +00002483 return harmonize_features(skb, protocol, features);
2484 } else {
2485 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
Patrick McHardy8ad227f2013-04-19 02:04:31 +00002486 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2487 NETIF_F_HW_VLAN_STAG_TX;
Jesse Grossf01a5232011-01-09 06:23:31 +00002488 return harmonize_features(skb, protocol, features);
2489 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002490}
Jesse Grossf01a5232011-01-09 06:23:31 +00002491EXPORT_SYMBOL(netif_skb_features);
Jesse Gross58e998c2010-10-29 12:14:55 +00002492
John Fastabend6afff0c2010-06-16 14:18:12 +00002493/*
2494 * Returns true if either:
2495 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
Rami Rosend1a53df2012-08-27 23:39:24 +00002496 * 2. skb is fragmented and the device does not support SG.
John Fastabend6afff0c2010-06-16 14:18:12 +00002497 */
2498static inline int skb_needs_linearize(struct sk_buff *skb,
Patrick McHardy6708c9e2013-05-01 22:36:49 +00002499 netdev_features_t features)
John Fastabend6afff0c2010-06-16 14:18:12 +00002500{
Jesse Gross02932ce2011-01-09 06:23:34 +00002501 return skb_is_nonlinear(skb) &&
2502 ((skb_has_frag_list(skb) &&
2503 !(features & NETIF_F_FRAGLIST)) ||
Jesse Grosse1e78db2010-10-29 12:14:53 +00002504 (skb_shinfo(skb)->nr_frags &&
Jesse Gross02932ce2011-01-09 06:23:34 +00002505 !(features & NETIF_F_SG)));
John Fastabend6afff0c2010-06-16 14:18:12 +00002506}
2507
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002508int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2509 struct netdev_queue *txq)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002510{
Stephen Hemminger00829822008-11-20 20:14:53 -08002511 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00002512 int rc = NETDEV_TX_OK;
Koki Sanagiec764bf2011-05-30 21:48:34 +00002513 unsigned int skb_len;
Stephen Hemminger00829822008-11-20 20:14:53 -08002514
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002515 if (likely(!skb->next)) {
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002516 netdev_features_t features;
Jesse Grossfc741212011-01-09 06:23:32 +00002517
Eric Dumazet93f154b2009-05-18 22:19:19 -07002518 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03002519 * If the device doesn't need skb->dst, release it right now while
Eric Dumazet93f154b2009-05-18 22:19:19 -07002520 * it's hot in this CPU's cache
2521 */
Eric Dumazetadf30902009-06-02 05:19:30 +00002522 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2523 skb_dst_drop(skb);
2524
Jesse Grossfc741212011-01-09 06:23:32 +00002525 features = netif_skb_features(skb);
2526
Jesse Gross7b9c6092010-10-20 13:56:04 +00002527 if (vlan_tx_tag_present(skb) &&
Patrick McHardy86a9bad2013-04-19 02:04:30 +00002528 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2529 skb = __vlan_put_tag(skb, skb->vlan_proto,
2530 vlan_tx_tag_get(skb));
Jesse Gross7b9c6092010-10-20 13:56:04 +00002531 if (unlikely(!skb))
2532 goto out;
2533
2534 skb->vlan_tci = 0;
2535 }
2536
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002537 /* If this is an encapsulation offload request, verify we are testing
2538 * hardware encapsulation features instead of standard
2539 * features for the netdev
2540 */
2541 if (skb->encapsulation)
2542 features &= dev->hw_enc_features;
2543
Jesse Grossfc741212011-01-09 06:23:32 +00002544 if (netif_needs_gso(skb, features)) {
Jesse Gross91ecb632011-01-09 06:23:33 +00002545 if (unlikely(dev_gso_segment(skb, features)))
David S. Miller9ccb8972010-04-22 01:02:07 -07002546 goto out_kfree_skb;
2547 if (skb->next)
2548 goto gso;
John Fastabend6afff0c2010-06-16 14:18:12 +00002549 } else {
Jesse Gross02932ce2011-01-09 06:23:34 +00002550 if (skb_needs_linearize(skb, features) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002551 __skb_linearize(skb))
2552 goto out_kfree_skb;
2553
2554			/* If the packet is not checksummed and the device does not
2555			 * support checksumming for this protocol, complete the
2556			 * checksum here.
2557 */
2558 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Alexander Duyckfc70fb62012-12-07 14:14:15 +00002559 if (skb->encapsulation)
2560 skb_set_inner_transport_header(skb,
2561 skb_checksum_start_offset(skb));
2562 else
2563 skb_set_transport_header(skb,
2564 skb_checksum_start_offset(skb));
Jesse Gross03634662011-01-09 06:23:35 +00002565 if (!(features & NETIF_F_ALL_CSUM) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002566 skb_checksum_help(skb))
2567 goto out_kfree_skb;
2568 }
David S. Miller9ccb8972010-04-22 01:02:07 -07002569 }
2570
Eric Dumazetb40863c2012-09-18 20:44:49 +00002571 if (!list_empty(&ptype_all))
2572 dev_queue_xmit_nit(skb, dev);
2573
Koki Sanagiec764bf2011-05-30 21:48:34 +00002574 skb_len = skb->len;
Patrick Ohlyac45f602009-02-12 05:03:37 +00002575 rc = ops->ndo_start_xmit(skb, dev);
Koki Sanagiec764bf2011-05-30 21:48:34 +00002576 trace_net_dev_xmit(skb, rc, dev, skb_len);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002577 if (rc == NETDEV_TX_OK)
Eric Dumazet08baf562009-05-25 22:58:01 -07002578 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00002579 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002580 }
2581
Herbert Xu576a30e2006-06-27 13:22:38 -07002582gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002583 do {
2584 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002585
2586 skb->next = nskb->next;
2587 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00002588
Eric Dumazetb40863c2012-09-18 20:44:49 +00002589 if (!list_empty(&ptype_all))
2590 dev_queue_xmit_nit(nskb, dev);
2591
Koki Sanagiec764bf2011-05-30 21:48:34 +00002592 skb_len = nskb->len;
Stephen Hemminger00829822008-11-20 20:14:53 -08002593 rc = ops->ndo_start_xmit(nskb, dev);
Koki Sanagiec764bf2011-05-30 21:48:34 +00002594 trace_net_dev_xmit(nskb, rc, dev, skb_len);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002595 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002596 if (rc & ~NETDEV_TX_MASK)
2597 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07002598 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002599 skb->next = nskb;
2600 return rc;
2601 }
Eric Dumazet08baf562009-05-25 22:58:01 -07002602 txq_trans_update(txq);
Tom Herbert734664982011-11-28 16:32:44 +00002603 if (unlikely(netif_xmit_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07002604 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002605 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002606
Patrick McHardy572a9d72009-11-10 06:14:14 +00002607out_kfree_gso_skb:
Sridhar Samudrala0c772152013-04-29 13:02:42 +00002608 if (likely(skb->next == NULL)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002609 skb->destructor = DEV_GSO_CB(skb)->destructor;
Sridhar Samudrala0c772152013-04-29 13:02:42 +00002610 consume_skb(skb);
2611 return rc;
2612 }
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002613out_kfree_skb:
2614 kfree_skb(skb);
Jesse Gross7b9c6092010-10-20 13:56:04 +00002615out:
Patrick McHardy572a9d72009-11-10 06:14:14 +00002616 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002617}
2618
Eric Dumazet1def9232013-01-10 12:36:42 +00002619static void qdisc_pkt_len_init(struct sk_buff *skb)
2620{
2621 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2622
2623 qdisc_skb_cb(skb)->pkt_len = skb->len;
2624
2625	/* To get a more precise estimate of bytes sent on the wire,
2626	 * we add to pkt_len the header size of all segments
2627 */
2628 if (shinfo->gso_size) {
Eric Dumazet757b8b12013-01-15 21:14:21 -08002629 unsigned int hdr_len;
Jason Wang15e5a032013-03-25 20:19:59 +00002630 u16 gso_segs = shinfo->gso_segs;
Eric Dumazet1def9232013-01-10 12:36:42 +00002631
Eric Dumazet757b8b12013-01-15 21:14:21 -08002632 /* mac layer + network layer */
2633 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2634
2635 /* + transport layer */
Eric Dumazet1def9232013-01-10 12:36:42 +00002636 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2637 hdr_len += tcp_hdrlen(skb);
2638 else
2639 hdr_len += sizeof(struct udphdr);
Jason Wang15e5a032013-03-25 20:19:59 +00002640
2641 if (shinfo->gso_type & SKB_GSO_DODGY)
2642 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2643 shinfo->gso_size);
2644
2645 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
Eric Dumazet1def9232013-01-10 12:36:42 +00002646 }
2647}
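/*
 * Editor's note, a worked example with hypothetical numbers: a TCP GSO skb
 * with gso_size = 1448, gso_segs = 10 and hdr_len = 66 (14 MAC + 20 IP +
 * 32 TCP) has skb->len = 66 + 10 * 1448 = 14546, but produces ten wire
 * frames of 66 + 1448 = 1514 bytes each. qdisc_pkt_len_init() therefore
 * reports pkt_len = 14546 + (10 - 1) * 66 = 15140, matching the bytes
 * actually handed to the device (preamble/FCS/IFG are not counted).
 */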
2648
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002649static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2650 struct net_device *dev,
2651 struct netdev_queue *txq)
2652{
2653 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002654 bool contended;
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002655 int rc;
2656
Eric Dumazet1def9232013-01-10 12:36:42 +00002657 qdisc_pkt_len_init(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002658 qdisc_calculate_pkt_len(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002659 /*
2660 * Heuristic to force contended enqueues to serialize on a
2661	 * separate lock before trying to get the qdisc main lock.
2662	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2663 * and dequeue packets faster.
2664 */
Eric Dumazeta2da5702011-01-20 03:48:19 +00002665 contended = qdisc_is_running(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002666 if (unlikely(contended))
2667 spin_lock(&q->busylock);
2668
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002669 spin_lock(root_lock);
2670 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2671 kfree_skb(skb);
2672 rc = NET_XMIT_DROP;
2673 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07002674 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002675 /*
2676 * This is a work-conserving queue; there are no old skbs
2677 * waiting to be sent out; and the qdisc is not running -
2678 * xmit the skb directly.
2679 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002680 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2681 skb_dst_force(skb);
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002682
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002683 qdisc_bstats_update(q, skb);
2684
Eric Dumazet79640a42010-06-02 05:09:29 -07002685 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2686 if (unlikely(contended)) {
2687 spin_unlock(&q->busylock);
2688 contended = false;
2689 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002690 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002691 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07002692 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002693
2694 rc = NET_XMIT_SUCCESS;
2695 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002696 skb_dst_force(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002697 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
Eric Dumazet79640a42010-06-02 05:09:29 -07002698 if (qdisc_run_begin(q)) {
2699 if (unlikely(contended)) {
2700 spin_unlock(&q->busylock);
2701 contended = false;
2702 }
2703 __qdisc_run(q);
2704 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002705 }
2706 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07002707 if (unlikely(contended))
2708 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002709 return rc;
2710}
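/*
 * Editor's note on the busylock heuristic above (a simplified picture, not
 * an exact trace): while one CPU owns __QDISC_STATE_RUNNING and keeps
 * dequeuing in __qdisc_run(), the other CPUs that want to enqueue first
 * serialize on q->busylock, so at most one of them competes with the owner
 * for the qdisc root lock at any moment and the owner drains packets faster.
 */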
2711
Neil Horman5bc14212011-11-22 05:10:51 +00002712#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2713static void skb_update_prio(struct sk_buff *skb)
2714{
Igor Maravic6977a792011-11-25 07:44:54 +00002715 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
Neil Horman5bc14212011-11-22 05:10:51 +00002716
Eric Dumazet91c68ce2012-07-08 21:45:10 +00002717 if (!skb->priority && skb->sk && map) {
2718 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2719
2720 if (prioidx < map->priomap_len)
2721 skb->priority = map->priomap[prioidx];
2722 }
Neil Horman5bc14212011-11-22 05:10:51 +00002723}
2724#else
2725#define skb_update_prio(skb)
2726#endif
2727
Eric Dumazet745e20f2010-09-29 13:23:09 -07002728static DEFINE_PER_CPU(int, xmit_recursion);
David S. Miller11a766c2010-10-25 12:51:55 -07002729#define RECURSION_LIMIT 10
Eric Dumazet745e20f2010-09-29 13:23:09 -07002730
Dave Jonesd29f7492008-07-22 14:09:06 -07002731/**
Michel Machado95603e22012-06-12 10:16:35 +00002732 * dev_loopback_xmit - loop back @skb
2733 * @skb: buffer to transmit
2734 */
2735int dev_loopback_xmit(struct sk_buff *skb)
2736{
2737 skb_reset_mac_header(skb);
2738 __skb_pull(skb, skb_network_offset(skb));
2739 skb->pkt_type = PACKET_LOOPBACK;
2740 skb->ip_summed = CHECKSUM_UNNECESSARY;
2741 WARN_ON(!skb_dst(skb));
2742 skb_dst_force(skb);
2743 netif_rx_ni(skb);
2744 return 0;
2745}
2746EXPORT_SYMBOL(dev_loopback_xmit);
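/*
 * Editor's sketch of a typical use (hypothetical caller): the IP output
 * path loops a copy of an outbound multicast/broadcast skb back to the
 * local stack. The skb must already carry a valid dst, as the WARN_ON
 * above documents.
 *
 *	struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
 *
 *	if (nskb)
 *		dev_loopback_xmit(nskb);
 */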
2747
2748/**
Dave Jonesd29f7492008-07-22 14:09:06 -07002749 * dev_queue_xmit - transmit a buffer
2750 * @skb: buffer to transmit
2751 *
2752 * Queue a buffer for transmission to a network device. The caller must
2753 * have set the device and priority and built the buffer before calling
2754 * this function. The function can be called from an interrupt.
2755 *
2756 * A negative errno code is returned on a failure. A success does not
2757 * guarantee the frame will be transmitted as it may be dropped due
2758 * to congestion or traffic shaping.
2759 *
2760 * -----------------------------------------------------------------------------------
2761 * I notice this method can also return errors from the queue disciplines,
2762 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2763 * be positive.
2764 *
2765 * Regardless of the return value, the skb is consumed, so it is currently
2766 * difficult to retry a send to this method. (You can bump the ref count
2767 * before sending to hold a reference for retry if you are careful.)
2768 *
2769 * When calling this method, interrupts MUST be enabled. This is because
2770 * the BH enable code must have IRQs enabled so that it will not deadlock.
2771 * --BLG
2772 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002773int dev_queue_xmit(struct sk_buff *skb)
2774{
2775 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002776 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002777 struct Qdisc *q;
2778 int rc = -ENOMEM;
2779
Eric Dumazet6d1ccff2013-02-05 20:22:20 +00002780 skb_reset_mac_header(skb);
2781
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002782 /* Disable soft irqs for various locks below. Also
2783 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002784 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002785 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002786
Neil Horman5bc14212011-11-22 05:10:51 +00002787 skb_update_prio(skb);
2788
Amerigo Wang8c4c49d2012-09-17 20:16:31 +00002789 txq = netdev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002790 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002791
Linus Torvalds1da177e2005-04-16 15:20:36 -07002792#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002793 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002794#endif
Koki Sanagicf66ba52010-08-23 18:45:02 +09002795 trace_net_dev_queue(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002796 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002797 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002798 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002799 }
2800
2801 /* The device has no queue. Common case for software devices:
2802	   loopback, all sorts of tunnels...
2803
Herbert Xu932ff272006-06-09 12:20:56 -07002804 Really, it is unlikely that netif_tx_lock protection is necessary
2805 here. (f.e. loopback and IP tunnels are clean ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002806 counters.)
2807	   However, it is possible that they rely on the protection
2808	   we provide here.
2809
2810	   Check this and take the lock. It is not prone to deadlocks.
2811	   Either way, the noqueue qdisc case is even simpler 8)
2812 */
2813 if (dev->flags & IFF_UP) {
2814 int cpu = smp_processor_id(); /* ok because BHs are off */
2815
David S. Millerc773e842008-07-08 23:13:53 -07002816 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002817
Eric Dumazet745e20f2010-09-29 13:23:09 -07002818 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2819 goto recursion_alert;
2820
David S. Millerc773e842008-07-08 23:13:53 -07002821 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002822
Tom Herbert734664982011-11-28 16:32:44 +00002823 if (!netif_xmit_stopped(txq)) {
Eric Dumazet745e20f2010-09-29 13:23:09 -07002824 __this_cpu_inc(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002825 rc = dev_hard_start_xmit(skb, dev, txq);
Eric Dumazet745e20f2010-09-29 13:23:09 -07002826 __this_cpu_dec(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002827 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002828 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002829 goto out;
2830 }
2831 }
David S. Millerc773e842008-07-08 23:13:53 -07002832 HARD_TX_UNLOCK(dev, txq);
Joe Perchese87cc472012-05-13 21:56:26 +00002833 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2834 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002835 } else {
2836			/* Recursion has been detected! It can happen,
Eric Dumazet745e20f2010-09-29 13:23:09 -07002837			 * unfortunately.
2838 */
2839recursion_alert:
Joe Perchese87cc472012-05-13 21:56:26 +00002840 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2841 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002842 }
2843 }
2844
2845 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002846 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002847
Linus Torvalds1da177e2005-04-16 15:20:36 -07002848 kfree_skb(skb);
2849 return rc;
2850out:
Herbert Xud4828d82006-06-22 02:28:18 -07002851 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002852 return rc;
2853}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002854EXPORT_SYMBOL(dev_queue_xmit);
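/*
 * Editor's sketch of a typical caller (helper names are hypothetical); per
 * the comment above, the device and priority are set and the frame,
 * including its link-layer header, is fully built before the call. A
 * negative return is an errno, a positive one a NET_XMIT_* code, and the
 * skb is consumed either way.
 *
 *	skb = alloc_skb(frame_len + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, LL_RESERVED_SPACE(dev));
 *	build_frame(skb_put(skb, frame_len));
 *	skb->dev = dev;
 *	skb->priority = TC_PRIO_CONTROL;
 *	rc = dev_queue_xmit(skb);
 */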
Linus Torvalds1da177e2005-04-16 15:20:36 -07002855
2856
2857/*=======================================================================
2858 Receiver routines
2859 =======================================================================*/
2860
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002861int netdev_max_backlog __read_mostly = 1000;
Eric Dumazetc9e6bc62012-09-27 19:29:05 +00002862EXPORT_SYMBOL(netdev_max_backlog);
2863
Eric Dumazet3b098e22010-05-15 23:57:10 -07002864int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002865int netdev_budget __read_mostly = 300;
2866int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002867
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002868/* Called with irq disabled */
2869static inline void ____napi_schedule(struct softnet_data *sd,
2870 struct napi_struct *napi)
2871{
2872 list_add_tail(&napi->poll_list, &sd->poll_list);
2873 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2874}
2875
Eric Dumazetdf334542010-03-24 19:13:54 +00002876#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002877
2878/* One global table that all flow-based protocols share. */
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002879struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002880EXPORT_SYMBOL(rps_sock_flow_table);
2881
Ingo Molnarc5905af2012-02-24 08:31:31 +01002882struct static_key rps_needed __read_mostly;
Eric Dumazetadc93002011-11-17 03:13:26 +00002883
Ben Hutchingsc4454772011-01-19 11:03:53 +00002884static struct rps_dev_flow *
2885set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2886 struct rps_dev_flow *rflow, u16 next_cpu)
2887{
Ben Hutchings09994d12011-10-03 04:42:46 +00002888 if (next_cpu != RPS_NO_CPU) {
Ben Hutchingsc4454772011-01-19 11:03:53 +00002889#ifdef CONFIG_RFS_ACCEL
2890 struct netdev_rx_queue *rxqueue;
2891 struct rps_dev_flow_table *flow_table;
2892 struct rps_dev_flow *old_rflow;
2893 u32 flow_id;
2894 u16 rxq_index;
2895 int rc;
2896
2897 /* Should we steer this flow to a different hardware queue? */
Ben Hutchings69a19ee2011-02-15 20:32:04 +00002898 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2899 !(dev->features & NETIF_F_NTUPLE))
Ben Hutchingsc4454772011-01-19 11:03:53 +00002900 goto out;
2901 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2902 if (rxq_index == skb_get_rx_queue(skb))
2903 goto out;
2904
2905 rxqueue = dev->_rx + rxq_index;
2906 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2907 if (!flow_table)
2908 goto out;
2909 flow_id = skb->rxhash & flow_table->mask;
2910 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2911 rxq_index, flow_id);
2912 if (rc < 0)
2913 goto out;
2914 old_rflow = rflow;
2915 rflow = &flow_table->flows[flow_id];
Ben Hutchingsc4454772011-01-19 11:03:53 +00002916 rflow->filter = rc;
2917 if (old_rflow->filter == rflow->filter)
2918 old_rflow->filter = RPS_NO_FILTER;
2919 out:
2920#endif
2921 rflow->last_qtail =
Ben Hutchings09994d12011-10-03 04:42:46 +00002922 per_cpu(softnet_data, next_cpu).input_queue_head;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002923 }
2924
Ben Hutchings09994d12011-10-03 04:42:46 +00002925 rflow->cpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00002926 return rflow;
2927}
2928
Tom Herbert0a9627f2010-03-16 08:03:29 +00002929/*
2930 * get_rps_cpu is called from netif_receive_skb and returns the target
2931 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002932 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002933 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002934static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2935 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002936{
Tom Herbert0a9627f2010-03-16 08:03:29 +00002937 struct netdev_rx_queue *rxqueue;
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002938 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002939 struct rps_dev_flow_table *flow_table;
2940 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002941 int cpu = -1;
Tom Herbertfec5e652010-04-16 16:01:27 -07002942 u16 tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002943
Tom Herbert0a9627f2010-03-16 08:03:29 +00002944 if (skb_rx_queue_recorded(skb)) {
2945 u16 index = skb_get_rx_queue(skb);
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002946 if (unlikely(index >= dev->real_num_rx_queues)) {
2947 WARN_ONCE(dev->real_num_rx_queues > 1,
2948 "%s received packet on queue %u, but number "
2949 "of RX queues is %u\n",
2950 dev->name, index, dev->real_num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002951 goto done;
2952 }
2953 rxqueue = dev->_rx + index;
2954 } else
2955 rxqueue = dev->_rx;
2956
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002957 map = rcu_dereference(rxqueue->rps_map);
2958 if (map) {
Tom Herbert85875232011-01-31 16:23:42 -08002959 if (map->len == 1 &&
Eric Dumazet33d480c2011-08-11 19:30:52 +00002960 !rcu_access_pointer(rxqueue->rps_flow_table)) {
Changli Gao6febfca2010-09-03 23:12:37 +00002961 tcpu = map->cpus[0];
2962 if (cpu_online(tcpu))
2963 cpu = tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002964 goto done;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002965 }
Eric Dumazet33d480c2011-08-11 19:30:52 +00002966 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002967 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002968 }
2969
Changli Gao2d47b452010-08-17 19:00:56 +00002970 skb_reset_network_header(skb);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002971 if (!skb_get_rxhash(skb))
Tom Herbert0a9627f2010-03-16 08:03:29 +00002972 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002973
Tom Herbertfec5e652010-04-16 16:01:27 -07002974 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2975 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2976 if (flow_table && sock_flow_table) {
2977 u16 next_cpu;
2978 struct rps_dev_flow *rflow;
2979
2980 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2981 tcpu = rflow->cpu;
2982
2983 next_cpu = sock_flow_table->ents[skb->rxhash &
2984 sock_flow_table->mask];
2985
2986 /*
2987 * If the desired CPU (where last recvmsg was done) is
2988 * different from current CPU (one in the rx-queue flow
2989 * table entry), switch if one of the following holds:
2990 * - Current CPU is unset (equal to RPS_NO_CPU).
2991 * - Current CPU is offline.
2992 * - The current CPU's queue tail has advanced beyond the
2993 * last packet that was enqueued using this table entry.
2994 * This guarantees that all previous packets for the flow
2995	 * have been dequeued, thus preserving in-order delivery.
2996 */
2997 if (unlikely(tcpu != next_cpu) &&
2998 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2999 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
Tom Herbertbaefa312012-11-16 09:04:15 +00003000 rflow->last_qtail)) >= 0)) {
3001 tcpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00003002 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
Tom Herbertbaefa312012-11-16 09:04:15 +00003003 }
Ben Hutchingsc4454772011-01-19 11:03:53 +00003004
Tom Herbertfec5e652010-04-16 16:01:27 -07003005 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3006 *rflowp = rflow;
3007 cpu = tcpu;
3008 goto done;
3009 }
3010 }
3011
Tom Herbert0a9627f2010-03-16 08:03:29 +00003012 if (map) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003013 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00003014
3015 if (cpu_online(tcpu)) {
3016 cpu = tcpu;
3017 goto done;
3018 }
3019 }
3020
3021done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00003022 return cpu;
3023}
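/*
 * Editor's note, a worked example of the switch above (hypothetical
 * numbers): suppose a flow's rx-queue table entry says rflow->cpu == 2,
 * but the owning socket was last read on CPU 5, so sock_flow_table yields
 * next_cpu == 5. The flow is moved to CPU 5 only if CPU 2 is unset,
 * offline, or has already dequeued everything this flow enqueued there
 * (its input_queue_head has passed rflow->last_qtail); otherwise packets
 * keep going to CPU 2 so that delivery is not reordered.
 */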
3024
Ben Hutchingsc4454772011-01-19 11:03:53 +00003025#ifdef CONFIG_RFS_ACCEL
3026
3027/**
3028 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3029 * @dev: Device on which the filter was set
3030 * @rxq_index: RX queue index
3031 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3032 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3033 *
3034 * Drivers that implement ndo_rx_flow_steer() should periodically call
3035 * this function for each installed filter and remove the filters for
3036 * which it returns %true.
3037 */
3038bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3039 u32 flow_id, u16 filter_id)
3040{
3041 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3042 struct rps_dev_flow_table *flow_table;
3043 struct rps_dev_flow *rflow;
3044 bool expire = true;
3045 int cpu;
3046
3047 rcu_read_lock();
3048 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3049 if (flow_table && flow_id <= flow_table->mask) {
3050 rflow = &flow_table->flows[flow_id];
3051 cpu = ACCESS_ONCE(rflow->cpu);
3052 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3053 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3054 rflow->last_qtail) <
3055 (int)(10 * flow_table->mask)))
3056 expire = false;
3057 }
3058 rcu_read_unlock();
3059 return expire;
3060}
3061EXPORT_SYMBOL(rps_may_expire_flow);
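/*
 * Editor's sketch of the expected driver usage (driver structures are
 * hypothetical): a driver that programmed hardware filters through
 * ndo_rx_flow_steer() periodically scans them, for example from a
 * workqueue, and tears down the ones the stack no longer cares about.
 *
 *	for (i = 0; i < priv->n_rfs_filters; i++) {
 *		struct my_rfs_filter *f = &priv->rfs_filters[i];
 *
 *		if (f->in_use &&
 *		    rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, i))
 *			my_remove_hw_filter(priv, f);
 *	}
 */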
3062
3063#endif /* CONFIG_RFS_ACCEL */
3064
Tom Herbert0a9627f2010-03-16 08:03:29 +00003065/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003066static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003067{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003068 struct softnet_data *sd = data;
3069
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003070 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00003071 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003072}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003073
Tom Herbertfec5e652010-04-16 16:01:27 -07003074#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00003075
3076/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003077 * Check if this softnet_data structure belongs to another CPU.
3078 * If yes, queue it to our IPI list and return 1;
3079 * if not, return 0.
3080 */
3081static int rps_ipi_queued(struct softnet_data *sd)
3082{
3083#ifdef CONFIG_RPS
3084 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3085
3086 if (sd != mysd) {
3087 sd->rps_ipi_next = mysd->rps_ipi_list;
3088 mysd->rps_ipi_list = sd;
3089
3090 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3091 return 1;
3092 }
3093#endif /* CONFIG_RPS */
3094 return 0;
3095}
3096
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003097#ifdef CONFIG_NET_FLOW_LIMIT
3098int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3099#endif
3100
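/*
 * Editor's note (a rough description, not authoritative): once the backlog
 * is more than half full, skb_flow_limit() records the rxhash of the last
 * FLOW_LIMIT_HISTORY enqueued packets in a small ring plus per-bucket
 * counters. A new packet is dropped when its flow already accounts for
 * more than half of that recent history, so one misbehaving flow cannot
 * monopolise the remaining backlog space while other flows starve.
 */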
3101static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3102{
3103#ifdef CONFIG_NET_FLOW_LIMIT
3104 struct sd_flow_limit *fl;
3105 struct softnet_data *sd;
3106 unsigned int old_flow, new_flow;
3107
3108 if (qlen < (netdev_max_backlog >> 1))
3109 return false;
3110
3111 sd = &__get_cpu_var(softnet_data);
3112
3113 rcu_read_lock();
3114 fl = rcu_dereference(sd->flow_limit);
3115 if (fl) {
3116 new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3117 old_flow = fl->history[fl->history_head];
3118 fl->history[fl->history_head] = new_flow;
3119
3120 fl->history_head++;
3121 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3122
3123 if (likely(fl->buckets[old_flow]))
3124 fl->buckets[old_flow]--;
3125
3126 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3127 fl->count++;
3128 rcu_read_unlock();
3129 return true;
3130 }
3131 }
3132 rcu_read_unlock();
3133#endif
3134 return false;
3135}
3136
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003137/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00003138 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
3139 * queue (which may be a remote CPU's queue).
3140 */
Tom Herbertfec5e652010-04-16 16:01:27 -07003141static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3142 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003143{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003144 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003145 unsigned long flags;
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003146 unsigned int qlen;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003147
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003148 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003149
3150 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003151
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003152 rps_lock(sd);
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003153 qlen = skb_queue_len(&sd->input_pkt_queue);
3154 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
Changli Gao6e7676c2010-04-27 15:07:33 -07003155 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00003156enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003157 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003158 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003159 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00003160 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003161 return NET_RX_SUCCESS;
3162 }
3163
Eric Dumazetebda37c22010-05-06 23:51:21 +00003164 /* Schedule NAPI for backlog device
3165		 * We can use a non-atomic operation since we own the queue lock
3166 */
3167 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003168 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003169 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003170 }
3171 goto enqueue;
3172 }
3173
Changli Gaodee42872010-05-02 05:42:16 +00003174 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003175 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003176
Tom Herbert0a9627f2010-03-16 08:03:29 +00003177 local_irq_restore(flags);
3178
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003179 atomic_long_inc(&skb->dev->rx_dropped);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003180 kfree_skb(skb);
3181 return NET_RX_DROP;
3182}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003183
Linus Torvalds1da177e2005-04-16 15:20:36 -07003184/**
3185 * netif_rx - post buffer to the network code
3186 * @skb: buffer to post
3187 *
3188 * This function receives a packet from a device driver and queues it for
3189 * the upper (protocol) levels to process. It always succeeds. The buffer
3190 * may be dropped during processing for congestion control or by the
3191 * protocol layers.
3192 *
3193 * return values:
3194 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003195 * NET_RX_DROP (packet was dropped)
3196 *
3197 */
3198
3199int netif_rx(struct sk_buff *skb)
3200{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003201 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003202
3203 /* if netpoll wants it, pretend we never saw it */
3204 if (netpoll_rx(skb))
3205 return NET_RX_DROP;
3206
Eric Dumazet588f0332011-11-15 04:12:55 +00003207 net_timestamp_check(netdev_tstamp_prequeue, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003208
Koki Sanagicf66ba52010-08-23 18:45:02 +09003209 trace_netif_rx(skb);
Eric Dumazetdf334542010-03-24 19:13:54 +00003210#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003211 if (static_key_false(&rps_needed)) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003212 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003213 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003214
Changli Gaocece1942010-08-07 20:35:43 -07003215 preempt_disable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003216 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003217
3218 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003219 if (cpu < 0)
3220 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07003221
3222 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3223
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003224 rcu_read_unlock();
Changli Gaocece1942010-08-07 20:35:43 -07003225 preempt_enable();
Eric Dumazetadc93002011-11-17 03:13:26 +00003226 } else
3227#endif
Tom Herbertfec5e652010-04-16 16:01:27 -07003228 {
3229 unsigned int qtail;
3230 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3231 put_cpu();
3232 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003233 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003234}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003235EXPORT_SYMBOL(netif_rx);
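/*
 * Editor's sketch of a typical non-NAPI driver receive path (helper names
 * are hypothetical); netif_rx() is normally called from the driver's RX
 * interrupt handler once the frame has been copied out of the hardware.
 *
 *	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	copy_rx_frame(skb_put(skb, pkt_len));
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */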
Linus Torvalds1da177e2005-04-16 15:20:36 -07003236
3237int netif_rx_ni(struct sk_buff *skb)
3238{
3239 int err;
3240
3241 preempt_disable();
3242 err = netif_rx(skb);
3243 if (local_softirq_pending())
3244 do_softirq();
3245 preempt_enable();
3246
3247 return err;
3248}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003249EXPORT_SYMBOL(netif_rx_ni);
3250
Linus Torvalds1da177e2005-04-16 15:20:36 -07003251static void net_tx_action(struct softirq_action *h)
3252{
3253 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3254
3255 if (sd->completion_queue) {
3256 struct sk_buff *clist;
3257
3258 local_irq_disable();
3259 clist = sd->completion_queue;
3260 sd->completion_queue = NULL;
3261 local_irq_enable();
3262
3263 while (clist) {
3264 struct sk_buff *skb = clist;
3265 clist = clist->next;
3266
Ilpo Järvinen547b7922008-07-25 21:43:18 -07003267 WARN_ON(atomic_read(&skb->users));
Koki Sanagi07dc22e2010-08-23 18:46:12 +09003268 trace_kfree_skb(skb, net_tx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003269 __kfree_skb(skb);
3270 }
3271 }
3272
3273 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07003274 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003275
3276 local_irq_disable();
3277 head = sd->output_queue;
3278 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00003279 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003280 local_irq_enable();
3281
3282 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07003283 struct Qdisc *q = head;
3284 spinlock_t *root_lock;
3285
Linus Torvalds1da177e2005-04-16 15:20:36 -07003286 head = head->next_sched;
3287
David S. Miller5fb66222008-08-02 20:02:43 -07003288 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07003289 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07003290 smp_mb__before_clear_bit();
3291 clear_bit(__QDISC_STATE_SCHED,
3292 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07003293 qdisc_run(q);
3294 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003295 } else {
David S. Miller195648b2008-08-19 04:00:36 -07003296 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003297 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07003298 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003299 } else {
3300 smp_mb__before_clear_bit();
3301 clear_bit(__QDISC_STATE_SCHED,
3302 &q->state);
3303 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003304 }
3305 }
3306 }
3307}
3308
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003309#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3310 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
Michał Mirosławda678292009-06-05 05:35:28 +00003311/* This hook is defined here for ATM LANE */
3312int (*br_fdb_test_addr_hook)(struct net_device *dev,
3313 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07003314EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00003315#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003316
Linus Torvalds1da177e2005-04-16 15:20:36 -07003317#ifdef CONFIG_NET_CLS_ACT
3318/* TODO: Maybe we should just force sch_ingress to be compiled in
3319 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3320 * instructions (a compare and 2 extra stores) whenever ingress is not
3321 * configured but CONFIG_NET_CLS_ACT is.
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003322 * NOTE: This doesn't stop any functionality; if you don't have
3323 * the ingress scheduler, you just can't add policies on ingress.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003324 *
3325 */
Eric Dumazet24824a02010-10-02 06:11:55 +00003326static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003327{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003328 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003329 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07003330 int result = TC_ACT_OK;
3331 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003332
Stephen Hemmingerde384832010-08-01 00:33:23 -07003333 if (unlikely(MAX_RED_LOOP < ttl++)) {
Joe Perchese87cc472012-05-13 21:56:26 +00003334 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3335 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07003336 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003337 }
3338
Herbert Xuf697c3e2007-10-14 00:38:47 -07003339 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3340 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3341
David S. Miller83874002008-07-17 00:53:03 -07003342 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07003343 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07003344 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07003345 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3346 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07003347 spin_unlock(qdisc_lock(q));
3348 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07003349
Linus Torvalds1da177e2005-04-16 15:20:36 -07003350 return result;
3351}
Herbert Xuf697c3e2007-10-14 00:38:47 -07003352
3353static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3354 struct packet_type **pt_prev,
3355 int *ret, struct net_device *orig_dev)
3356{
Eric Dumazet24824a02010-10-02 06:11:55 +00003357 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3358
3359 if (!rxq || rxq->qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07003360 goto out;
3361
3362 if (*pt_prev) {
3363 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3364 *pt_prev = NULL;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003365 }
3366
Eric Dumazet24824a02010-10-02 06:11:55 +00003367 switch (ing_filter(skb, rxq)) {
Herbert Xuf697c3e2007-10-14 00:38:47 -07003368 case TC_ACT_SHOT:
3369 case TC_ACT_STOLEN:
3370 kfree_skb(skb);
3371 return NULL;
3372 }
3373
3374out:
3375 skb->tc_verd = 0;
3376 return skb;
3377}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003378#endif
3379
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003380/**
3381 * netdev_rx_handler_register - register receive handler
3382 * @dev: device to register a handler for
3383 * @rx_handler: receive handler to register
Jiri Pirko93e2c322010-06-10 03:34:59 +00003384 * @rx_handler_data: data pointer that is used by rx handler
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003385 *
3386 *	Register a receive handler for a device. This handler will then be
3387 * called from __netif_receive_skb. A negative errno code is returned
3388 * on a failure.
3389 *
3390 * The caller must hold the rtnl_mutex.
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003391 *
3392 * For a general description of rx_handler, see enum rx_handler_result.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003393 */
3394int netdev_rx_handler_register(struct net_device *dev,
Jiri Pirko93e2c322010-06-10 03:34:59 +00003395 rx_handler_func_t *rx_handler,
3396 void *rx_handler_data)
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003397{
3398 ASSERT_RTNL();
3399
3400 if (dev->rx_handler)
3401 return -EBUSY;
3402
Eric Dumazet00cfec32013-03-29 03:01:22 +00003403 /* Note: rx_handler_data must be set before rx_handler */
Jiri Pirko93e2c322010-06-10 03:34:59 +00003404 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003405 rcu_assign_pointer(dev->rx_handler, rx_handler);
3406
3407 return 0;
3408}
3409EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
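/*
 * Editor's sketch of how an upper device (bridge/bonding/macvlan style)
 * typically hooks a lower device; names are hypothetical. Registration and
 * unregistration both run under rtnl_lock(), matching the requirement above.
 *
 *	static rx_handler_result_t my_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct my_port *port =
 *			rcu_dereference((*pskb)->dev->rx_handler_data);
 *
 *		... steal, redirect or ignore the frame ...
 *		return RX_HANDLER_PASS;
 *	}
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(lower_dev, my_handle_frame, port);
 *	rtnl_unlock();
 */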
3410
3411/**
3412 * netdev_rx_handler_unregister - unregister receive handler
3413 * @dev: device to unregister a handler from
3414 *
Kusanagi Kouichi166ec362013-03-18 02:59:52 +00003415 * Unregister a receive handler from a device.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003416 *
3417 * The caller must hold the rtnl_mutex.
3418 */
3419void netdev_rx_handler_unregister(struct net_device *dev)
3420{
3421
3422 ASSERT_RTNL();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003423 RCU_INIT_POINTER(dev->rx_handler, NULL);
Eric Dumazet00cfec32013-03-29 03:01:22 +00003424	/* a reader seeing a non-NULL rx_handler in a rcu_read_lock()
3425	 * section is guaranteed to also see a non-NULL rx_handler_data
3426 * as well.
3427 */
3428 synchronize_net();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003429 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003430}
3431EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3432
Mel Gormanb4b9e352012-07-31 16:44:26 -07003433/*
3434 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3435 * the special handling of PFMEMALLOC skbs.
3436 */
3437static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3438{
3439 switch (skb->protocol) {
3440 case __constant_htons(ETH_P_ARP):
3441 case __constant_htons(ETH_P_IP):
3442 case __constant_htons(ETH_P_IPV6):
3443 case __constant_htons(ETH_P_8021Q):
Patrick McHardy8ad227f2013-04-19 02:04:31 +00003444 case __constant_htons(ETH_P_8021AD):
Mel Gormanb4b9e352012-07-31 16:44:26 -07003445 return true;
3446 default:
3447 return false;
3448 }
3449}
3450
David S. Miller9754e292013-02-14 15:57:38 -05003451static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003452{
3453 struct packet_type *ptype, *pt_prev;
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003454 rx_handler_func_t *rx_handler;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003455 struct net_device *orig_dev;
David S. Miller63d8ea72011-02-28 10:48:59 -08003456 struct net_device *null_or_dev;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003457 bool deliver_exact = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003458 int ret = NET_RX_DROP;
Al Viro252e3342006-11-14 20:48:11 -08003459 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003460
Eric Dumazet588f0332011-11-15 04:12:55 +00003461 net_timestamp_check(!netdev_tstamp_prequeue, skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07003462
Koki Sanagicf66ba52010-08-23 18:45:02 +09003463 trace_netif_receive_skb(skb);
Patrick McHardy9b22ea52008-11-04 14:49:57 -08003464
Linus Torvalds1da177e2005-04-16 15:20:36 -07003465 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003466 if (netpoll_receive_skb(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003467 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003468
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07003469 orig_dev = skb->dev;
Jiri Pirko1765a572011-02-12 06:48:36 +00003470
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003471 skb_reset_network_header(skb);
Eric Dumazetfda55ec2013-01-07 09:28:21 +00003472 if (!skb_transport_header_was_set(skb))
3473 skb_reset_transport_header(skb);
Jiri Pirko0b5c9db2011-06-10 06:56:58 +00003474 skb_reset_mac_len(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003475
3476 pt_prev = NULL;
3477
3478 rcu_read_lock();
3479
David S. Miller63d8ea72011-02-28 10:48:59 -08003480another_round:
David S. Millerb6858172012-07-23 16:27:54 -07003481 skb->skb_iif = skb->dev->ifindex;
David S. Miller63d8ea72011-02-28 10:48:59 -08003482
3483 __this_cpu_inc(softnet_data.processed);
3484
Patrick McHardy8ad227f2013-04-19 02:04:31 +00003485 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3486 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003487 skb = vlan_untag(skb);
3488 if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003489 goto unlock;
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003490 }
3491
Linus Torvalds1da177e2005-04-16 15:20:36 -07003492#ifdef CONFIG_NET_CLS_ACT
3493 if (skb->tc_verd & TC_NCLS) {
3494 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3495 goto ncls;
3496 }
3497#endif
3498
David S. Miller9754e292013-02-14 15:57:38 -05003499 if (pfmemalloc)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003500 goto skip_taps;
3501
Linus Torvalds1da177e2005-04-16 15:20:36 -07003502 list_for_each_entry_rcu(ptype, &ptype_all, list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003503 if (!ptype->dev || ptype->dev == skb->dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003504 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003505 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003506 pt_prev = ptype;
3507 }
3508 }
3509
Mel Gormanb4b9e352012-07-31 16:44:26 -07003510skip_taps:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003511#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07003512 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3513 if (!skb)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003514 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003515ncls:
3516#endif
3517
David S. Miller9754e292013-02-14 15:57:38 -05003518 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003519 goto drop;
3520
John Fastabend24257172011-10-10 09:16:41 +00003521 if (vlan_tx_tag_present(skb)) {
3522 if (pt_prev) {
3523 ret = deliver_skb(skb, pt_prev, orig_dev);
3524 pt_prev = NULL;
3525 }
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003526 if (vlan_do_receive(&skb))
John Fastabend24257172011-10-10 09:16:41 +00003527 goto another_round;
3528 else if (unlikely(!skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07003529 goto unlock;
John Fastabend24257172011-10-10 09:16:41 +00003530 }
3531
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003532 rx_handler = rcu_dereference(skb->dev->rx_handler);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003533 if (rx_handler) {
3534 if (pt_prev) {
3535 ret = deliver_skb(skb, pt_prev, orig_dev);
3536 pt_prev = NULL;
3537 }
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003538 switch (rx_handler(&skb)) {
3539 case RX_HANDLER_CONSUMED:
Cristian Bercaru3bc1b1a2013-03-08 07:03:38 +00003540 ret = NET_RX_SUCCESS;
Mel Gormanb4b9e352012-07-31 16:44:26 -07003541 goto unlock;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003542 case RX_HANDLER_ANOTHER:
David S. Miller63d8ea72011-02-28 10:48:59 -08003543 goto another_round;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003544 case RX_HANDLER_EXACT:
3545 deliver_exact = true;
3546 case RX_HANDLER_PASS:
3547 break;
3548 default:
3549 BUG();
3550 }
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003551 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003552
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00003553 if (vlan_tx_nonzero_tag_present(skb))
3554 skb->pkt_type = PACKET_OTHERHOST;
3555
David S. Miller63d8ea72011-02-28 10:48:59 -08003556 /* deliver only exact match when indicated */
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003557 null_or_dev = deliver_exact ? skb->dev : NULL;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003558
Linus Torvalds1da177e2005-04-16 15:20:36 -07003559 type = skb->protocol;
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003560 list_for_each_entry_rcu(ptype,
3561 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
David S. Miller63d8ea72011-02-28 10:48:59 -08003562 if (ptype->type == type &&
Jiri Pirkoe3f48d32011-02-28 20:26:31 +00003563 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3564 ptype->dev == orig_dev)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003565 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003566 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003567 pt_prev = ptype;
3568 }
3569 }
3570
3571 if (pt_prev) {
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003572 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
Michael S. Tsirkin0e698bf2012-09-15 22:44:16 +00003573 goto drop;
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00003574 else
3575 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003576 } else {
Mel Gormanb4b9e352012-07-31 16:44:26 -07003577drop:
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003578 atomic_long_inc(&skb->dev->rx_dropped);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003579 kfree_skb(skb);
3580		/* Jamal, now you will not be able to escape explaining
3581		 * to me how you were going to use this. :-)
3582 */
3583 ret = NET_RX_DROP;
3584 }
3585
Mel Gormanb4b9e352012-07-31 16:44:26 -07003586unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003587 rcu_read_unlock();
Mel Gormanb4b9e352012-07-31 16:44:26 -07003588out:
David S. Miller9754e292013-02-14 15:57:38 -05003589 return ret;
3590}
3591
3592static int __netif_receive_skb(struct sk_buff *skb)
3593{
3594 int ret;
3595
3596 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3597 unsigned long pflags = current->flags;
3598
3599 /*
3600 * PFMEMALLOC skbs are special, they should
3601 * - be delivered to SOCK_MEMALLOC sockets only
3602 * - stay away from userspace
3603 * - have bounded memory usage
3604 *
3605 * Use PF_MEMALLOC as this saves us from propagating the allocation
3606 * context down to all allocation sites.
3607 */
3608 current->flags |= PF_MEMALLOC;
3609 ret = __netif_receive_skb_core(skb, true);
3610 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3611 } else
3612 ret = __netif_receive_skb_core(skb, false);
3613
Linus Torvalds1da177e2005-04-16 15:20:36 -07003614 return ret;
3615}
Tom Herbert0a9627f2010-03-16 08:03:29 +00003616
3617/**
3618 * netif_receive_skb - process receive buffer from network
3619 * @skb: buffer to process
3620 *
3621 * netif_receive_skb() is the main receive data processing function.
3622 * It always succeeds. The buffer may be dropped during processing
3623 * for congestion control or by the protocol layers.
3624 *
3625 * This function may only be called from softirq context and interrupts
3626 * should be enabled.
3627 *
3628 * Return values (usually ignored):
3629 * NET_RX_SUCCESS: no congestion
3630 * NET_RX_DROP: packet was dropped
3631 */
3632int netif_receive_skb(struct sk_buff *skb)
3633{
Eric Dumazet588f0332011-11-15 04:12:55 +00003634 net_timestamp_check(netdev_tstamp_prequeue, skb);
Eric Dumazet3b098e22010-05-15 23:57:10 -07003635
Richard Cochranc1f19b52010-07-17 08:49:36 +00003636 if (skb_defer_rx_timestamp(skb))
3637 return NET_RX_SUCCESS;
3638
Eric Dumazetdf334542010-03-24 19:13:54 +00003639#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003640 if (static_key_false(&rps_needed)) {
Eric Dumazet3b098e22010-05-15 23:57:10 -07003641 struct rps_dev_flow voidflow, *rflow = &voidflow;
3642 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003643
Eric Dumazet3b098e22010-05-15 23:57:10 -07003644 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00003645
Eric Dumazet3b098e22010-05-15 23:57:10 -07003646 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07003647
Eric Dumazet3b098e22010-05-15 23:57:10 -07003648 if (cpu >= 0) {
3649 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3650 rcu_read_unlock();
Eric Dumazetadc93002011-11-17 03:13:26 +00003651 return ret;
Eric Dumazet3b098e22010-05-15 23:57:10 -07003652 }
Eric Dumazetadc93002011-11-17 03:13:26 +00003653 rcu_read_unlock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003654 }
Tom Herbert1e94d722010-03-18 17:45:44 -07003655#endif
Eric Dumazetadc93002011-11-17 03:13:26 +00003656 return __netif_receive_skb(skb);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003657}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003658EXPORT_SYMBOL(netif_receive_skb);
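/*
 * Editor's sketch of the usual NAPI caller (helpers are hypothetical): the
 * driver's poll routine pulls completed descriptors and feeds each skb to
 * netif_receive_skb() (or napi_gro_receive()) from softirq context.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = 0;
 *
 *		while (work < budget) {
 *			struct sk_buff *skb = my_next_rx_skb(napi);
 *
 *			if (!skb)
 *				break;
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			netif_receive_skb(skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete(napi);
 *		return work;
 *	}
 */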
Linus Torvalds1da177e2005-04-16 15:20:36 -07003659
Eric Dumazet88751272010-04-19 05:07:33 +00003660/* Network device is going away, flush any packets still pending
3661 * Called with irqs disabled.
3662 */
Changli Gao152102c2010-03-30 20:16:22 +00003663static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003664{
Changli Gao152102c2010-03-30 20:16:22 +00003665 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003666 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003667 struct sk_buff *skb, *tmp;
3668
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003669 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003670 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003671 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003672 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003673 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003674 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003675 }
Changli Gao6e7676c2010-04-27 15:07:33 -07003676 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003677 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003678
3679 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3680 if (skb->dev == dev) {
3681 __skb_unlink(skb, &sd->process_queue);
3682 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003683 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003684 }
3685 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003686}
3687
Herbert Xud565b0a2008-12-15 23:38:52 -08003688static int napi_gro_complete(struct sk_buff *skb)
3689{
Vlad Yasevich22061d82012-11-15 08:49:11 +00003690 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003691 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003692 struct list_head *head = &offload_base;
Herbert Xud565b0a2008-12-15 23:38:52 -08003693 int err = -ENOENT;
3694
Eric Dumazetc3c7c252012-12-06 13:54:59 +00003695 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3696
Herbert Xufc59f9a2009-04-14 15:11:06 -07003697 if (NAPI_GRO_CB(skb)->count == 1) {
3698 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003699 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07003700 }
Herbert Xud565b0a2008-12-15 23:38:52 -08003701
3702 rcu_read_lock();
3703 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003704 if (ptype->type != type || !ptype->callbacks.gro_complete)
Herbert Xud565b0a2008-12-15 23:38:52 -08003705 continue;
3706
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003707 err = ptype->callbacks.gro_complete(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003708 break;
3709 }
3710 rcu_read_unlock();
3711
3712 if (err) {
3713 WARN_ON(&ptype->list == head);
3714 kfree_skb(skb);
3715 return NET_RX_SUCCESS;
3716 }
3717
3718out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003719 return netif_receive_skb(skb);
3720}
3721
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003722/* napi->gro_list contains packets ordered by age, with the
3723 * youngest packets at its head.
3724 * Complete skbs in reverse order (oldest first) to reduce latencies.
3725 */
3726void napi_gro_flush(struct napi_struct *napi, bool flush_old)
Herbert Xud565b0a2008-12-15 23:38:52 -08003727{
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003728 struct sk_buff *skb, *prev = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003729
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003730 /* scan list and build reverse chain */
3731 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3732 skb->prev = prev;
3733 prev = skb;
Herbert Xud565b0a2008-12-15 23:38:52 -08003734 }
3735
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003736 for (skb = prev; skb; skb = prev) {
3737 skb->next = NULL;
3738
3739 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3740 return;
3741
3742 prev = skb->prev;
3743 napi_gro_complete(skb);
3744 napi->gro_count--;
3745 }
3746
Herbert Xud565b0a2008-12-15 23:38:52 -08003747 napi->gro_list = NULL;
3748}
Eric Dumazet86cac582010-08-31 18:25:32 +00003749EXPORT_SYMBOL(napi_gro_flush);
Herbert Xud565b0a2008-12-15 23:38:52 -08003750
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003751static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3752{
3753 struct sk_buff *p;
3754 unsigned int maclen = skb->dev->hard_header_len;
3755
3756 for (p = napi->gro_list; p; p = p->next) {
3757 unsigned long diffs;
3758
3759 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3760 diffs |= p->vlan_tci ^ skb->vlan_tci;
3761 if (maclen == ETH_HLEN)
3762 diffs |= compare_ether_header(skb_mac_header(p),
3763 skb_gro_mac_header(skb));
3764 else if (!diffs)
3765 diffs = memcmp(skb_mac_header(p),
3766 skb_gro_mac_header(skb),
3767 maclen);
3768 NAPI_GRO_CB(p)->same_flow = !diffs;
3769 NAPI_GRO_CB(p)->flush = 0;
3770 }
3771}
3772
Rami Rosenbb728822012-11-28 21:55:25 +00003773static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003774{
3775 struct sk_buff **pp = NULL;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003776 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08003777 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00003778 struct list_head *head = &offload_base;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003779 int same_flow;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003780 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003781
Jarek Poplawskice9e76c2010-08-05 01:19:11 +00003782 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
Herbert Xud565b0a2008-12-15 23:38:52 -08003783 goto normal;
3784
David S. Miller21dc3302010-08-23 00:13:46 -07003785 if (skb_is_gso(skb) || skb_has_frag_list(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003786 goto normal;
3787
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003788 gro_list_prepare(napi, skb);
3789
Herbert Xud565b0a2008-12-15 23:38:52 -08003790 rcu_read_lock();
3791 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003792 if (ptype->type != type || !ptype->callbacks.gro_receive)
Herbert Xud565b0a2008-12-15 23:38:52 -08003793 continue;
3794
Herbert Xu86911732009-01-29 14:19:50 +00003795 skb_set_network_header(skb, skb_gro_offset(skb));
Eric Dumazetefd94502013-02-14 17:31:48 +00003796 skb_reset_mac_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003797 NAPI_GRO_CB(skb)->same_flow = 0;
3798 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003799 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003800
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00003801 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003802 break;
3803 }
3804 rcu_read_unlock();
3805
3806 if (&ptype->list == head)
3807 goto normal;
3808
Herbert Xu0da2afd52008-12-26 14:57:42 -08003809 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003810 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003811
Herbert Xud565b0a2008-12-15 23:38:52 -08003812 if (pp) {
3813 struct sk_buff *nskb = *pp;
3814
3815 *pp = nskb->next;
3816 nskb->next = NULL;
3817 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003818 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003819 }
3820
Herbert Xu0da2afd52008-12-26 14:57:42 -08003821 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003822 goto ok;
3823
Herbert Xu4ae55442009-02-08 18:00:36 +00003824 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003825 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003826
Herbert Xu4ae55442009-02-08 18:00:36 +00003827 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003828 NAPI_GRO_CB(skb)->count = 1;
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00003829 NAPI_GRO_CB(skb)->age = jiffies;
Herbert Xu86911732009-01-29 14:19:50 +00003830 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003831 skb->next = napi->gro_list;
3832 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003833 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003834
Herbert Xuad0f9902009-02-01 01:24:55 -08003835pull:
Herbert Xucb189782009-05-26 18:50:31 +00003836 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3837 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3838
3839 BUG_ON(skb->end - skb->tail < grow);
3840
3841 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3842
3843 skb->tail += grow;
3844 skb->data_len -= grow;
3845
3846 skb_shinfo(skb)->frags[0].page_offset += grow;
Eric Dumazet9e903e02011-10-18 21:00:24 +00003847 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
Herbert Xucb189782009-05-26 18:50:31 +00003848
Eric Dumazet9e903e02011-10-18 21:00:24 +00003849 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
Ian Campbellea2ab692011-08-22 23:44:58 +00003850 skb_frag_unref(skb, 0);
Herbert Xucb189782009-05-26 18:50:31 +00003851 memmove(skb_shinfo(skb)->frags,
3852 skb_shinfo(skb)->frags + 1,
Jarek Poplawskie5093ae2010-08-11 02:02:10 +00003853 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
Herbert Xucb189782009-05-26 18:50:31 +00003854 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003855 }
3856
Herbert Xud565b0a2008-12-15 23:38:52 -08003857ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003858 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003859
3860normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003861 ret = GRO_NORMAL;
3862 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003863}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003864
Herbert Xu96e93ea2009-01-06 10:49:34 -08003865
Rami Rosenbb728822012-11-28 21:55:25 +00003866static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003867{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003868 switch (ret) {
3869 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003870 if (netif_receive_skb(skb))
3871 ret = GRO_DROP;
3872 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003873
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003874 case GRO_DROP:
Herbert Xu5d38a072009-01-04 16:13:40 -08003875 kfree_skb(skb);
3876 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003877
Eric Dumazetdaa86542012-04-19 07:07:40 +00003878 case GRO_MERGED_FREE:
Eric Dumazetd7e88832012-04-30 08:10:34 +00003879 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3880 kmem_cache_free(skbuff_head_cache, skb);
3881 else
3882 __kfree_skb(skb);
Eric Dumazetdaa86542012-04-19 07:07:40 +00003883 break;
3884
Ben Hutchings5b252f02009-10-29 07:17:09 +00003885 case GRO_HELD:
3886 case GRO_MERGED:
3887 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003888 }
3889
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003890 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003891}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003892
Eric Dumazetca07e432012-10-06 22:28:06 +00003893static void skb_gro_reset_offset(struct sk_buff *skb)
Herbert Xu78a478d2009-05-26 18:50:21 +00003894{
Eric Dumazetca07e432012-10-06 22:28:06 +00003895 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3896 const skb_frag_t *frag0 = &pinfo->frags[0];
3897
Herbert Xu78a478d2009-05-26 18:50:21 +00003898 NAPI_GRO_CB(skb)->data_offset = 0;
3899 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003900 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003901
Simon Hormanced14f62013-05-28 20:34:25 +00003902 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
Eric Dumazetca07e432012-10-06 22:28:06 +00003903 pinfo->nr_frags &&
3904 !PageHighMem(skb_frag_page(frag0))) {
3905 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3906 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
Herbert Xu74895942009-05-26 18:50:27 +00003907 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003908}
Herbert Xu78a478d2009-05-26 18:50:21 +00003909
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003910gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003911{
Herbert Xu86911732009-01-29 14:19:50 +00003912 skb_gro_reset_offset(skb);
3913
Eric Dumazet89c5fa32012-12-10 13:28:16 +00003914 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003915}
3916EXPORT_SYMBOL(napi_gro_receive);
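/*
 * Example usage sketch (hypothetical driver RX completion path; the ring and
 * netdev variables are assumptions): hand each received skb to GRO instead of
 * calling netif_receive_skb() directly.
 *
 *	skb_put(skb, pkt_len);
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	skb_record_rx_queue(skb, ring->queue_index);
 *	napi_gro_receive(&ring->napi, skb);
 */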
3917
stephen hemmingerd0c2b0d2010-10-19 07:12:10 +00003918static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003919{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003920 __skb_pull(skb, skb_headlen(skb));
Eric Dumazet2a2a4592012-03-21 06:58:03 +00003921 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3922 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
Jesse Gross3701e512010-10-20 13:56:06 +00003923 skb->vlan_tci = 0;
Herbert Xu66c46d72011-01-29 20:44:54 -08003924 skb->dev = napi->dev;
Andy Gospodarek6d152e22011-02-02 14:53:25 -08003925 skb->skb_iif = 0;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003926
3927 napi->skb = skb;
3928}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003929
Herbert Xu76620aa2009-04-16 02:02:07 -07003930struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003931{
Herbert Xu5d38a072009-01-04 16:13:40 -08003932 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003933
3934 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003935 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3936 if (skb)
3937 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003938 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003939 return skb;
3940}
Herbert Xu76620aa2009-04-16 02:02:07 -07003941EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003942
Rami Rosenbb728822012-11-28 21:55:25 +00003943static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003944 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003945{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003946 switch (ret) {
3947 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003948 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003949 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003950
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003951 if (ret == GRO_HELD)
3952 skb_gro_pull(skb, -ETH_HLEN);
3953 else if (netif_receive_skb(skb))
3954 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003955 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003956
3957 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003958 case GRO_MERGED_FREE:
3959 napi_reuse_skb(napi, skb);
3960 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003961
3962 case GRO_MERGED:
3963 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003964 }
3965
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003966 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003967}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003968
Eric Dumazet4adb9c42012-05-18 20:49:06 +00003969static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003970{
Herbert Xu76620aa2009-04-16 02:02:07 -07003971 struct sk_buff *skb = napi->skb;
3972 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00003973 unsigned int hlen;
3974 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07003975
3976 napi->skb = NULL;
3977
3978 skb_reset_mac_header(skb);
3979 skb_gro_reset_offset(skb);
3980
Herbert Xua5b1cf22009-05-26 18:50:28 +00003981 off = skb_gro_offset(skb);
3982 hlen = off + sizeof(*eth);
3983 eth = skb_gro_header_fast(skb, off);
3984 if (skb_gro_header_hard(skb, hlen)) {
3985 eth = skb_gro_header_slow(skb, hlen, off);
3986 if (unlikely(!eth)) {
3987 napi_reuse_skb(napi, skb);
3988 skb = NULL;
3989 goto out;
3990 }
Herbert Xu76620aa2009-04-16 02:02:07 -07003991 }
3992
3993 skb_gro_pull(skb, sizeof(*eth));
3994
3995 /*
3996 * This works because the only protocols we care about don't require
3997 * special handling. We'll fix it up properly at the end.
3998 */
3999 skb->protocol = eth->h_proto;
4000
4001out:
4002 return skb;
4003}
Herbert Xu76620aa2009-04-16 02:02:07 -07004004
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004005gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07004006{
4007 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08004008
4009 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004010 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08004011
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004012 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08004013}
4014EXPORT_SYMBOL(napi_gro_frags);
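/*
 * Example usage sketch (hypothetical page-based RX path; page, offset and len
 * come from an assumed driver ring): build a frag-only skb with
 * napi_get_frags() and submit it through napi_gro_frags(), which derives
 * skb->protocol itself via napi_frags_skb().
 *
 *	struct sk_buff *skb = napi_get_frags(napi);
 *
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_fill_page_desc(skb, 0, page, offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += PAGE_SIZE;
 *	napi_gro_frags(napi);
 */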
4015
Eric Dumazete326bed2010-04-22 00:22:45 -07004016/*
4017 * net_rps_action sends any pending IPIs for RPS.
4018 * Note: called with local irq disabled, but exits with local irq enabled.
4019 */
4020static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4021{
4022#ifdef CONFIG_RPS
4023 struct softnet_data *remsd = sd->rps_ipi_list;
4024
4025 if (remsd) {
4026 sd->rps_ipi_list = NULL;
4027
4028 local_irq_enable();
4029
4030		/* Send pending IPIs to kick RPS processing on remote CPUs. */
4031 while (remsd) {
4032 struct softnet_data *next = remsd->rps_ipi_next;
4033
4034 if (cpu_online(remsd->cpu))
4035 __smp_call_function_single(remsd->cpu,
4036 &remsd->csd, 0);
4037 remsd = next;
4038 }
4039 } else
4040#endif
4041 local_irq_enable();
4042}
4043
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004044static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004045{
4046 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004047 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004048
Eric Dumazete326bed2010-04-22 00:22:45 -07004049#ifdef CONFIG_RPS
4050	/* Check if we have pending IPIs; it's better to send them now,
4051	 * rather than waiting for net_rx_action() to end.
4052 */
4053 if (sd->rps_ipi_list) {
4054 local_irq_disable();
4055 net_rps_action_and_irq_enable(sd);
4056 }
4057#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004058 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07004059 local_irq_disable();
4060 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004061 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07004062 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004063
Changli Gao6e7676c2010-04-27 15:07:33 -07004064 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07004065 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07004066 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07004067 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00004068 input_queue_head_incr(sd);
4069 if (++work >= quota) {
4070 local_irq_enable();
4071 return work;
4072 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004073 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004074
Changli Gao6e7676c2010-04-27 15:07:33 -07004075 rps_lock(sd);
4076 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004077 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07004078 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4079 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004080
Changli Gao6e7676c2010-04-27 15:07:33 -07004081 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004082 /*
4083 * Inline a custom version of __napi_complete().
4084			 * Inline a custom version of __napi_complete().
4085			 * Only the current CPU owns and manipulates this napi,
4086			 * and NAPI_STATE_SCHED is the only possible flag set on backlog;
4087			 * we can use a plain write instead of clear_bit(),
4088 */
4089 list_del(&napi->poll_list);
4090 napi->state = 0;
4091
Changli Gao6e7676c2010-04-27 15:07:33 -07004092 quota = work + qlen;
4093 }
4094 rps_unlock(sd);
4095 }
4096 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004097
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004098 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004099}
4100
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004101/**
4102 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004103 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004104 *
4105 * The entry's receive function will be scheduled to run
4106 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08004107void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004108{
4109 unsigned long flags;
4110
4111 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004112 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004113 local_irq_restore(flags);
4114}
4115EXPORT_SYMBOL(__napi_schedule);
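/*
 * Example usage sketch (hypothetical hard interrupt handler; the mydrv_*
 * helpers and priv layout are assumptions): mask the device RX interrupt and
 * hand the remaining work to NAPI.
 *
 *	static irqreturn_t mydrv_intr(int irq, void *dev_id)
 *	{
 *		struct mydrv_priv *priv = dev_id;
 *
 *		mydrv_mask_rx_irq(priv);
 *		if (napi_schedule_prep(&priv->napi))
 *			__napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 */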
4116
Herbert Xud565b0a2008-12-15 23:38:52 -08004117void __napi_complete(struct napi_struct *n)
4118{
4119 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4120 BUG_ON(n->gro_list);
4121
4122 list_del(&n->poll_list);
4123 smp_mb__before_clear_bit();
4124 clear_bit(NAPI_STATE_SCHED, &n->state);
4125}
4126EXPORT_SYMBOL(__napi_complete);
4127
4128void napi_complete(struct napi_struct *n)
4129{
4130 unsigned long flags;
4131
4132 /*
4133	 * Don't let napi dequeue from the CPU poll list,
4134	 * just in case it's running on a different CPU.
4135 */
4136 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4137 return;
4138
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004139 napi_gro_flush(n, false);
Herbert Xud565b0a2008-12-15 23:38:52 -08004140 local_irq_save(flags);
4141 __napi_complete(n);
4142 local_irq_restore(flags);
4143}
4144EXPORT_SYMBOL(napi_complete);
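/*
 * Example usage sketch (hypothetical ->poll() routine; the mydrv_* helpers
 * and priv layout are assumptions): complete NAPI only when less than the
 * full budget was used, then re-enable the device interrupt.
 *
 *	static int mydrv_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct mydrv_priv *priv = container_of(napi, struct mydrv_priv, napi);
 *		int done = mydrv_clean_rx_ring(priv, budget);
 *
 *		if (done < budget) {
 *			napi_complete(napi);
 *			mydrv_unmask_rx_irq(priv);
 *		}
 *		return done;
 *	}
 */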
4145
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004146/* must be called under rcu_read_lock(), as we don't take a reference */
4147struct napi_struct *napi_by_id(unsigned int napi_id)
4148{
4149 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4150 struct napi_struct *napi;
4151
4152 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4153 if (napi->napi_id == napi_id)
4154 return napi;
4155
4156 return NULL;
4157}
4158EXPORT_SYMBOL_GPL(napi_by_id);
4159
4160void napi_hash_add(struct napi_struct *napi)
4161{
4162 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4163
4164 spin_lock(&napi_hash_lock);
4165
4166		/* 0 is not a valid id; we also skip an id that is already taken.
4167		 * We expect both events to be extremely rare.
4168 */
4169 napi->napi_id = 0;
4170 while (!napi->napi_id) {
4171 napi->napi_id = ++napi_gen_id;
4172 if (napi_by_id(napi->napi_id))
4173 napi->napi_id = 0;
4174 }
4175
4176 hlist_add_head_rcu(&napi->napi_hash_node,
4177 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4178
4179 spin_unlock(&napi_hash_lock);
4180 }
4181}
4182EXPORT_SYMBOL_GPL(napi_hash_add);
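/*
 * Example usage sketch (hypothetical driver enabling busy polling; the priv
 * layout is an assumption): hash the context after netif_napi_add() so
 * sockets can look it up by id, and unhash it before the context is freed.
 *
 *	napi_hash_add(&priv->napi);
 *	...
 *	napi_hash_del(&priv->napi);
 *	synchronize_rcu();	// respect an RCU grace period before freeing
 */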
4183
4184/* Warning: the caller is responsible for making sure an RCU grace period
4185 * has elapsed before freeing the memory containing @napi.
4186 */
4187void napi_hash_del(struct napi_struct *napi)
4188{
4189 spin_lock(&napi_hash_lock);
4190
4191 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4192 hlist_del_rcu(&napi->napi_hash_node);
4193
4194 spin_unlock(&napi_hash_lock);
4195}
4196EXPORT_SYMBOL_GPL(napi_hash_del);
4197
Herbert Xud565b0a2008-12-15 23:38:52 -08004198void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4199 int (*poll)(struct napi_struct *, int), int weight)
4200{
4201 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00004202 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004203 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08004204 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08004205 napi->poll = poll;
Eric Dumazet82dc3c62013-03-05 15:57:22 +00004206 if (weight > NAPI_POLL_WEIGHT)
4207 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4208 weight, dev->name);
Herbert Xud565b0a2008-12-15 23:38:52 -08004209 napi->weight = weight;
4210 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08004211 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08004212#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08004213 spin_lock_init(&napi->poll_lock);
4214 napi->poll_owner = -1;
4215#endif
4216 set_bit(NAPI_STATE_SCHED, &napi->state);
4217}
4218EXPORT_SYMBOL(netif_napi_add);
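/*
 * Example usage sketch (hypothetical driver probe/open path; mydrv_poll and
 * the priv layout are assumptions): register the NAPI context at probe time
 * and enable it when the interface is opened.
 *
 *	netif_napi_add(netdev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);	// in ndo_open
 *	...
 *	napi_disable(&priv->napi);	// in ndo_stop
 *	netif_napi_del(&priv->napi);	// on removal
 */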
4219
4220void netif_napi_del(struct napi_struct *napi)
4221{
4222 struct sk_buff *skb, *next;
4223
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08004224 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07004225 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08004226
4227 for (skb = napi->gro_list; skb; skb = next) {
4228 next = skb->next;
4229 skb->next = NULL;
4230 kfree_skb(skb);
4231 }
4232
4233 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00004234 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004235}
4236EXPORT_SYMBOL(netif_napi_del);
4237
Linus Torvalds1da177e2005-04-16 15:20:36 -07004238static void net_rx_action(struct softirq_action *h)
4239{
Eric Dumazete326bed2010-04-22 00:22:45 -07004240 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004241 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07004242 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07004243 void *have;
4244
Linus Torvalds1da177e2005-04-16 15:20:36 -07004245 local_irq_disable();
4246
Eric Dumazete326bed2010-04-22 00:22:45 -07004247 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004248 struct napi_struct *n;
4249 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004250
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004251		/* If the softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08004252		 * Allow this to run for 2 jiffies, which allows
4253		 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004254 */
Eric Dumazetd1f41b62013-03-05 07:15:13 +00004255 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004256 goto softnet_break;
4257
4258 local_irq_enable();
4259
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004260 /* Even though interrupts have been re-enabled, this
4261 * access is safe because interrupts can only add new
4262 * entries to the tail of this list, and only ->poll()
4263 * calls can remove this head entry from the list.
4264 */
Eric Dumazete326bed2010-04-22 00:22:45 -07004265 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004266
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004267 have = netpoll_poll_lock(n);
4268
4269 weight = n->weight;
4270
David S. Miller0a7606c2007-10-29 21:28:47 -07004271 /* This NAPI_STATE_SCHED test is for avoiding a race
4272 * with netpoll's poll_napi(). Only the entity which
4273 * obtains the lock and sees NAPI_STATE_SCHED set will
4274 * actually make the ->poll() call. Therefore we avoid
Lucas De Marchi25985ed2011-03-30 22:57:33 -03004275 * accidentally calling ->poll() when NAPI is not scheduled.
David S. Miller0a7606c2007-10-29 21:28:47 -07004276 */
4277 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00004278 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07004279 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00004280 trace_napi_poll(n);
4281 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004282
4283 WARN_ON_ONCE(work > weight);
4284
4285 budget -= work;
4286
4287 local_irq_disable();
4288
4289 /* Drivers must not modify the NAPI state if they
4290 * consume the entire weight. In such cases this code
4291 * still "owns" the NAPI instance and therefore can
4292 * move the instance around on the list at-will.
4293 */
David S. Millerfed17f32008-01-07 21:00:40 -08004294 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07004295 if (unlikely(napi_disable_pending(n))) {
4296 local_irq_enable();
4297 napi_complete(n);
4298 local_irq_disable();
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004299 } else {
4300 if (n->gro_list) {
4301 /* flush too old packets
4302 * If HZ < 1000, flush all packets.
4303 */
4304 local_irq_enable();
4305 napi_gro_flush(n, HZ >= 1000);
4306 local_irq_disable();
4307 }
Eric Dumazete326bed2010-04-22 00:22:45 -07004308 list_move_tail(&n->poll_list, &sd->poll_list);
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004309 }
David S. Millerfed17f32008-01-07 21:00:40 -08004310 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004311
4312 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004313 }
4314out:
Eric Dumazete326bed2010-04-22 00:22:45 -07004315 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00004316
Chris Leechdb217332006-06-17 21:24:58 -07004317#ifdef CONFIG_NET_DMA
4318 /*
4319 * There may not be any more sk_buffs coming right now, so push
4320 * any pending DMA copies to hardware
4321 */
Dan Williams2ba05622009-01-06 11:38:14 -07004322 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07004323#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004324
Linus Torvalds1da177e2005-04-16 15:20:36 -07004325 return;
4326
4327softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00004328 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004329 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4330 goto out;
4331}
4332
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004333struct netdev_upper {
4334 struct net_device *dev;
4335 bool master;
4336 struct list_head list;
4337 struct rcu_head rcu;
4338 struct list_head search_list;
4339};
4340
4341static void __append_search_uppers(struct list_head *search_list,
4342 struct net_device *dev)
4343{
4344 struct netdev_upper *upper;
4345
4346 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4347 /* check if this upper is not already in search list */
4348 if (list_empty(&upper->search_list))
4349 list_add_tail(&upper->search_list, search_list);
4350 }
4351}
4352
4353static bool __netdev_search_upper_dev(struct net_device *dev,
4354 struct net_device *upper_dev)
4355{
4356 LIST_HEAD(search_list);
4357 struct netdev_upper *upper;
4358 struct netdev_upper *tmp;
4359 bool ret = false;
4360
4361 __append_search_uppers(&search_list, dev);
4362 list_for_each_entry(upper, &search_list, search_list) {
4363 if (upper->dev == upper_dev) {
4364 ret = true;
4365 break;
4366 }
4367 __append_search_uppers(&search_list, upper->dev);
4368 }
4369 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4370 INIT_LIST_HEAD(&upper->search_list);
4371 return ret;
4372}
4373
4374static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4375 struct net_device *upper_dev)
4376{
4377 struct netdev_upper *upper;
4378
4379 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4380 if (upper->dev == upper_dev)
4381 return upper;
4382 }
4383 return NULL;
4384}
4385
4386/**
4387 * netdev_has_upper_dev - Check if device is linked to an upper device
4388 * @dev: device
4389 * @upper_dev: upper device to check
4390 *
4391 * Find out if a device is linked to the specified upper device and return true
4392 * if it is. Note that this checks only the immediate upper device,
4393 * not the complete stack of devices. The caller must hold the RTNL lock.
4394 */
4395bool netdev_has_upper_dev(struct net_device *dev,
4396 struct net_device *upper_dev)
4397{
4398 ASSERT_RTNL();
4399
4400 return __netdev_find_upper(dev, upper_dev);
4401}
4402EXPORT_SYMBOL(netdev_has_upper_dev);
4403
4404/**
4405 * netdev_has_any_upper_dev - Check if device is linked to some device
4406 * @dev: device
4407 *
4408 * Find out if a device is linked to an upper device and return true in case
4409 * it is. The caller must hold the RTNL lock.
4410 */
4411bool netdev_has_any_upper_dev(struct net_device *dev)
4412{
4413 ASSERT_RTNL();
4414
4415 return !list_empty(&dev->upper_dev_list);
4416}
4417EXPORT_SYMBOL(netdev_has_any_upper_dev);
4418
4419/**
4420 * netdev_master_upper_dev_get - Get master upper device
4421 * @dev: device
4422 *
4423 * Find a master upper device and return pointer to it or NULL in case
4424 * it's not there. The caller must hold the RTNL lock.
4425 */
4426struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4427{
4428 struct netdev_upper *upper;
4429
4430 ASSERT_RTNL();
4431
4432 if (list_empty(&dev->upper_dev_list))
4433 return NULL;
4434
4435 upper = list_first_entry(&dev->upper_dev_list,
4436 struct netdev_upper, list);
4437 if (likely(upper->master))
4438 return upper->dev;
4439 return NULL;
4440}
4441EXPORT_SYMBOL(netdev_master_upper_dev_get);
4442
4443/**
4444 * netdev_master_upper_dev_get_rcu - Get master upper device
4445 * @dev: device
4446 *
4447 * Find a master upper device and return pointer to it or NULL in case
4448 * it's not there. The caller must hold the RCU read lock.
4449 */
4450struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4451{
4452 struct netdev_upper *upper;
4453
4454 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4455 struct netdev_upper, list);
4456 if (upper && likely(upper->master))
4457 return upper->dev;
4458 return NULL;
4459}
4460EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4461
4462static int __netdev_upper_dev_link(struct net_device *dev,
4463 struct net_device *upper_dev, bool master)
4464{
4465 struct netdev_upper *upper;
4466
4467 ASSERT_RTNL();
4468
4469 if (dev == upper_dev)
4470 return -EBUSY;
4471
4472	/* To prevent loops, check that dev is not already an upper device of upper_dev. */
4473 if (__netdev_search_upper_dev(upper_dev, dev))
4474 return -EBUSY;
4475
4476 if (__netdev_find_upper(dev, upper_dev))
4477 return -EEXIST;
4478
4479 if (master && netdev_master_upper_dev_get(dev))
4480 return -EBUSY;
4481
4482 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4483 if (!upper)
4484 return -ENOMEM;
4485
4486 upper->dev = upper_dev;
4487 upper->master = master;
4488 INIT_LIST_HEAD(&upper->search_list);
4489
4490	/* Ensure that the master upper link is always the first item in the list. */
4491 if (master)
4492 list_add_rcu(&upper->list, &dev->upper_dev_list);
4493 else
4494 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4495 dev_hold(upper_dev);
Jiri Pirko42e52bf2013-05-25 04:12:10 +00004496 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004497 return 0;
4498}
4499
4500/**
4501 * netdev_upper_dev_link - Add a link to the upper device
4502 * @dev: device
4503 * @upper_dev: new upper device
4504 *
4505 * Adds a link to device which is upper to this one. The caller must hold
4506 * the RTNL lock. On a failure a negative errno code is returned.
4507 * On success the reference counts are adjusted and the function
4508 * returns zero.
4509 */
4510int netdev_upper_dev_link(struct net_device *dev,
4511 struct net_device *upper_dev)
4512{
4513 return __netdev_upper_dev_link(dev, upper_dev, false);
4514}
4515EXPORT_SYMBOL(netdev_upper_dev_link);
4516
4517/**
4518 * netdev_master_upper_dev_link - Add a master link to the upper device
4519 * @dev: device
4520 * @upper_dev: new upper device
4521 *
4522 * Adds a link to device which is upper to this one. In this case, only
4523 * one master upper device can be linked, although other non-master devices
4524 * might be linked as well. The caller must hold the RTNL lock.
4525 * On a failure a negative errno code is returned. On success the reference
4526 * counts are adjusted and the function returns zero.
4527 */
4528int netdev_master_upper_dev_link(struct net_device *dev,
4529 struct net_device *upper_dev)
4530{
4531 return __netdev_upper_dev_link(dev, upper_dev, true);
4532}
4533EXPORT_SYMBOL(netdev_master_upper_dev_link);
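/*
 * Example usage sketch (hypothetical bonding-style ndo_add_slave, called
 * under RTNL; bond_dev and slave_dev are assumptions): record the
 * master/slave relationship so the stack can walk the device topology.
 *
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev);
 *	if (err)
 *		return err;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, bond_dev);	// on release
 */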
4534
4535/**
4536 * netdev_upper_dev_unlink - Removes a link to upper device
4537 * @dev: device
4538 * @upper_dev: new upper device
4539 *
4540 * Removes a link to device which is upper to this one. The caller must hold
4541 * the RTNL lock.
4542 */
4543void netdev_upper_dev_unlink(struct net_device *dev,
4544 struct net_device *upper_dev)
4545{
4546 struct netdev_upper *upper;
4547
4548 ASSERT_RTNL();
4549
4550 upper = __netdev_find_upper(dev, upper_dev);
4551 if (!upper)
4552 return;
4553 list_del_rcu(&upper->list);
4554 dev_put(upper_dev);
4555 kfree_rcu(upper, rcu);
Jiri Pirko42e52bf2013-05-25 04:12:10 +00004556 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00004557}
4558EXPORT_SYMBOL(netdev_upper_dev_unlink);
4559
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004560static void dev_change_rx_flags(struct net_device *dev, int flags)
4561{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004562 const struct net_device_ops *ops = dev->netdev_ops;
4563
4564 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4565 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004566}
4567
Wang Chendad9b332008-06-18 01:48:28 -07004568static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07004569{
Eric Dumazetb536db92011-11-30 21:42:26 +00004570 unsigned int old_flags = dev->flags;
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06004571 kuid_t uid;
4572 kgid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07004573
Patrick McHardy24023452007-07-14 18:51:31 -07004574 ASSERT_RTNL();
4575
Wang Chendad9b332008-06-18 01:48:28 -07004576 dev->flags |= IFF_PROMISC;
4577 dev->promiscuity += inc;
4578 if (dev->promiscuity == 0) {
4579 /*
4580 * Avoid overflow.
4581		 * If inc causes an overflow, leave promiscuity untouched and return an error.
4582 */
4583 if (inc < 0)
4584 dev->flags &= ~IFF_PROMISC;
4585 else {
4586 dev->promiscuity -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004587 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4588 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07004589 return -EOVERFLOW;
4590 }
4591 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004592 if (dev->flags != old_flags) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004593 pr_info("device %s %s promiscuous mode\n",
4594 dev->name,
4595 dev->flags & IFF_PROMISC ? "entered" : "left");
David Howells8192b0c2008-11-14 10:39:10 +11004596 if (audit_enabled) {
4597 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004598 audit_log(current->audit_context, GFP_ATOMIC,
4599 AUDIT_ANOM_PROMISCUOUS,
4600 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4601 dev->name, (dev->flags & IFF_PROMISC),
4602 (old_flags & IFF_PROMISC),
Eric W. Biedermane1760bd2012-09-10 22:39:43 -07004603 from_kuid(&init_user_ns, audit_get_loginuid(current)),
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06004604 from_kuid(&init_user_ns, uid),
4605 from_kgid(&init_user_ns, gid),
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004606 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11004607 }
Patrick McHardy24023452007-07-14 18:51:31 -07004608
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004609 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07004610 }
Wang Chendad9b332008-06-18 01:48:28 -07004611 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004612}
4613
Linus Torvalds1da177e2005-04-16 15:20:36 -07004614/**
4615 * dev_set_promiscuity - update promiscuity count on a device
4616 * @dev: device
4617 * @inc: modifier
4618 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07004619 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004620 * remains above zero the interface remains promiscuous. Once it hits zero
4621 * the device reverts back to normal filtering operation. A negative inc
4622 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07004623 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004624 */
Wang Chendad9b332008-06-18 01:48:28 -07004625int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004626{
Eric Dumazetb536db92011-11-30 21:42:26 +00004627 unsigned int old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07004628 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004629
Wang Chendad9b332008-06-18 01:48:28 -07004630 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07004631 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07004632 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07004633 if (dev->flags != old_flags)
4634 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07004635 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004636}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004637EXPORT_SYMBOL(dev_set_promiscuity);
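/*
 * Example usage sketch (hypothetical packet-capture style caller outside
 * dev.c; dev is an assumed held reference): take and later drop one
 * promiscuity reference under RTNL.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);
 *	rtnl_unlock();
 */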
Linus Torvalds1da177e2005-04-16 15:20:36 -07004638
4639/**
4640 * dev_set_allmulti - update allmulti count on a device
4641 * @dev: device
4642 * @inc: modifier
4643 *
4644 * Add or remove reception of all multicast frames to a device. While the
4645 * count in the device remains above zero the interface remains listening
4646 * to all interfaces. Once it hits zero the device reverts back to normal
4647 * filtering operation. A negative @inc value is used to drop the counter
4648 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07004649 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004650 */
4651
Wang Chendad9b332008-06-18 01:48:28 -07004652int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004653{
Eric Dumazetb536db92011-11-30 21:42:26 +00004654 unsigned int old_flags = dev->flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004655
Patrick McHardy24023452007-07-14 18:51:31 -07004656 ASSERT_RTNL();
4657
Linus Torvalds1da177e2005-04-16 15:20:36 -07004658 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07004659 dev->allmulti += inc;
4660 if (dev->allmulti == 0) {
4661 /*
4662 * Avoid overflow.
4663		 * If inc causes an overflow, leave allmulti untouched and return an error.
4664 */
4665 if (inc < 0)
4666 dev->flags &= ~IFF_ALLMULTI;
4667 else {
4668 dev->allmulti -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004669 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4670 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07004671 return -EOVERFLOW;
4672 }
4673 }
Patrick McHardy24023452007-07-14 18:51:31 -07004674 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004675 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07004676 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07004677 }
Wang Chendad9b332008-06-18 01:48:28 -07004678 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004679}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004680EXPORT_SYMBOL(dev_set_allmulti);
Patrick McHardy4417da62007-06-27 01:28:10 -07004681
4682/*
4683 * Upload unicast and multicast address lists to device and
4684 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08004685 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07004686 * are present.
4687 */
4688void __dev_set_rx_mode(struct net_device *dev)
4689{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004690 const struct net_device_ops *ops = dev->netdev_ops;
4691
Patrick McHardy4417da62007-06-27 01:28:10 -07004692 /* dev_open will call this function so the list will stay sane. */
4693 if (!(dev->flags&IFF_UP))
4694 return;
4695
4696 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09004697 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07004698
Jiri Pirko01789342011-08-16 06:29:00 +00004699 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004700 /* Unicast addresses changes may only happen under the rtnl,
4701 * therefore calling __dev_set_promiscuity here is safe.
4702 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004703 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004704 __dev_set_promiscuity(dev, 1);
Joe Perches2d348d12011-07-25 16:17:35 -07004705 dev->uc_promisc = true;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004706 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004707 __dev_set_promiscuity(dev, -1);
Joe Perches2d348d12011-07-25 16:17:35 -07004708 dev->uc_promisc = false;
Patrick McHardy4417da62007-06-27 01:28:10 -07004709 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004710 }
Jiri Pirko01789342011-08-16 06:29:00 +00004711
4712 if (ops->ndo_set_rx_mode)
4713 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004714}
4715
4716void dev_set_rx_mode(struct net_device *dev)
4717{
David S. Millerb9e40852008-07-15 00:15:08 -07004718 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004719 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07004720 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004721}
4722
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004723/**
4724 * dev_get_flags - get flags reported to userspace
4725 * @dev: device
4726 *
4727 * Get the combination of flag bits exported through APIs to userspace.
4728 */
Eric Dumazet95c96172012-04-15 05:58:06 +00004729unsigned int dev_get_flags(const struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004730{
Eric Dumazet95c96172012-04-15 05:58:06 +00004731 unsigned int flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004732
4733 flags = (dev->flags & ~(IFF_PROMISC |
4734 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08004735 IFF_RUNNING |
4736 IFF_LOWER_UP |
4737 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07004738 (dev->gflags & (IFF_PROMISC |
4739 IFF_ALLMULTI));
4740
Stefan Rompfb00055a2006-03-20 17:09:11 -08004741 if (netif_running(dev)) {
4742 if (netif_oper_up(dev))
4743 flags |= IFF_RUNNING;
4744 if (netif_carrier_ok(dev))
4745 flags |= IFF_LOWER_UP;
4746 if (netif_dormant(dev))
4747 flags |= IFF_DORMANT;
4748 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004749
4750 return flags;
4751}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004752EXPORT_SYMBOL(dev_get_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004753
Patrick McHardybd380812010-02-26 06:34:53 +00004754int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004755{
Eric Dumazetb536db92011-11-30 21:42:26 +00004756 unsigned int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004757 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004758
Patrick McHardy24023452007-07-14 18:51:31 -07004759 ASSERT_RTNL();
4760
Linus Torvalds1da177e2005-04-16 15:20:36 -07004761 /*
4762 * Set the flags on our device.
4763 */
4764
4765 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4766 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4767 IFF_AUTOMEDIA)) |
4768 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4769 IFF_ALLMULTI));
4770
4771 /*
4772 * Load in the correct multicast list now the flags have changed.
4773 */
4774
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004775 if ((old_flags ^ flags) & IFF_MULTICAST)
4776 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07004777
Patrick McHardy4417da62007-06-27 01:28:10 -07004778 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004779
4780 /*
4781 * Have we downed the interface. We handle IFF_UP ourselves
4782	 * Have we downed the interface? We handle IFF_UP ourselves
4783 * setting it.
4784 */
4785
4786 ret = 0;
4787 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00004788 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004789
4790 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07004791 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004792 }
4793
Linus Torvalds1da177e2005-04-16 15:20:36 -07004794 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004795 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4796
Linus Torvalds1da177e2005-04-16 15:20:36 -07004797 dev->gflags ^= IFF_PROMISC;
4798 dev_set_promiscuity(dev, inc);
4799 }
4800
4801	/* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4802	   is important. Some (broken) drivers set IFF_PROMISC when
4803	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4804 */
4805 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004806 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4807
Linus Torvalds1da177e2005-04-16 15:20:36 -07004808 dev->gflags ^= IFF_ALLMULTI;
4809 dev_set_allmulti(dev, inc);
4810 }
4811
Patrick McHardybd380812010-02-26 06:34:53 +00004812 return ret;
4813}
4814
4815void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4816{
4817 unsigned int changes = dev->flags ^ old_flags;
4818
4819 if (changes & IFF_UP) {
4820 if (dev->flags & IFF_UP)
4821 call_netdevice_notifiers(NETDEV_UP, dev);
4822 else
4823 call_netdevice_notifiers(NETDEV_DOWN, dev);
4824 }
4825
4826 if (dev->flags & IFF_UP &&
Jiri Pirkobe9efd32013-05-28 01:30:22 +00004827 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
4828 struct netdev_notifier_change_info change_info;
4829
4830 change_info.flags_changed = changes;
4831 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
4832 &change_info.info);
4833 }
Patrick McHardybd380812010-02-26 06:34:53 +00004834}
4835
4836/**
4837 * dev_change_flags - change device settings
4838 * @dev: device
4839 * @flags: device state flags
4840 *
4841 * Change settings on device based state flags. The flags are
4842 * in the userspace exported format.
4843 */
Eric Dumazetb536db92011-11-30 21:42:26 +00004844int dev_change_flags(struct net_device *dev, unsigned int flags)
Patrick McHardybd380812010-02-26 06:34:53 +00004845{
Eric Dumazetb536db92011-11-30 21:42:26 +00004846 int ret;
4847 unsigned int changes, old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004848
4849 ret = __dev_change_flags(dev, flags);
4850 if (ret < 0)
4851 return ret;
4852
4853 changes = old_flags ^ dev->flags;
Thomas Graf7c355f52007-06-05 16:03:03 -07004854 if (changes)
4855 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004856
Patrick McHardybd380812010-02-26 06:34:53 +00004857 __dev_notify_flags(dev, old_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004858 return ret;
4859}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004860EXPORT_SYMBOL(dev_change_flags);
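/*
 * Example usage sketch (hypothetical in-kernel caller; dev is an assumed
 * held reference): bring an interface administratively up under RTNL.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */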
Linus Torvalds1da177e2005-04-16 15:20:36 -07004861
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004862/**
4863 * dev_set_mtu - Change maximum transfer unit
4864 * @dev: device
4865 * @new_mtu: new transfer unit
4866 *
4867 * Change the maximum transfer size of the network device.
4868 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004869int dev_set_mtu(struct net_device *dev, int new_mtu)
4870{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004871 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004872 int err;
4873
4874 if (new_mtu == dev->mtu)
4875 return 0;
4876
4877 /* MTU must be positive. */
4878 if (new_mtu < 0)
4879 return -EINVAL;
4880
4881 if (!netif_device_present(dev))
4882 return -ENODEV;
4883
4884 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004885 if (ops->ndo_change_mtu)
4886 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004887 else
4888 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004889
Jiri Pirkoe3d8fab2012-12-03 01:16:32 +00004890 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004891 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004892 return err;
4893}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004894EXPORT_SYMBOL(dev_set_mtu);
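/*
 * Example usage sketch (hypothetical caller; the 9000 byte jumbo MTU is an
 * arbitrary value): change the MTU under RTNL and check the result.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 *	if (err)
 *		netdev_err(dev, "failed to set MTU: %d\n", err);
 */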
Linus Torvalds1da177e2005-04-16 15:20:36 -07004895
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004896/**
Vlad Dogarucbda10f2011-01-13 23:38:30 +00004897 * dev_set_group - Change group this device belongs to
4898 * @dev: device
4899 * @new_group: group this device should belong to
4900 */
4901void dev_set_group(struct net_device *dev, int new_group)
4902{
4903 dev->group = new_group;
4904}
4905EXPORT_SYMBOL(dev_set_group);
4906
4907/**
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004908 * dev_set_mac_address - Change Media Access Control Address
4909 * @dev: device
4910 * @sa: new address
4911 *
4912 * Change the hardware (MAC) address of the device
4913 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004914int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4915{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004916 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004917 int err;
4918
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004919 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004920 return -EOPNOTSUPP;
4921 if (sa->sa_family != dev->type)
4922 return -EINVAL;
4923 if (!netif_device_present(dev))
4924 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004925 err = ops->ndo_set_mac_address(dev, sa);
Jiri Pirkof6521512013-01-01 03:30:14 +00004926 if (err)
4927 return err;
Jiri Pirkofbdeca22013-01-01 03:30:16 +00004928 dev->addr_assign_type = NET_ADDR_SET;
Jiri Pirkof6521512013-01-01 03:30:14 +00004929 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04004930 add_device_randomness(dev->dev_addr, dev->addr_len);
Jiri Pirkof6521512013-01-01 03:30:14 +00004931 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004932}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004933EXPORT_SYMBOL(dev_set_mac_address);
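/*
 * Example usage sketch (hypothetical caller under RTNL; new_addr is an
 * assumed buffer of dev->addr_len bytes): build the sockaddr the same way
 * the ioctl path does and hand it to dev_set_mac_address().
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_addr, dev->addr_len);
 *	err = dev_set_mac_address(dev, &sa);
 */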
Linus Torvalds1da177e2005-04-16 15:20:36 -07004934
Jiri Pirko4bf84c32012-12-27 23:49:37 +00004935/**
4936 * dev_change_carrier - Change device carrier
4937 * @dev: device
Randy Dunlap691b3b72013-03-04 12:32:43 +00004938 * @new_carrier: new value
Jiri Pirko4bf84c32012-12-27 23:49:37 +00004939 *
4940 * Change device carrier
4941 */
4942int dev_change_carrier(struct net_device *dev, bool new_carrier)
4943{
4944 const struct net_device_ops *ops = dev->netdev_ops;
4945
4946 if (!ops->ndo_change_carrier)
4947 return -EOPNOTSUPP;
4948 if (!netif_device_present(dev))
4949 return -ENODEV;
4950 return ops->ndo_change_carrier(dev, new_carrier);
4951}
4952EXPORT_SYMBOL(dev_change_carrier);
4953
Linus Torvalds1da177e2005-04-16 15:20:36 -07004954/**
4955 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004956 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004957 *
4958 * Returns a suitable unique value for a new device interface
4959 * number. The caller must hold the rtnl semaphore or the
4960 * dev_base_lock to be sure it remains unique.
4961 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07004962static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004963{
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00004964 int ifindex = net->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004965 for (;;) {
4966 if (++ifindex <= 0)
4967 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004968 if (!__dev_get_by_index(net, ifindex))
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00004969 return net->ifindex = ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004970 }
4971}
4972
Linus Torvalds1da177e2005-04-16 15:20:36 -07004973/* Delayed registration/unregisteration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08004974static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004975
Stephen Hemminger6f05f622007-03-08 20:46:03 -08004976static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004977{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004978 list_add_tail(&dev->todo_list, &net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004979}
4980
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004981static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004982{
Krishna Kumare93737b2009-12-08 22:26:02 +00004983 struct net_device *dev, *tmp;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004984
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004985 BUG_ON(dev_boot_phase);
4986 ASSERT_RTNL();
4987
Krishna Kumare93737b2009-12-08 22:26:02 +00004988 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004989 /* Some devices call without registering
Krishna Kumare93737b2009-12-08 22:26:02 +00004990 * for initialization unwind. Remove those
4991 * devices and proceed with the remaining.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004992 */
4993 if (dev->reg_state == NETREG_UNINITIALIZED) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00004994 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4995 dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004996
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004997 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00004998 list_del(&dev->unreg_list);
4999 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005000 }
Eric Dumazet449f4542011-05-19 12:24:16 +00005001 dev->dismantle = true;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005002 BUG_ON(dev->reg_state != NETREG_REGISTERED);
Octavian Purdila44345722010-12-13 12:44:07 +00005003 }
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005004
Octavian Purdila44345722010-12-13 12:44:07 +00005005 /* If device is running, close it first. */
5006 dev_close_many(head);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005007
Octavian Purdila44345722010-12-13 12:44:07 +00005008 list_for_each_entry(dev, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005009 /* And unlink it from device chain. */
5010 unlist_netdevice(dev);
5011
5012 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005013 }
5014
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005015 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005016
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005017 list_for_each_entry(dev, head, unreg_list) {
5018 /* Shutdown queueing discipline. */
5019 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005020
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005021
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005022		/* Notify protocols that we are about to destroy
5023		   this device. They should clean up all of their state.
5024 */
5025 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5026
Patrick McHardya2835762010-02-26 06:34:51 +00005027 if (!dev->rtnl_link_ops ||
5028 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5029 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5030
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005031 /*
5032 * Flush the unicast and multicast chains
5033 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005034 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005035 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005036
5037 if (dev->netdev_ops->ndo_uninit)
5038 dev->netdev_ops->ndo_uninit(dev);
5039
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005040 /* Notifier chain MUST detach us all upper devices. */
5041 WARN_ON(netdev_has_any_upper_dev(dev));
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005042
5043 /* Remove entries from kobject tree */
5044 netdev_unregister_kobject(dev);
Alexander Duyck024e9672013-01-10 08:57:46 +00005045#ifdef CONFIG_XPS
5046 /* Remove XPS queueing entries */
5047 netif_reset_xps_queues_gt(dev, 0);
5048#endif
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005049 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005050
Eric W. Biederman850a5452011-10-13 22:25:23 +00005051 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005052
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005053 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005054 dev_put(dev);
5055}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005056
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005057static void rollback_registered(struct net_device *dev)
5058{
5059 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005060
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005061 list_add(&dev->unreg_list, &single);
5062 rollback_registered_many(&single);
Eric Dumazetceaaec92011-02-17 22:59:19 +00005063 list_del(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005064}
5065
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005066static netdev_features_t netdev_fix_features(struct net_device *dev,
5067 netdev_features_t features)
Herbert Xub63365a2008-10-23 01:11:29 -07005068{
Michał Mirosław57422dc2011-01-22 12:14:12 +00005069 /* Fix illegal checksum combinations */
5070 if ((features & NETIF_F_HW_CSUM) &&
5071 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005072 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
Michał Mirosław57422dc2011-01-22 12:14:12 +00005073 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5074 }
5075
Herbert Xub63365a2008-10-23 01:11:29 -07005076 /* TSO requires that SG is present as well. */
Ben Hutchingsea2d3682011-04-12 14:38:37 +00005077 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005078 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
Ben Hutchingsea2d3682011-04-12 14:38:37 +00005079 features &= ~NETIF_F_ALL_TSO;
Herbert Xub63365a2008-10-23 01:11:29 -07005080 }
5081
Pravin B Shelarec5f0612013-03-07 09:28:01 +00005082 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5083 !(features & NETIF_F_IP_CSUM)) {
5084 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5085 features &= ~NETIF_F_TSO;
5086 features &= ~NETIF_F_TSO_ECN;
5087 }
5088
5089 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5090 !(features & NETIF_F_IPV6_CSUM)) {
5091 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5092 features &= ~NETIF_F_TSO6;
5093 }
5094
Ben Hutchings31d8b9e2011-04-12 14:47:15 +00005095 /* TSO ECN requires that TSO is present as well. */
5096 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5097 features &= ~NETIF_F_TSO_ECN;
5098
Michał Mirosław212b5732011-02-15 16:59:16 +00005099 /* Software GSO depends on SG. */
5100 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005101 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
Michał Mirosław212b5732011-02-15 16:59:16 +00005102 features &= ~NETIF_F_GSO;
5103 }
5104
Michał Mirosławacd11302011-01-24 15:45:15 -08005105 /* UFO needs SG and checksumming */
Herbert Xub63365a2008-10-23 01:11:29 -07005106 if (features & NETIF_F_UFO) {
Michał Mirosław79032642010-11-30 06:38:00 +00005107 /* maybe split UFO into V4 and V6? */
5108 if (!((features & NETIF_F_GEN_CSUM) ||
5109 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5110 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005111 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08005112 "Dropping NETIF_F_UFO since no checksum offload features.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07005113 features &= ~NETIF_F_UFO;
5114 }
5115
5116 if (!(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04005117 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08005118 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07005119 features &= ~NETIF_F_UFO;
5120 }
5121 }
5122
5123 return features;
5124}
Herbert Xub63365a2008-10-23 01:11:29 -07005125
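/*
 * Illustrative example (not part of the original file): the fixups above are
 * cumulative.  A hypothetical feature set that asks for TSO without
 * scatter/gather loses every TSO flag:
 *
 *	netdev_features_t f = NETIF_F_IP_CSUM | NETIF_F_TSO;
 *
 *	f = netdev_fix_features(dev, f);
 *	- NETIF_F_ALL_TSO is cleared because NETIF_F_SG is missing,
 *	  so only NETIF_F_IP_CSUM remains set in f.
 */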
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005126int __netdev_update_features(struct net_device *dev)
Michał Mirosław5455c692011-02-15 16:59:17 +00005127{
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005128 netdev_features_t features;
Michał Mirosław5455c692011-02-15 16:59:17 +00005129 int err = 0;
5130
Michał Mirosław87267482011-04-12 09:56:38 +00005131 ASSERT_RTNL();
5132
Michał Mirosław5455c692011-02-15 16:59:17 +00005133 features = netdev_get_wanted_features(dev);
5134
5135 if (dev->netdev_ops->ndo_fix_features)
5136 features = dev->netdev_ops->ndo_fix_features(dev, features);
5137
5138 /* driver might be less strict about feature dependencies */
5139 features = netdev_fix_features(dev, features);
5140
5141 if (dev->features == features)
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005142 return 0;
Michał Mirosław5455c692011-02-15 16:59:17 +00005143
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005144 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5145 &dev->features, &features);
Michał Mirosław5455c692011-02-15 16:59:17 +00005146
5147 if (dev->netdev_ops->ndo_set_features)
5148 err = dev->netdev_ops->ndo_set_features(dev, features);
5149
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005150 if (unlikely(err < 0)) {
Michał Mirosław5455c692011-02-15 16:59:17 +00005151 netdev_err(dev,
Michał Mirosławc8f44af2011-11-15 15:29:55 +00005152 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5153 err, &features, &dev->features);
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005154 return -1;
5155 }
5156
5157 if (!err)
5158 dev->features = features;
5159
5160 return 1;
5161}
5162
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005163/**
5164 * netdev_update_features - recalculate device features
5165 * @dev: the device to check
5166 *
5167 * Recalculate dev->features set and send notifications if it
5168 * has changed. Should be called after driver- or hardware-dependent
5169 * conditions that influence the features might have changed.
5170 */
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005171void netdev_update_features(struct net_device *dev)
5172{
5173 if (__netdev_update_features(dev))
5174 netdev_features_change(dev);
Michał Mirosław5455c692011-02-15 16:59:17 +00005175}
5176EXPORT_SYMBOL(netdev_update_features);
5177
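/*
 * Usage sketch (illustrative, hypothetical driver code): a driver that turns
 * an offload on or off at runtime would typically do, with rtnl held:
 *
 *	rtnl_lock();
 *	if (fw_lost_tso_support)		- hypothetical condition
 *		dev->hw_features &= ~NETIF_F_ALL_TSO;
 *	netdev_update_features(dev);
 *	rtnl_unlock();
 *
 * The helper re-runs ndo_fix_features()/netdev_fix_features() and only
 * notifies userspace when dev->features actually changed.
 */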
Linus Torvalds1da177e2005-04-16 15:20:36 -07005178/**
Michał Mirosławafe12cc2011-05-07 03:22:17 +00005179 * netdev_change_features - recalculate device features
5180 * @dev: the device to check
5181 *
5182 * Recalculate dev->features set and send notifications even
5183 * if they have not changed. Should be called instead of
5184 * netdev_update_features() if also dev->vlan_features might
5185 * have changed to allow the changes to be propagated to stacked
5186 * VLAN devices.
5187 */
5188void netdev_change_features(struct net_device *dev)
5189{
5190 __netdev_update_features(dev);
5191 netdev_features_change(dev);
5192}
5193EXPORT_SYMBOL(netdev_change_features);
5194
5195/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005196 * netif_stacked_transfer_operstate - transfer operstate
5197 * @rootdev: the root or lower level device to transfer state from
5198 * @dev: the device to transfer operstate to
5199 *
5200 * Transfer operational state from root to device. This is normally
5201 * called when a stacking relationship exists between the root
5202 * device and the device (a leaf device).
5203 */
5204void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5205 struct net_device *dev)
5206{
5207 if (rootdev->operstate == IF_OPER_DORMANT)
5208 netif_dormant_on(dev);
5209 else
5210 netif_dormant_off(dev);
5211
5212 if (netif_carrier_ok(rootdev)) {
5213 if (!netif_carrier_ok(dev))
5214 netif_carrier_on(dev);
5215 } else {
5216 if (netif_carrier_ok(dev))
5217 netif_carrier_off(dev);
5218 }
5219}
5220EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5221
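/*
 * Usage sketch (illustrative, not taken from this file): stacking drivers
 * such as 802.1q VLAN or macvlan typically call this from their netdevice
 * notifier handling so the virtual device mirrors its lower device:
 *
 *	netif_stacked_transfer_operstate(lowerdev, vlandev);
 *
 * where "lowerdev" is the real device and "vlandev" the stacked one (both
 * names are placeholders).
 */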
Tom Herbertbf264142010-11-26 08:36:09 +00005222#ifdef CONFIG_RPS
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005223static int netif_alloc_rx_queues(struct net_device *dev)
5224{
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005225 unsigned int i, count = dev->num_rx_queues;
Tom Herbertbd25fa72010-10-18 18:00:16 +00005226 struct netdev_rx_queue *rx;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005227
Tom Herbertbd25fa72010-10-18 18:00:16 +00005228 BUG_ON(count < 1);
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005229
Tom Herbertbd25fa72010-10-18 18:00:16 +00005230 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005231 if (!rx)
Tom Herbertbd25fa72010-10-18 18:00:16 +00005232 return -ENOMEM;
Joe Perches62b59422013-02-04 16:48:16 +00005233
Tom Herbertbd25fa72010-10-18 18:00:16 +00005234 dev->_rx = rx;
5235
Tom Herbertbd25fa72010-10-18 18:00:16 +00005236 for (i = 0; i < count; i++)
Tom Herbertfe822242010-11-09 10:47:38 +00005237 rx[i].dev = dev;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005238 return 0;
5239}
Tom Herbertbf264142010-11-26 08:36:09 +00005240#endif
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005241
Changli Gaoaa942102010-12-04 02:31:41 +00005242static void netdev_init_one_queue(struct net_device *dev,
5243 struct netdev_queue *queue, void *_unused)
5244{
5245 /* Initialize queue lock */
5246 spin_lock_init(&queue->_xmit_lock);
5247 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5248 queue->xmit_lock_owner = -1;
Changli Gaob236da62010-12-14 03:09:15 +00005249 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
Changli Gaoaa942102010-12-04 02:31:41 +00005250 queue->dev = dev;
Tom Herbert114cf582011-11-28 16:33:09 +00005251#ifdef CONFIG_BQL
5252 dql_init(&queue->dql, HZ);
5253#endif
Changli Gaoaa942102010-12-04 02:31:41 +00005254}
5255
Tom Herberte6484932010-10-18 18:04:39 +00005256static int netif_alloc_netdev_queues(struct net_device *dev)
5257{
5258 unsigned int count = dev->num_tx_queues;
5259 struct netdev_queue *tx;
5260
5261 BUG_ON(count < 1);
5262
5263 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005264 if (!tx)
Tom Herberte6484932010-10-18 18:04:39 +00005265 return -ENOMEM;
Joe Perches62b59422013-02-04 16:48:16 +00005266
Tom Herberte6484932010-10-18 18:04:39 +00005267 dev->_tx = tx;
Tom Herbert1d24eb42010-11-21 13:17:27 +00005268
Tom Herberte6484932010-10-18 18:04:39 +00005269 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5270 spin_lock_init(&dev->tx_global_lock);
Changli Gaoaa942102010-12-04 02:31:41 +00005271
5272 return 0;
Tom Herberte6484932010-10-18 18:04:39 +00005273}
5274
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005275/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005276 * register_netdevice - register a network device
5277 * @dev: device to register
5278 *
5279 * Take a completed network device structure and add it to the kernel
5280 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5281 * chain. 0 is returned on success. A negative errno code is returned
5282 * on a failure to set up the device, or if the name is a duplicate.
5283 *
5284 * Callers must hold the rtnl semaphore. You may want
5285 * register_netdev() instead of this.
5286 *
5287 * BUGS:
5288 * The locking appears insufficient to guarantee two parallel registers
5289 * will not get the same name.
5290 */
5291
5292int register_netdevice(struct net_device *dev)
5293{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005294 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005295 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005296
5297 BUG_ON(dev_boot_phase);
5298 ASSERT_RTNL();
5299
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005300 might_sleep();
5301
Linus Torvalds1da177e2005-04-16 15:20:36 -07005302 /* When net_device's are persistent, this will be fatal. */
5303 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005304 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005305
David S. Millerf1f28aa2008-07-15 00:08:33 -07005306 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07005307 netdev_set_addr_lockdep_class(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005308
Linus Torvalds1da177e2005-04-16 15:20:36 -07005309 dev->iflink = -1;
5310
Gao feng828de4f2012-09-13 20:58:27 +00005311 ret = dev_get_valid_name(net, dev, dev->name);
Peter Pan(潘卫平)0696c3a2011-05-12 15:46:56 +00005312 if (ret < 0)
5313 goto out;
5314
Linus Torvalds1da177e2005-04-16 15:20:36 -07005315 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005316 if (dev->netdev_ops->ndo_init) {
5317 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005318 if (ret) {
5319 if (ret > 0)
5320 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08005321 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005322 }
5323 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005324
Patrick McHardyf6469682013-04-19 02:04:27 +00005325 if (((dev->hw_features | dev->features) &
5326 NETIF_F_HW_VLAN_CTAG_FILTER) &&
Michał Mirosławd2ed2732013-01-29 15:14:16 +00005327 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5328 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5329 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5330 ret = -EINVAL;
5331 goto err_uninit;
5332 }
5333
Pavel Emelyanov9c7dafb2012-08-08 21:52:46 +00005334 ret = -EBUSY;
5335 if (!dev->ifindex)
5336 dev->ifindex = dev_new_index(net);
5337 else if (__dev_get_by_index(net, dev->ifindex))
5338 goto err_uninit;
5339
Linus Torvalds1da177e2005-04-16 15:20:36 -07005340 if (dev->iflink == -1)
5341 dev->iflink = dev->ifindex;
5342
Michał Mirosław5455c692011-02-15 16:59:17 +00005343 /* Transfer changeable features to wanted_features and enable
5344 * software offloads (GSO and GRO).
5345 */
5346 dev->hw_features |= NETIF_F_SOFT_FEATURES;
Michał Mirosław14d12322011-02-22 16:52:28 +00005347 dev->features |= NETIF_F_SOFT_FEATURES;
5348 dev->wanted_features = dev->features & dev->hw_features;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005349
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005350 /* Turn on no cache copy if HW is doing checksum */
Michał Mirosław34324dc2011-11-15 15:29:55 +00005351 if (!(dev->flags & IFF_LOOPBACK)) {
5352 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5353 if (dev->features & NETIF_F_ALL_CSUM) {
5354 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5355 dev->features |= NETIF_F_NOCACHE_COPY;
5356 }
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07005357 }
5358
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005359 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
Brandon Philips16c3ea72010-09-15 09:24:24 +00005360 */
Michał Mirosław1180e7d2011-07-14 14:41:11 -07005361 dev->vlan_features |= NETIF_F_HIGHDMA;
Brandon Philips16c3ea72010-09-15 09:24:24 +00005362
Pravin B Shelaree579672013-03-07 09:28:08 +00005363 /* Make NETIF_F_SG inheritable to tunnel devices.
5364 */
5365 dev->hw_enc_features |= NETIF_F_SG;
5366
Simon Horman0d89d202013-05-23 21:02:52 +00005367 /* Make NETIF_F_SG inheritable to MPLS.
5368 */
5369 dev->mpls_features |= NETIF_F_SG;
5370
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005371 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5372 ret = notifier_to_errno(ret);
5373 if (ret)
5374 goto err_uninit;
5375
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005376 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005377 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005378 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005379 dev->reg_state = NETREG_REGISTERED;
5380
Michał Mirosław6cb6a272011-04-02 22:48:47 -07005381 __netdev_update_features(dev);
Michał Mirosław8e9b59b2011-02-22 16:52:28 +00005382
Linus Torvalds1da177e2005-04-16 15:20:36 -07005383 /*
5384 * Default initial state at registry is that the
5385 * device is present.
5386 */
5387
5388 set_bit(__LINK_STATE_PRESENT, &dev->state);
5389
Ben Hutchings8f4cccb2012-08-20 22:16:51 +01005390 linkwatch_init_dev(dev);
5391
Linus Torvalds1da177e2005-04-16 15:20:36 -07005392 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005393 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005394 list_netdevice(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04005395 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005396
Jiri Pirko948b3372013-01-08 01:38:25 +00005397 /* If the device has a permanent device address, the driver should
5398 * set dev_addr, and addr_assign_type should be set to
5399 * NET_ADDR_PERM (the default value).
5400 */
5401 if (dev->addr_assign_type == NET_ADDR_PERM)
5402 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5403
Linus Torvalds1da177e2005-04-16 15:20:36 -07005404 /* Notify protocols that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005405 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005406 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005407 if (ret) {
5408 rollback_registered(dev);
5409 dev->reg_state = NETREG_UNREGISTERED;
5410 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005411 /*
5412 * Prevent userspace races by waiting until the network
5413 * device is fully setup before sending notifications.
5414 */
Patrick McHardya2835762010-02-26 06:34:51 +00005415 if (!dev->rtnl_link_ops ||
5416 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5417 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005418
5419out:
5420 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005421
5422err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005423 if (dev->netdev_ops->ndo_uninit)
5424 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005425 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005426}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005427EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005428
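/*
 * Usage sketch (illustrative): callers that already hold rtnl, e.g. rtnl_link
 * ->newlink() implementations, use register_netdevice() directly; everyone
 * else should use register_netdev() below.  A hypothetical caller:
 *
 *	ASSERT_RTNL();
 *	err = register_netdevice(dev);
 *	if (err)
 *		goto err_free;		- registration failed, the caller
 *					  still owns dev and must free_netdev() it
 */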
5429/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005430 * init_dummy_netdev - init a dummy network device for NAPI
5431 * @dev: device to init
5432 *
5433 * This takes a network device structure and initializes the minimum
5434 * amount of fields so it can be used to schedule NAPI polls without
5435 * registering a full blown interface. This is to be used by drivers
5436 * that need to tie several hardware interfaces to a single NAPI
5437 * poll scheduler due to HW limitations.
5438 */
5439int init_dummy_netdev(struct net_device *dev)
5440{
5441 /* Clear everything. Note we don't initialize spinlocks
5442 * as they aren't supposed to be taken by any of the
5443 * NAPI code and this dummy netdev is supposed to be
5444 * only ever used for NAPI polls
5445 */
5446 memset(dev, 0, sizeof(struct net_device));
5447
5448 /* make sure we BUG if trying to hit standard
5449 * register/unregister code path
5450 */
5451 dev->reg_state = NETREG_DUMMY;
5452
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005453 /* NAPI wants this */
5454 INIT_LIST_HEAD(&dev->napi_list);
5455
5456 /* a dummy interface is started by default */
5457 set_bit(__LINK_STATE_PRESENT, &dev->state);
5458 set_bit(__LINK_STATE_START, &dev->state);
5459
Eric Dumazet29b44332010-10-11 10:22:12 +00005460 /* Note : We don't allocate pcpu_refcnt for dummy devices,
5461 * because users of this 'device' don't need to change
5462 * its refcount.
5463 */
5464
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005465 return 0;
5466}
5467EXPORT_SYMBOL_GPL(init_dummy_netdev);
5468
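/*
 * Usage sketch (illustrative, hypothetical driver): hardware with a single
 * interrupt but several netdevs can hang its NAPI context off a dummy device,
 * roughly:
 *
 *	static struct net_device dummy_dev;
 *	static struct napi_struct napi;
 *
 *	init_dummy_netdev(&dummy_dev);
 *	netif_napi_add(&dummy_dev, &napi, my_poll, 64);
 *	napi_enable(&napi);
 *
 * "my_poll" stands for the driver's poll routine; the dummy device is never
 * registered and never goes through the usual unregister/free paths.
 */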
5469
5470/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005471 * register_netdev - register a network device
5472 * @dev: device to register
5473 *
5474 * Take a completed network device structure and add it to the kernel
5475 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5476 * chain. 0 is returned on success. A negative errno code is returned
5477 * on a failure to set up the device, or if the name is a duplicate.
5478 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005479 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005480 * and expands the device name if you passed a format string to
5481 * alloc_netdev.
5482 */
5483int register_netdev(struct net_device *dev)
5484{
5485 int err;
5486
5487 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005488 err = register_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005489 rtnl_unlock();
5490 return err;
5491}
5492EXPORT_SYMBOL(register_netdev);
5493
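/*
 * Usage sketch (illustrative, hypothetical probe path):
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));	- my_priv is made up
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &my_netdev_ops;		- hypothetical ops
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */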
Eric Dumazet29b44332010-10-11 10:22:12 +00005494int netdev_refcnt_read(const struct net_device *dev)
5495{
5496 int i, refcnt = 0;
5497
5498 for_each_possible_cpu(i)
5499 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5500 return refcnt;
5501}
5502EXPORT_SYMBOL(netdev_refcnt_read);
5503
Ben Hutchings2c530402012-07-10 10:55:09 +00005504/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005505 * netdev_wait_allrefs - wait until all references are gone.
Randy Dunlap3de7a372012-08-18 14:36:44 +00005506 * @dev: target net_device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005507 *
5508 * This is called when unregistering network devices.
5509 *
5510 * Any protocol or device that holds a reference should register
5511 * for netdevice notification, and cleanup and put back the
5512 * reference if they receive an UNREGISTER event.
5513 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005514 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005515 */
5516static void netdev_wait_allrefs(struct net_device *dev)
5517{
5518 unsigned long rebroadcast_time, warning_time;
Eric Dumazet29b44332010-10-11 10:22:12 +00005519 int refcnt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005520
Eric Dumazete014deb2009-11-17 05:59:21 +00005521 linkwatch_forget_dev(dev);
5522
Linus Torvalds1da177e2005-04-16 15:20:36 -07005523 rebroadcast_time = warning_time = jiffies;
Eric Dumazet29b44332010-10-11 10:22:12 +00005524 refcnt = netdev_refcnt_read(dev);
5525
5526 while (refcnt != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005527 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005528 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005529
5530 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005531 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005532
Eric Dumazet748e2d92012-08-22 21:50:59 +00005533 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005534 rcu_barrier();
Eric Dumazet748e2d92012-08-22 21:50:59 +00005535 rtnl_lock();
5536
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005537 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005538 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5539 &dev->state)) {
5540 /* We must not have linkwatch events
5541 * pending on unregister. If this
5542 * happens, we simply run the queue
5543 * unscheduled, resulting in a noop
5544 * for this device.
5545 */
5546 linkwatch_run_queue();
5547 }
5548
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005549 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005550
5551 rebroadcast_time = jiffies;
5552 }
5553
5554 msleep(250);
5555
Eric Dumazet29b44332010-10-11 10:22:12 +00005556 refcnt = netdev_refcnt_read(dev);
5557
Linus Torvalds1da177e2005-04-16 15:20:36 -07005558 if (time_after(jiffies, warning_time + 10 * HZ)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005559 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5560 dev->name, refcnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005561 warning_time = jiffies;
5562 }
5563 }
5564}
5565
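/*
 * Illustrative note: the wait above only finishes once every long-lived user
 * of the device has dropped its reference, i.e. every dev_hold() has been
 * balanced by a dev_put(), e.g. (hypothetical subsystem code):
 *
 *	dev_hold(dev);		- while we cache a pointer to the device
 *	...
 *	dev_put(dev);		- from our NETDEV_UNREGISTER handling,
 *				  or whenever we are done with it
 */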
5566/* The sequence is:
5567 *
5568 * rtnl_lock();
5569 * ...
5570 * register_netdevice(x1);
5571 * register_netdevice(x2);
5572 * ...
5573 * unregister_netdevice(y1);
5574 * unregister_netdevice(y2);
5575 * ...
5576 * rtnl_unlock();
5577 * free_netdev(y1);
5578 * free_netdev(y2);
5579 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07005580 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07005581 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005582 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07005583 * without deadlocking with linkwatch via keventd.
5584 * 2) Since we run with the RTNL semaphore not held, we can sleep
5585 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07005586 *
5587 * We must not return until all unregister events added during
5588 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005589 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005590void netdev_run_todo(void)
5591{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005592 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005593
Linus Torvalds1da177e2005-04-16 15:20:36 -07005594 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005595 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07005596
5597 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005598
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005599
5600 /* Wait for rcu callbacks to finish before next phase */
Eric W. Biederman850a5452011-10-13 22:25:23 +00005601 if (!list_empty(&list))
5602 rcu_barrier();
5603
Linus Torvalds1da177e2005-04-16 15:20:36 -07005604 while (!list_empty(&list)) {
5605 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00005606 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005607 list_del(&dev->todo_list);
5608
Eric Dumazet748e2d92012-08-22 21:50:59 +00005609 rtnl_lock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005610 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric Dumazet748e2d92012-08-22 21:50:59 +00005611 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00005612
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005613 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005614 pr_err("network todo '%s' but state %d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07005615 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005616 dump_stack();
5617 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005618 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005619
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005620 dev->reg_state = NETREG_UNREGISTERED;
5621
Changli Gao152102c2010-03-30 20:16:22 +00005622 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07005623
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005624 netdev_wait_allrefs(dev);
5625
5626 /* paranoia */
Eric Dumazet29b44332010-10-11 10:22:12 +00005627 BUG_ON(netdev_refcnt_read(dev));
Eric Dumazet33d480c2011-08-11 19:30:52 +00005628 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5629 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07005630 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005631
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005632 if (dev->destructor)
5633 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07005634
5635 /* Free network device */
5636 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005637 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005638}
5639
Ben Hutchings3cfde792010-07-09 09:11:52 +00005640/* Convert net_device_stats to rtnl_link_stats64. They have the same
5641 * fields in the same order, with only the type differing.
5642 */
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005643void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5644 const struct net_device_stats *netdev_stats)
Ben Hutchings3cfde792010-07-09 09:11:52 +00005645{
5646#if BITS_PER_LONG == 64
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005647 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5648 memcpy(stats64, netdev_stats, sizeof(*stats64));
Ben Hutchings3cfde792010-07-09 09:11:52 +00005649#else
5650 size_t i, n = sizeof(*stats64) / sizeof(u64);
5651 const unsigned long *src = (const unsigned long *)netdev_stats;
5652 u64 *dst = (u64 *)stats64;
5653
5654 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5655 sizeof(*stats64) / sizeof(u64));
5656 for (i = 0; i < n; i++)
5657 dst[i] = src[i];
5658#endif
5659}
Eric Dumazet77a1abf2012-03-05 04:50:09 +00005660EXPORT_SYMBOL(netdev_stats_to_stats64);
Ben Hutchings3cfde792010-07-09 09:11:52 +00005661
Eric Dumazetd83345a2009-11-16 03:36:51 +00005662/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005663 * dev_get_stats - get network device statistics
5664 * @dev: device to get statistics from
Eric Dumazet28172732010-07-07 14:58:56 -07005665 * @storage: place to store stats
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005666 *
Ben Hutchingsd7753512010-07-09 09:12:41 +00005667 * Get network statistics from device. Return @storage.
5668 * The device driver may provide its own method by setting
5669 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5670 * otherwise the internal statistics structure is used.
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005671 */
Ben Hutchingsd7753512010-07-09 09:12:41 +00005672struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5673 struct rtnl_link_stats64 *storage)
Eric Dumazet7004bf22009-05-18 00:34:33 +00005674{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005675 const struct net_device_ops *ops = dev->netdev_ops;
5676
Eric Dumazet28172732010-07-07 14:58:56 -07005677 if (ops->ndo_get_stats64) {
5678 memset(storage, 0, sizeof(*storage));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005679 ops->ndo_get_stats64(dev, storage);
5680 } else if (ops->ndo_get_stats) {
Ben Hutchings3cfde792010-07-09 09:11:52 +00005681 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005682 } else {
5683 netdev_stats_to_stats64(storage, &dev->stats);
Eric Dumazet28172732010-07-07 14:58:56 -07005684 }
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005685 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
Eric Dumazet28172732010-07-07 14:58:56 -07005686 return storage;
Rusty Russellc45d2862007-03-28 14:29:08 -07005687}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005688EXPORT_SYMBOL(dev_get_stats);
Rusty Russellc45d2862007-03-28 14:29:08 -07005689
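/*
 * Illustrative sketch (hypothetical driver): a driver opts in to 64-bit
 * statistics by implementing ndo_get_stats64 and filling in the structure
 * that dev_get_stats() has zeroed for it (see netdevice.h for the exact
 * prototype):
 *
 *	static struct rtnl_link_stats64 *
 *	my_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s)
 *	{
 *		s->rx_packets = ...;	- read from hypothetical HW counters
 *		s->tx_packets = ...;
 *		return s;
 *	}
 *
 * Drivers without it fall back to ndo_get_stats or plain dev->stats,
 * converted by netdev_stats_to_stats64() above.
 */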
Eric Dumazet24824a02010-10-02 06:11:55 +00005690struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
David S. Millerdc2b4842008-07-08 17:18:23 -07005691{
Eric Dumazet24824a02010-10-02 06:11:55 +00005692 struct netdev_queue *queue = dev_ingress_queue(dev);
David S. Millerdc2b4842008-07-08 17:18:23 -07005693
Eric Dumazet24824a02010-10-02 06:11:55 +00005694#ifdef CONFIG_NET_CLS_ACT
5695 if (queue)
5696 return queue;
5697 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5698 if (!queue)
5699 return NULL;
5700 netdev_init_one_queue(dev, queue, NULL);
Eric Dumazet24824a02010-10-02 06:11:55 +00005701 queue->qdisc = &noop_qdisc;
5702 queue->qdisc_sleeping = &noop_qdisc;
5703 rcu_assign_pointer(dev->ingress_queue, queue);
5704#endif
5705 return queue;
David S. Millerbb949fb2008-07-08 16:55:56 -07005706}
5707
Eric Dumazet2c60db02012-09-16 09:17:26 +00005708static const struct ethtool_ops default_ethtool_ops;
5709
Stanislaw Gruszkad07d7502013-01-10 23:19:10 +00005710void netdev_set_default_ethtool_ops(struct net_device *dev,
5711 const struct ethtool_ops *ops)
5712{
5713 if (dev->ethtool_ops == &default_ethtool_ops)
5714 dev->ethtool_ops = ops;
5715}
5716EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5717
Linus Torvalds1da177e2005-04-16 15:20:36 -07005718/**
Tom Herbert36909ea2011-01-09 19:36:31 +00005719 * alloc_netdev_mqs - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005720 * @sizeof_priv: size of private data to allocate space for
5721 * @name: device name format string
5722 * @setup: callback to initialize device
Tom Herbert36909ea2011-01-09 19:36:31 +00005723 * @txqs: the number of TX subqueues to allocate
5724 * @rxqs: the number of RX subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005725 *
5726 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005727 * and performs basic initialization. Also allocates subqueue structs
Tom Herbert36909ea2011-01-09 19:36:31 +00005728 * for each queue on the device.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005729 */
Tom Herbert36909ea2011-01-09 19:36:31 +00005730struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5731 void (*setup)(struct net_device *),
5732 unsigned int txqs, unsigned int rxqs)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005733{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005734 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07005735 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005736 struct net_device *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005737
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005738 BUG_ON(strlen(name) >= sizeof(dev->name));
5739
Tom Herbert36909ea2011-01-09 19:36:31 +00005740 if (txqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005741 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
Tom Herbert55513fb2010-10-18 17:55:58 +00005742 return NULL;
5743 }
5744
Tom Herbert36909ea2011-01-09 19:36:31 +00005745#ifdef CONFIG_RPS
5746 if (rxqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005747 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
Tom Herbert36909ea2011-01-09 19:36:31 +00005748 return NULL;
5749 }
5750#endif
5751
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005752 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005753 if (sizeof_priv) {
5754 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005755 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005756 alloc_size += sizeof_priv;
5757 }
5758 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005759 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005760
Paolo 'Blaisorblade' Giarrusso31380de2006-04-06 22:38:28 -07005761 p = kzalloc(alloc_size, GFP_KERNEL);
Joe Perches62b59422013-02-04 16:48:16 +00005762 if (!p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005763 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005764
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005765 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005766 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005767
Eric Dumazet29b44332010-10-11 10:22:12 +00005768 dev->pcpu_refcnt = alloc_percpu(int);
5769 if (!dev->pcpu_refcnt)
Tom Herberte6484932010-10-18 18:04:39 +00005770 goto free_p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005771
Linus Torvalds1da177e2005-04-16 15:20:36 -07005772 if (dev_addr_init(dev))
Eric Dumazet29b44332010-10-11 10:22:12 +00005773 goto free_pcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005774
Jiri Pirko22bedad32010-04-01 21:22:57 +00005775 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005776 dev_uc_init(dev);
Jiri Pirkoccffad252009-05-22 23:22:17 +00005777
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005778 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005779
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07005780 dev->gso_max_size = GSO_MAX_SIZE;
Ben Hutchings30b678d2012-07-30 15:57:00 +00005781 dev->gso_max_segs = GSO_MAX_SEGS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005782
Herbert Xud565b0a2008-12-15 23:38:52 -08005783 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005784 INIT_LIST_HEAD(&dev->unreg_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00005785 INIT_LIST_HEAD(&dev->link_watch_list);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005786 INIT_LIST_HEAD(&dev->upper_dev_list);
Eric Dumazet93f154b2009-05-18 22:19:19 -07005787 dev->priv_flags = IFF_XMIT_DST_RELEASE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005788 setup(dev);
David S. Miller8d3bdbd2011-02-08 15:02:50 -08005789
5790 dev->num_tx_queues = txqs;
5791 dev->real_num_tx_queues = txqs;
5792 if (netif_alloc_netdev_queues(dev))
5793 goto free_all;
5794
5795#ifdef CONFIG_RPS
5796 dev->num_rx_queues = rxqs;
5797 dev->real_num_rx_queues = rxqs;
5798 if (netif_alloc_rx_queues(dev))
5799 goto free_all;
5800#endif
5801
Linus Torvalds1da177e2005-04-16 15:20:36 -07005802 strcpy(dev->name, name);
Vlad Dogarucbda10f2011-01-13 23:38:30 +00005803 dev->group = INIT_NETDEV_GROUP;
Eric Dumazet2c60db02012-09-16 09:17:26 +00005804 if (!dev->ethtool_ops)
5805 dev->ethtool_ops = &default_ethtool_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005806 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005807
David S. Miller8d3bdbd2011-02-08 15:02:50 -08005808free_all:
5809 free_netdev(dev);
5810 return NULL;
5811
Eric Dumazet29b44332010-10-11 10:22:12 +00005812free_pcpu:
5813 free_percpu(dev->pcpu_refcnt);
Tom Herberted9af2e2010-11-09 10:47:30 +00005814 kfree(dev->_tx);
Tom Herbertfe822242010-11-09 10:47:38 +00005815#ifdef CONFIG_RPS
5816 kfree(dev->_rx);
5817#endif
5818
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005819free_p:
5820 kfree(p);
5821 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005822}
Tom Herbert36909ea2011-01-09 19:36:31 +00005823EXPORT_SYMBOL(alloc_netdev_mqs);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005824
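/*
 * Usage sketch (illustrative): multiqueue Ethernet drivers normally reach
 * this through a wrapper, e.g. (queue counts are made up):
 *
 *	dev = alloc_etherdev_mqs(sizeof(struct my_priv), 8, 8);
 *
 * which ends up here with txqs = rxqs = 8, while single-queue devices use
 * alloc_netdev(sizeof_priv, "name%d", setup), i.e. txqs = rxqs = 1.
 */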
5825/**
5826 * free_netdev - free network device
5827 * @dev: device
5828 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005829 * This function does the last stage of destroying an allocated device
5830 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005831 * If this is the last reference then it will be freed.
5832 */
5833void free_netdev(struct net_device *dev)
5834{
Herbert Xud565b0a2008-12-15 23:38:52 -08005835 struct napi_struct *p, *n;
5836
Denis V. Lunevf3005d72008-04-16 02:02:18 -07005837 release_net(dev_net(dev));
5838
David S. Millere8a04642008-07-17 00:34:19 -07005839 kfree(dev->_tx);
Tom Herbertfe822242010-11-09 10:47:38 +00005840#ifdef CONFIG_RPS
5841 kfree(dev->_rx);
5842#endif
David S. Millere8a04642008-07-17 00:34:19 -07005843
Eric Dumazet33d480c2011-08-11 19:30:52 +00005844 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
Eric Dumazet24824a02010-10-02 06:11:55 +00005845
Jiri Pirkof001fde2009-05-05 02:48:28 +00005846 /* Flush device addresses */
5847 dev_addr_flush(dev);
5848
Herbert Xud565b0a2008-12-15 23:38:52 -08005849 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5850 netif_napi_del(p);
5851
Eric Dumazet29b44332010-10-11 10:22:12 +00005852 free_percpu(dev->pcpu_refcnt);
5853 dev->pcpu_refcnt = NULL;
5854
Stephen Hemminger3041a062006-05-26 13:25:24 -07005855 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005856 if (dev->reg_state == NETREG_UNINITIALIZED) {
5857 kfree((char *)dev - dev->padded);
5858 return;
5859 }
5860
5861 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5862 dev->reg_state = NETREG_RELEASED;
5863
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07005864 /* will free via device release */
5865 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005866}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005867EXPORT_SYMBOL(free_netdev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005868
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005869/**
5870 * synchronize_net - Synchronize with packet receive processing
5871 *
5872 * Wait for packets currently being received to be done.
5873 * Does not block later packets from starting.
5874 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005875void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005876{
5877 might_sleep();
Eric Dumazetbe3fc412011-05-23 23:07:32 +00005878 if (rtnl_is_locked())
5879 synchronize_rcu_expedited();
5880 else
5881 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005882}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005883EXPORT_SYMBOL(synchronize_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005884
5885/**
Eric Dumazet44a08732009-10-27 07:03:04 +00005886 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07005887 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00005888 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08005889 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07005890 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005891 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00005892 * If head not NULL, device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005893 *
5894 * Callers must hold the rtnl semaphore. You may want
5895 * unregister_netdev() instead of this.
5896 */
5897
Eric Dumazet44a08732009-10-27 07:03:04 +00005898void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005899{
Herbert Xua6620712007-12-12 19:21:56 -08005900 ASSERT_RTNL();
5901
Eric Dumazet44a08732009-10-27 07:03:04 +00005902 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005903 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00005904 } else {
5905 rollback_registered(dev);
5906 /* Finish processing unregister after unlock */
5907 net_set_todo(dev);
5908 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005909}
Eric Dumazet44a08732009-10-27 07:03:04 +00005910EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005911
5912/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005913 * unregister_netdevice_many - unregister many devices
5914 * @head: list of devices
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005915 */
5916void unregister_netdevice_many(struct list_head *head)
5917{
5918 struct net_device *dev;
5919
5920 if (!list_empty(head)) {
5921 rollback_registered_many(head);
5922 list_for_each_entry(dev, head, unreg_list)
5923 net_set_todo(dev);
5924 }
5925}
Eric Dumazet63c80992009-10-27 07:06:49 +00005926EXPORT_SYMBOL(unregister_netdevice_many);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005927
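/*
 * Usage sketch (illustrative): batching several unregisters under a single
 * rtnl hold amortises the notifier and synchronize_net() cost:
 *
 *	LIST_HEAD(list);
 *
 *	rtnl_lock();
 *	for each device to be removed:			- pseudo-loop
 *		unregister_netdevice_queue(dev, &list);
 *	unregister_netdevice_many(&list);
 *	rtnl_unlock();
 *
 * default_device_exit_batch() further down follows exactly this pattern.
 */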
5928/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005929 * unregister_netdev - remove device from the kernel
5930 * @dev: device
5931 *
5932 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005933 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005934 *
5935 * This is just a wrapper for unregister_netdevice that takes
5936 * the rtnl semaphore. In general you want to use this and not
5937 * unregister_netdevice.
5938 */
5939void unregister_netdev(struct net_device *dev)
5940{
5941 rtnl_lock();
5942 unregister_netdevice(dev);
5943 rtnl_unlock();
5944}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005945EXPORT_SYMBOL(unregister_netdev);
5946
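/*
 * Usage sketch (illustrative, hypothetical remove path), mirroring the probe
 * sketch next to register_netdev():
 *
 *	unregister_netdev(dev);		- takes and releases rtnl itself
 *	free_netdev(dev);		- the actual free may be deferred
 *					  until the last reference is gone
 */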
Eric W. Biedermance286d32007-09-12 13:53:49 +02005947/**
5948 * dev_change_net_namespace - move device to a different network namespace
5949 * @dev: device
5950 * @net: network namespace
5951 * @pat: If not NULL name pattern to try if the current device name
5952 * is already taken in the destination network namespace.
5953 *
5954 * This function shuts down a device interface and moves it
5955 * to a new network namespace. On success 0 is returned, on
5956 * a failure a negative errno code is returned.
5957 *
5958 * Callers must hold the rtnl semaphore.
5959 */
5960
5961int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5962{
Eric W. Biedermance286d32007-09-12 13:53:49 +02005963 int err;
5964
5965 ASSERT_RTNL();
5966
5967 /* Don't allow namespace local devices to be moved. */
5968 err = -EINVAL;
5969 if (dev->features & NETIF_F_NETNS_LOCAL)
5970 goto out;
5971
5972 /* Ensure the device has been registered */
Eric W. Biedermance286d32007-09-12 13:53:49 +02005973 if (dev->reg_state != NETREG_REGISTERED)
5974 goto out;
5975
5976 /* Get out if there is nothing to do */
5977 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09005978 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005979 goto out;
5980
5981 /* Pick the destination device name, and ensure
5982 * we can use it in the destination network namespace.
5983 */
5984 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00005985 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005986 /* We get here if we can't use the current device name */
5987 if (!pat)
5988 goto out;
Gao feng828de4f2012-09-13 20:58:27 +00005989 if (dev_get_valid_name(net, dev, pat) < 0)
Eric W. Biedermance286d32007-09-12 13:53:49 +02005990 goto out;
5991 }
5992
5993 /*
5994 * And now a mini version of register_netdevice unregister_netdevice.
5995 */
5996
5997 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07005998 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005999
6000 /* And unlink it from device chain */
6001 err = -ENODEV;
6002 unlist_netdevice(dev);
6003
6004 synchronize_net();
6005
6006 /* Shutdown queueing discipline. */
6007 dev_shutdown(dev);
6008
6009 /* Notify protocols that we are about to destroy
6010 this device. They should clean all the things.
David Lamparter3b27e102010-09-17 03:22:19 +00006011
6012 Note that dev->reg_state stays at NETREG_REGISTERED.
6013 This is wanted because this way 8021q and macvlan know
6014 the device is just moving and can keep their slaves up.
Eric W. Biedermance286d32007-09-12 13:53:49 +02006015 */
6016 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Gao feng6549dd42012-08-23 15:36:55 +00006017 rcu_barrier();
6018 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric W. Biedermand2237d32011-10-21 06:24:20 +00006019 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006020
6021 /*
6022 * Flush the unicast and multicast chains
6023 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00006024 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00006025 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006026
Serge Hallyn4e66ae22012-12-03 16:17:12 +00006027 /* Send a netdev-removed uevent to the old namespace */
6028 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6029
Eric W. Biedermance286d32007-09-12 13:53:49 +02006030 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09006031 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006032
Eric W. Biedermance286d32007-09-12 13:53:49 +02006033 /* If there is an ifindex conflict assign a new one */
6034 if (__dev_get_by_index(net, dev->ifindex)) {
6035 int iflink = (dev->iflink == dev->ifindex);
6036 dev->ifindex = dev_new_index(net);
6037 if (iflink)
6038 dev->iflink = dev->ifindex;
6039 }
6040
Serge Hallyn4e66ae22012-12-03 16:17:12 +00006041 /* Send a netdev-add uevent to the new namespace */
6042 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6043
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006044 /* Fixup kobjects */
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07006045 err = device_rename(&dev->dev, dev->name);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006046 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006047
6048 /* Add the device back in the hashes */
6049 list_netdevice(dev);
6050
6051 /* Notify protocols, that a new device appeared. */
6052 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6053
Eric W. Biedermand90a9092009-12-12 22:11:15 +00006054 /*
6055 * Prevent userspace races by waiting until the network
6056 * device is fully setup before sending notifications.
6057 */
6058 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6059
Eric W. Biedermance286d32007-09-12 13:53:49 +02006060 synchronize_net();
6061 err = 0;
6062out:
6063 return err;
6064}
Johannes Berg463d0182009-07-14 00:33:35 +02006065EXPORT_SYMBOL_GPL(dev_change_net_namespace);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006066
Linus Torvalds1da177e2005-04-16 15:20:36 -07006067static int dev_cpu_callback(struct notifier_block *nfb,
6068 unsigned long action,
6069 void *ocpu)
6070{
6071 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006072 struct sk_buff *skb;
6073 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6074 struct softnet_data *sd, *oldsd;
6075
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006076 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006077 return NOTIFY_OK;
6078
6079 local_irq_disable();
6080 cpu = smp_processor_id();
6081 sd = &per_cpu(softnet_data, cpu);
6082 oldsd = &per_cpu(softnet_data, oldcpu);
6083
6084 /* Find end of our completion_queue. */
6085 list_skb = &sd->completion_queue;
6086 while (*list_skb)
6087 list_skb = &(*list_skb)->next;
6088 /* Append completion queue from offline CPU. */
6089 *list_skb = oldsd->completion_queue;
6090 oldsd->completion_queue = NULL;
6091
Linus Torvalds1da177e2005-04-16 15:20:36 -07006092 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00006093 if (oldsd->output_queue) {
6094 *sd->output_queue_tailp = oldsd->output_queue;
6095 sd->output_queue_tailp = oldsd->output_queue_tailp;
6096 oldsd->output_queue = NULL;
6097 oldsd->output_queue_tailp = &oldsd->output_queue;
6098 }
Heiko Carstens264524d2011-06-06 20:50:03 +00006099 /* Append NAPI poll list from offline CPU. */
6100 if (!list_empty(&oldsd->poll_list)) {
6101 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6102 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6103 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006104
6105 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6106 local_irq_enable();
6107
6108 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00006109 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6110 netif_rx(skb);
6111 input_queue_head_incr(oldsd);
6112 }
Tom Herbertfec5e652010-04-16 16:01:27 -07006113 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006114 netif_rx(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00006115 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07006116 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006117
6118 return NOTIFY_OK;
6119}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006120
6121
Herbert Xu7f353bf2007-08-10 15:47:58 -07006122/**
Herbert Xub63365a2008-10-23 01:11:29 -07006123 * netdev_increment_features - increment feature set by one
6124 * @all: current feature set
6125 * @one: new feature set
6126 * @mask: mask feature set
Herbert Xu7f353bf2007-08-10 15:47:58 -07006127 *
6128 * Computes a new feature set after adding a device with feature set
Herbert Xub63365a2008-10-23 01:11:29 -07006129 * @one to the master device with current feature set @all. Will not
6130 * enable anything that is off in @mask. Returns the new feature set.
Herbert Xu7f353bf2007-08-10 15:47:58 -07006131 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00006132netdev_features_t netdev_increment_features(netdev_features_t all,
6133 netdev_features_t one, netdev_features_t mask)
Herbert Xu7f353bf2007-08-10 15:47:58 -07006134{
Michał Mirosław1742f182011-04-22 06:31:16 +00006135 if (mask & NETIF_F_GEN_CSUM)
6136 mask |= NETIF_F_ALL_CSUM;
6137 mask |= NETIF_F_VLAN_CHALLENGED;
6138
6139 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6140 all &= one | ~NETIF_F_ALL_FOR_ALL;
6141
Michał Mirosław1742f182011-04-22 06:31:16 +00006142 /* If one device supports hw checksumming, set for all. */
6143 if (all & NETIF_F_GEN_CSUM)
6144 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
Herbert Xu7f353bf2007-08-10 15:47:58 -07006145
6146 return all;
6147}
Herbert Xub63365a2008-10-23 01:11:29 -07006148EXPORT_SYMBOL(netdev_increment_features);
Herbert Xu7f353bf2007-08-10 15:47:58 -07006149
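/*
 * Usage sketch (illustrative): aggregating drivers such as bonding or the
 * bridge recompute the master's feature set by folding in every slave,
 * roughly:
 *
 *	netdev_features_t features = mask;	- start from the allowed mask
 *
 *	for each slave:				- pseudo-loop
 *		features = netdev_increment_features(features,
 *						     slave->features, mask);
 *
 * NETIF_F_ALL_FOR_ALL bits survive only if every slave has them, while
 * NETIF_F_ONE_FOR_ALL bits are set as soon as one slave provides them.
 */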
Baruch Siach430f03c2013-06-02 20:43:55 +00006150static struct hlist_head * __net_init netdev_create_hash(void)
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006151{
6152 int i;
6153 struct hlist_head *hash;
6154
6155 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6156 if (hash != NULL)
6157 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6158 INIT_HLIST_HEAD(&hash[i]);
6159
6160 return hash;
6161}
6162
Eric W. Biederman881d9662007-09-17 11:56:21 -07006163/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07006164static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006165{
Rustad, Mark D734b6542012-07-18 09:06:07 +00006166 if (net != &init_net)
6167 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07006168
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006169 net->dev_name_head = netdev_create_hash();
6170 if (net->dev_name_head == NULL)
6171 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006172
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006173 net->dev_index_head = netdev_create_hash();
6174 if (net->dev_index_head == NULL)
6175 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006176
6177 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006178
6179err_idx:
6180 kfree(net->dev_name_head);
6181err_name:
6182 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006183}
6184
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006185/**
6186 * netdev_drivername - network driver for the device
6187 * @dev: network device
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006188 *
6189 * Determine network driver for device.
6190 */
David S. Miller3019de12011-06-06 16:41:33 -07006191const char *netdev_drivername(const struct net_device *dev)
Arjan van de Ven6579e572008-07-21 13:31:48 -07006192{
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07006193 const struct device_driver *driver;
6194 const struct device *parent;
David S. Miller3019de12011-06-06 16:41:33 -07006195 const char *empty = "";
Arjan van de Ven6579e572008-07-21 13:31:48 -07006196
6197 parent = dev->dev.parent;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006198 if (!parent)
David S. Miller3019de12011-06-06 16:41:33 -07006199 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006200
6201 driver = parent->driver;
6202 if (driver && driver->name)
David S. Miller3019de12011-06-06 16:41:33 -07006203 return driver->name;
6204 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006205}
6206
Joe Perchesb004ff42012-09-12 20:12:19 -07006207static int __netdev_printk(const char *level, const struct net_device *dev,
Joe Perches256df2f2010-06-27 01:02:35 +00006208 struct va_format *vaf)
6209{
6210 int r;
6211
Joe Perchesb004ff42012-09-12 20:12:19 -07006212 if (dev && dev->dev.parent) {
Joe Perches666f3552012-09-12 20:14:11 -07006213 r = dev_printk_emit(level[1] - '0',
6214 dev->dev.parent,
6215 "%s %s %s: %pV",
6216 dev_driver_string(dev->dev.parent),
6217 dev_name(dev->dev.parent),
6218 netdev_name(dev), vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006219 } else if (dev) {
Joe Perches256df2f2010-06-27 01:02:35 +00006220 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006221 } else {
Joe Perches256df2f2010-06-27 01:02:35 +00006222 r = printk("%s(NULL net_device): %pV", level, vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006223 }
Joe Perches256df2f2010-06-27 01:02:35 +00006224
6225 return r;
6226}
6227
6228int netdev_printk(const char *level, const struct net_device *dev,
6229 const char *format, ...)
6230{
6231 struct va_format vaf;
6232 va_list args;
6233 int r;
6234
6235 va_start(args, format);
6236
6237 vaf.fmt = format;
6238 vaf.va = &args;
6239
6240 r = __netdev_printk(level, dev, &vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07006241
Joe Perches256df2f2010-06-27 01:02:35 +00006242 va_end(args);
6243
6244 return r;
6245}
6246EXPORT_SYMBOL(netdev_printk);
6247
6248#define define_netdev_printk_level(func, level) \
6249int func(const struct net_device *dev, const char *fmt, ...) \
6250{ \
6251 int r; \
6252 struct va_format vaf; \
6253 va_list args; \
6254 \
6255 va_start(args, fmt); \
6256 \
6257 vaf.fmt = fmt; \
6258 vaf.va = &args; \
6259 \
6260 r = __netdev_printk(level, dev, &vaf); \
Joe Perchesb004ff42012-09-12 20:12:19 -07006261 \
Joe Perches256df2f2010-06-27 01:02:35 +00006262 va_end(args); \
6263 \
6264 return r; \
6265} \
6266EXPORT_SYMBOL(func);
6267
6268define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6269define_netdev_printk_level(netdev_alert, KERN_ALERT);
6270define_netdev_printk_level(netdev_crit, KERN_CRIT);
6271define_netdev_printk_level(netdev_err, KERN_ERR);
6272define_netdev_printk_level(netdev_warn, KERN_WARNING);
6273define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6274define_netdev_printk_level(netdev_info, KERN_INFO);
6275
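/*
 * Usage sketch (illustrative): drivers use these generated helpers instead of
 * raw printk() so that messages carry the bus, driver and interface names:
 *
 *	netdev_info(dev, "link up\n");
 *	netdev_err(dev, "tx timeout on queue %d\n", q);	- "q" is made up
 */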
Pavel Emelyanov46650792007-10-08 20:38:39 -07006276static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006277{
6278 kfree(net->dev_name_head);
6279 kfree(net->dev_index_head);
6280}
6281
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006282static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07006283 .init = netdev_init,
6284 .exit = netdev_exit,
6285};
6286
Pavel Emelyanov46650792007-10-08 20:38:39 -07006287static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02006288{
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006289 struct net_device *dev, *aux;
Eric W. Biedermance286d32007-09-12 13:53:49 +02006290 /*
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006291 * Push all migratable network devices back to the
Eric W. Biedermance286d32007-09-12 13:53:49 +02006292 * initial network namespace
6293 */
6294 rtnl_lock();
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006295 for_each_netdev_safe(net, dev, aux) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006296 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006297 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02006298
6299 /* Ignore unmoveable devices (i.e. loopback) */
6300 if (dev->features & NETIF_F_NETNS_LOCAL)
6301 continue;
6302
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006303 /* Leave virtual devices for the generic cleanup */
6304 if (dev->rtnl_link_ops)
6305 continue;
Eric W. Biedermand0c082c2008-11-05 15:59:38 -08006306
Lucas De Marchi25985ed2011-03-30 22:57:33 -03006307 /* Push remaining network devices to init_net */
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006308 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6309 err = dev_change_net_namespace(dev, &init_net, fb_name);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006310 if (err) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00006311 pr_emerg("%s: failed to move %s to init_net: %d\n",
6312 __func__, dev->name, err);
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006313 BUG();
Eric W. Biedermance286d32007-09-12 13:53:49 +02006314 }
6315 }
6316 rtnl_unlock();
6317}
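/* A minimal sketch (hypothetical, not part of this file) of the other side
 * of the NETIF_F_NETNS_LOCAL test above: a driver whose devices must never
 * migrate between namespaces sets the flag in its setup callback, so
 * default_device_exit() skips them instead of pushing them to init_net and
 * they are simply torn down together with their namespace:
 *
 *	static void my_setup(struct net_device *dev)
 *	{
 *		ether_setup(dev);
 *		dev->features |= NETIF_F_NETNS_LOCAL;
 *	}
 */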
6318
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006319static void __net_exit default_device_exit_batch(struct list_head *net_list)
6320{
6321	/* At exit all network devices must be removed from a network
Uwe Kleine-Königb5950762010-11-01 15:38:34 -04006322 * namespace. Do this in the reverse order of registration.
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006323 * Do this across as many network namespaces as possible to
6324 * improve batching efficiency.
6325 */
6326 struct net_device *dev;
6327 struct net *net;
6328 LIST_HEAD(dev_kill_list);
6329
6330 rtnl_lock();
6331 list_for_each_entry(net, net_list, exit_list) {
6332 for_each_netdev_reverse(net, dev) {
6333 if (dev->rtnl_link_ops)
6334 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6335 else
6336 unregister_netdevice_queue(dev, &dev_kill_list);
6337 }
6338 }
6339 unregister_netdevice_many(&dev_kill_list);
Eric Dumazetceaaec92011-02-17 22:59:19 +00006340 list_del(&dev_kill_list);
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006341 rtnl_unlock();
6342}
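/* Sketch (hypothetical, not part of this file) of how a virtual device makes
 * itself eligible for the dellink path used in the batch exit above: a device
 * created through rtnl_link_ops is deleted via ops->dellink(), which queues
 * it on dev_kill_list so a single unregister_netdevice_many() call covers the
 * whole namespace. A minimal ops table could reuse
 * unregister_netdevice_queue(), whose signature matches dellink():
 *
 *	static struct rtnl_link_ops my_link_ops __read_mostly = {
 *		.kind		= "mydev",
 *		.setup		= my_setup,
 *		.dellink	= unregister_netdevice_queue,
 *	};
 */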
6343
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006344static struct pernet_operations __net_initdata default_device_ops = {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006345 .exit = default_device_exit,
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006346 .exit_batch = default_device_exit_batch,
Eric W. Biedermance286d32007-09-12 13:53:49 +02006347};
6348
Linus Torvalds1da177e2005-04-16 15:20:36 -07006349/*
6350 * Initialize the DEV module. At boot time this walks the device list and
6351 * unhooks any devices that fail to initialise (normally hardware not
6352 * present) and leaves us with a valid list of present and active devices.
6353 *
6354 */
6355
6356/*
6357 * This is called single threaded during boot, so no need
6358 * to take the rtnl semaphore.
6359 */
6360static int __init net_dev_init(void)
6361{
6362 int i, rc = -ENOMEM;
6363
6364 BUG_ON(!dev_boot_phase);
6365
Linus Torvalds1da177e2005-04-16 15:20:36 -07006366 if (dev_proc_init())
6367 goto out;
6368
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006369 if (netdev_kobject_init())
Linus Torvalds1da177e2005-04-16 15:20:36 -07006370 goto out;
6371
6372 INIT_LIST_HEAD(&ptype_all);
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08006373 for (i = 0; i < PTYPE_HASH_SIZE; i++)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006374 INIT_LIST_HEAD(&ptype_base[i]);
6375
Vlad Yasevich62532da2012-11-15 08:49:10 +00006376 INIT_LIST_HEAD(&offload_base);
6377
Eric W. Biederman881d9662007-09-17 11:56:21 -07006378 if (register_pernet_subsys(&netdev_net_ops))
6379 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006380
6381 /*
6382 * Initialise the packet receive queues.
6383 */
6384
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07006385 for_each_possible_cpu(i) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006386 struct softnet_data *sd = &per_cpu(softnet_data, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006387
Changli Gaodee42872010-05-02 05:42:16 +00006388 memset(sd, 0, sizeof(*sd));
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006389 skb_queue_head_init(&sd->input_pkt_queue);
Changli Gao6e7676c2010-04-27 15:07:33 -07006390 skb_queue_head_init(&sd->process_queue);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006391 sd->completion_queue = NULL;
6392 INIT_LIST_HEAD(&sd->poll_list);
Changli Gaoa9cbd582010-04-26 23:06:24 +00006393 sd->output_queue = NULL;
6394 sd->output_queue_tailp = &sd->output_queue;
Eric Dumazetdf334542010-03-24 19:13:54 +00006395#ifdef CONFIG_RPS
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006396 sd->csd.func = rps_trigger_softirq;
6397 sd->csd.info = sd;
6398 sd->csd.flags = 0;
6399 sd->cpu = i;
Tom Herbert1e94d722010-03-18 17:45:44 -07006400#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00006401
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006402 sd->backlog.poll = process_backlog;
6403 sd->backlog.weight = weight_p;
6404 sd->backlog.gro_list = NULL;
6405 sd->backlog.gro_count = 0;
Willem de Bruijn99bbc702013-05-20 04:02:32 +00006406
6407#ifdef CONFIG_NET_FLOW_LIMIT
6408 sd->flow_limit = NULL;
6409#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006410 }
6411
Linus Torvalds1da177e2005-04-16 15:20:36 -07006412 dev_boot_phase = 0;
6413
Eric W. Biederman505d4f72008-11-07 22:54:20 -08006414	/* The loopback device is special: if any other network device
6415	 * is present in a network namespace, the loopback device must
6416	 * be present too. Since we now dynamically allocate and free the
6417	 * loopback device, ensure this invariant is maintained by
6418	 * keeping the loopback device as the first device on the
6419	 * list of network devices, so that it is the first device
6420	 * that appears and the last network device
6421	 * that disappears.
6422	 */
6423 if (register_pernet_device(&loopback_net_ops))
6424 goto out;
6425
6426 if (register_pernet_device(&default_device_ops))
6427 goto out;
6428
Carlos R. Mafra962cf362008-05-15 11:15:37 -03006429 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6430 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006431
6432 hotcpu_notifier(dev_cpu_callback, 0);
6433 dst_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006434 rc = 0;
6435out:
6436 return rc;
6437}
6438
6439subsys_initcall(net_dev_init);