/*
 * 	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 * 		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/sctp.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

258 * Our notifier list
259 */
260
Alan Sternf07d5b92006-05-09 15:23:03 -0700261static RAW_NOTIFIER_HEAD(netdev_chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700262
263/*
264 * Device drivers call our routines to queue packets here. We empty the
265 * queue in the local softnet handler.
266 */
Stephen Hemmingerbea33482007-10-03 16:41:36 -0700267
Eric Dumazet9958da02010-04-17 04:17:02 +0000268DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -0700269EXPORT_PER_CPU_SYMBOL(softnet_data);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700270
David S. Millercf508b12008-07-22 14:16:42 -0700271#ifdef CONFIG_LOCKDEP
Jarek Poplawski723e98b2007-05-15 22:46:18 -0700272/*
David S. Millerc773e842008-07-08 23:13:53 -0700273 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
Jarek Poplawski723e98b2007-05-15 22:46:18 -0700274 * according to dev->type
275 */
276static const unsigned short netdev_lock_type[] =
277 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
278 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
279 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
280 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
281 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
282 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
283 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
284 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
285 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
286 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
287 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
288 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
Paul Gortmaker211ed862012-05-10 17:14:35 -0400289 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
290 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
291 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
Jarek Poplawski723e98b2007-05-15 22:46:18 -0700292
Jan Engelhardt36cbd3d2009-08-05 10:42:58 -0700293static const char *const netdev_lock_name[] =
Jarek Poplawski723e98b2007-05-15 22:46:18 -0700294 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
295 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
296 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
297 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
298 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
299 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
300 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
301 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
302 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
303 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
304 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
305 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
Paul Gortmaker211ed862012-05-10 17:14:35 -0400306 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
307 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
308 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
Jarek Poplawski723e98b2007-05-15 22:46:18 -0700309
310static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
David S. Millercf508b12008-07-22 14:16:42 -0700311static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
Jarek Poplawski723e98b2007-05-15 22:46:18 -0700312
313static inline unsigned short netdev_lock_pos(unsigned short dev_type)
314{
315 int i;
316
317 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
318 if (netdev_lock_type[i] == dev_type)
319 return i;
320 /* the last key is used by default */
321 return ARRAY_SIZE(netdev_lock_type) - 1;
322}
323
David S. Millercf508b12008-07-22 14:16:42 -0700324static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
325 unsigned short dev_type)
Jarek Poplawski723e98b2007-05-15 22:46:18 -0700326{
327 int i;
328
329 i = netdev_lock_pos(dev_type);
330 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
331 netdev_lock_name[i]);
332}
David S. Millercf508b12008-07-22 14:16:42 -0700333
334static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
335{
336 int i;
337
338 i = netdev_lock_pos(dev->type);
339 lockdep_set_class_and_name(&dev->addr_list_lock,
340 &netdev_addr_lock_key[i],
341 netdev_lock_name[i]);
342}
Jarek Poplawski723e98b2007-05-15 22:46:18 -0700343#else
David S. Millercf508b12008-07-22 14:16:42 -0700344static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
345 unsigned short dev_type)
346{
347}
348static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
Jarek Poplawski723e98b2007-05-15 22:46:18 -0700349{
350}
351#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700352
/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
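
/*
 * Illustrative sketch (not part of this file): a hypothetical module can
 * use the handlers above to tap every incoming frame.  The names my_tap
 * and my_tap_rcv below are made up for the example.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt,
 *			      struct net_device *orig_dev)
 *	{
 *		// the handler owns this skb reference and must release it
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type = htons(ETH_P_ALL),	// tap all protocols
 *		.func = my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);		// e.g. from module init
 *	dev_remove_pack(&my_tap);	// from module exit; this sleeps
 */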


/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
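
/*
 * Example (illustrative): with the __setup() hook above, a kernel command
 * line entry such as
 *
 *	netdev=5,0x240,0,0,eth0
 *
 * is split by get_options() into irq=5, base_addr=0x240, mem_start=0 and
 * mem_end=0, and stored under the name "eth0" so netdev_boot_setup_check()
 * can apply it when a driver probes that device.
 */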

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of a interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. Following API allows
 *	user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
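
/*
 * Illustrative sketch of the two lookup styles above (hypothetical caller,
 * not part of this file):
 *
 *	struct net_device *dev;
 *
 *	// refcounted lookup: usable from any context, balanced by dev_put()
 *	dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		// ... use dev ...
 *		dev_put(dev);
 *	}
 *
 *	// RCU lookup: the pointer is only valid inside the read-side section
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		netdev_dbg(dev, "found\n");	// do not sleep, do not keep dev
 *	rcu_read_unlock();
 */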

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);
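
/*
 * For example, given the checks above: "eth0" and "wan%d" are accepted,
 * while "", ".", "..", "a/b", "a:b", names containing whitespace and names
 * of IFNAMSIZ or more characters are all rejected.
 */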

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
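
/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *
 * With dummy0 and dummy1 already registered this stores "dummy2" in
 * dev->name and returns 2.  A negative errno is returned on failure, e.g.
 * -EINVAL if the format contains a '%' conversion other than a single
 * "%d", or -ENFILE if no free slot is left.
 */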
Eric W. Biedermanb267b172007-09-12 13:48:45 +02001102
Gao feng828de4f2012-09-13 20:58:27 +00001103static int dev_alloc_name_ns(struct net *net,
1104 struct net_device *dev,
1105 const char *name)
Octavian Purdilad9031022009-11-18 02:36:59 +00001106{
Gao feng828de4f2012-09-13 20:58:27 +00001107 char buf[IFNAMSIZ];
1108 int ret;
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00001109
Gao feng828de4f2012-09-13 20:58:27 +00001110 ret = __dev_alloc_name(net, name, buf);
1111 if (ret >= 0)
1112 strlcpy(dev->name, buf, IFNAMSIZ);
1113 return ret;
1114}
1115
1116static int dev_get_valid_name(struct net *net,
1117 struct net_device *dev,
1118 const char *name)
1119{
1120 BUG_ON(!net);
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00001121
Octavian Purdilad9031022009-11-18 02:36:59 +00001122 if (!dev_valid_name(name))
1123 return -EINVAL;
1124
Jiri Pirko1c5cae82011-04-30 01:21:32 +00001125 if (strchr(name, '%'))
Gao feng828de4f2012-09-13 20:58:27 +00001126 return dev_alloc_name_ns(net, dev, name);
Octavian Purdilad9031022009-11-18 02:36:59 +00001127 else if (__dev_get_by_name(net, name))
1128 return -EEXIST;
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00001129 else if (dev->name != name)
1130 strlcpy(dev->name, name, IFNAMSIZ);
Octavian Purdilad9031022009-11-18 02:36:59 +00001131
1132 return 0;
1133}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001134
1135/**
1136 * dev_change_name - change name of a device
1137 * @dev: device
1138 * @newname: name (or format string) must be at least IFNAMSIZ
1139 *
1140 * Change name of a device, can pass format strings "eth%d".
1141 * for wildcarding.
1142 */
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07001143int dev_change_name(struct net_device *dev, const char *newname)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001144{
Tom Gundersen238fa362014-07-14 16:37:23 +02001145 unsigned char old_assign_type;
Herbert Xufcc5a032007-07-30 17:03:38 -07001146 char oldname[IFNAMSIZ];
Linus Torvalds1da177e2005-04-16 15:20:36 -07001147 int err = 0;
Herbert Xufcc5a032007-07-30 17:03:38 -07001148 int ret;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001149 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001150
1151 ASSERT_RTNL();
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001152 BUG_ON(!dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001153
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001154 net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001155 if (dev->flags & IFF_UP)
1156 return -EBUSY;
1157
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001158 write_seqcount_begin(&devnet_rename_seq);
Brian Haleyc91f6df2012-11-26 05:21:08 +00001159
1160 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001161 write_seqcount_end(&devnet_rename_seq);
Stephen Hemmingerc8d90dc2007-10-26 03:53:42 -07001162 return 0;
Brian Haleyc91f6df2012-11-26 05:21:08 +00001163 }
Stephen Hemmingerc8d90dc2007-10-26 03:53:42 -07001164
Herbert Xufcc5a032007-07-30 17:03:38 -07001165 memcpy(oldname, dev->name, IFNAMSIZ);
1166
Gao feng828de4f2012-09-13 20:58:27 +00001167 err = dev_get_valid_name(net, dev, newname);
Brian Haleyc91f6df2012-11-26 05:21:08 +00001168 if (err < 0) {
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001169 write_seqcount_end(&devnet_rename_seq);
Octavian Purdilad9031022009-11-18 02:36:59 +00001170 return err;
Brian Haleyc91f6df2012-11-26 05:21:08 +00001171 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001172
Veaceslav Falico6fe82a32014-07-17 20:33:32 +02001173 if (oldname[0] && !strchr(oldname, '%'))
1174 netdev_info(dev, "renamed from %s\n", oldname);
1175
Tom Gundersen238fa362014-07-14 16:37:23 +02001176 old_assign_type = dev->name_assign_type;
1177 dev->name_assign_type = NET_NAME_RENAMED;
1178
Herbert Xufcc5a032007-07-30 17:03:38 -07001179rollback:
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07001180 ret = device_rename(&dev->dev, dev->name);
1181 if (ret) {
1182 memcpy(dev->name, oldname, IFNAMSIZ);
Tom Gundersen238fa362014-07-14 16:37:23 +02001183 dev->name_assign_type = old_assign_type;
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001184 write_seqcount_end(&devnet_rename_seq);
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07001185 return ret;
Stephen Hemmingerdcc99772008-05-14 22:33:38 -07001186 }
Herbert Xu7f988ea2007-07-30 16:35:46 -07001187
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001188 write_seqcount_end(&devnet_rename_seq);
Brian Haleyc91f6df2012-11-26 05:21:08 +00001189
Veaceslav Falico5bb025f2014-01-14 21:58:51 +01001190 netdev_adjacent_rename_links(dev, oldname);
1191
Herbert Xu7f988ea2007-07-30 16:35:46 -07001192 write_lock_bh(&dev_base_lock);
Eric Dumazet372b2312011-05-17 13:56:59 -04001193 hlist_del_rcu(&dev->name_hlist);
Eric Dumazet72c95282009-10-30 07:11:27 +00001194 write_unlock_bh(&dev_base_lock);
1195
1196 synchronize_rcu();
1197
1198 write_lock_bh(&dev_base_lock);
1199 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
Herbert Xu7f988ea2007-07-30 16:35:46 -07001200 write_unlock_bh(&dev_base_lock);
1201
Pavel Emelyanov056925a2007-09-16 15:42:43 -07001202 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001203 ret = notifier_to_errno(ret);
1204
1205 if (ret) {
Eric Dumazet91e9c07b2009-11-15 23:30:24 +00001206 /* err >= 0 after dev_alloc_name() or stores the first errno */
1207 if (err >= 0) {
Herbert Xufcc5a032007-07-30 17:03:38 -07001208 err = ret;
Eric Dumazet30e6c9f2012-12-20 17:25:08 +00001209 write_seqcount_begin(&devnet_rename_seq);
Herbert Xufcc5a032007-07-30 17:03:38 -07001210 memcpy(dev->name, oldname, IFNAMSIZ);
Veaceslav Falico5bb025f2014-01-14 21:58:51 +01001211 memcpy(oldname, newname, IFNAMSIZ);
Tom Gundersen238fa362014-07-14 16:37:23 +02001212 dev->name_assign_type = old_assign_type;
1213 old_assign_type = NET_NAME_RENAMED;
Herbert Xufcc5a032007-07-30 17:03:38 -07001214 goto rollback;
Eric Dumazet91e9c07b2009-11-15 23:30:24 +00001215 } else {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001216 pr_err("%s: name change rollback failed: %d\n",
Eric Dumazet91e9c07b2009-11-15 23:30:24 +00001217 dev->name, ret);
Herbert Xufcc5a032007-07-30 17:03:38 -07001218 }
1219 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001220
1221 return err;
1222}
1223
1224/**
Stephen Hemminger0b815a12008-09-22 21:28:11 -07001225 * dev_set_alias - change ifalias of a device
1226 * @dev: device
1227 * @alias: name up to IFALIASZ
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07001228 * @len: limit of bytes to copy from info
Stephen Hemminger0b815a12008-09-22 21:28:11 -07001229 *
1230 * Set ifalias for a device,
1231 */
1232int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1233{
Alexey Khoroshilov7364e442012-08-08 00:33:25 +00001234 char *new_ifalias;
1235
Stephen Hemminger0b815a12008-09-22 21:28:11 -07001236 ASSERT_RTNL();
1237
1238 if (len >= IFALIASZ)
1239 return -EINVAL;
1240
Oliver Hartkopp96ca4a22008-09-23 21:23:19 -07001241 if (!len) {
Sachin Kamat388dfc22012-11-20 00:57:04 +00001242 kfree(dev->ifalias);
1243 dev->ifalias = NULL;
Oliver Hartkopp96ca4a22008-09-23 21:23:19 -07001244 return 0;
1245 }
1246
Alexey Khoroshilov7364e442012-08-08 00:33:25 +00001247 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1248 if (!new_ifalias)
Stephen Hemminger0b815a12008-09-22 21:28:11 -07001249 return -ENOMEM;
Alexey Khoroshilov7364e442012-08-08 00:33:25 +00001250 dev->ifalias = new_ifalias;
Stephen Hemminger0b815a12008-09-22 21:28:11 -07001251
1252 strlcpy(dev->ifalias, alias, len+1);
1253 return len;
1254}
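
/* Illustrative sketch (not part of this file): setting an alias on a device
 * from code that already holds the RTNL lock. The device pointer and alias
 * string below are hypothetical; on success the return value is the number
 * of bytes copied.
 *
 *	ASSERT_RTNL();
 *	err = dev_set_alias(dev, "uplink-to-core", strlen("uplink-to-core"));
 *	if (err < 0)
 *		pr_err("%s: setting ifalias failed: %d\n", dev->name, err);
 */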
1255
1256
1257/**
Stephen Hemminger3041a062006-05-26 13:25:24 -07001258 * netdev_features_change - device changes features
Stephen Hemmingerd8a33ac2005-05-29 14:13:47 -07001259 * @dev: device to cause notification
1260 *
1261 * Called to indicate a device has changed features.
1262 */
1263void netdev_features_change(struct net_device *dev)
1264{
Pavel Emelyanov056925a2007-09-16 15:42:43 -07001265 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
Stephen Hemmingerd8a33ac2005-05-29 14:13:47 -07001266}
1267EXPORT_SYMBOL(netdev_features_change);
1268
1269/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001270 * netdev_state_change - device changes state
1271 * @dev: device to cause notification
1272 *
1273 * Called to indicate a device has changed state. This function calls
1274 * the notifier chains for netdev_chain and sends a NEWLINK message
1275 * to the routing socket.
1276 */
1277void netdev_state_change(struct net_device *dev)
1278{
1279 if (dev->flags & IFF_UP) {
Loic Prylli5495119462014-07-01 21:39:43 -07001280 struct netdev_notifier_change_info change_info;
1281
1282 change_info.flags_changed = 0;
1283 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1284 &change_info.info);
Alexei Starovoitov7f294052013-10-23 16:02:42 -07001285 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001286 }
1287}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001288EXPORT_SYMBOL(netdev_state_change);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001289
Amerigo Wangee89bab2012-08-09 22:14:56 +00001290/**
1291 * netdev_notify_peers - notify network peers about existence of @dev
1292 * @dev: network device
1293 *
1294 * Generate traffic such that interested network peers are aware of
1295 * @dev, such as by generating a gratuitous ARP. This may be used when
1296 * a device wants to inform the rest of the network about some sort of
1297 * reconfiguration such as a failover event or virtual machine
1298 * migration.
1299 */
1300void netdev_notify_peers(struct net_device *dev)
Or Gerlitzc1da4ac2008-06-13 18:12:00 -07001301{
Amerigo Wangee89bab2012-08-09 22:14:56 +00001302 rtnl_lock();
1303 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1304 rtnl_unlock();
Or Gerlitzc1da4ac2008-06-13 18:12:00 -07001305}
Amerigo Wangee89bab2012-08-09 22:14:56 +00001306EXPORT_SYMBOL(netdev_notify_peers);
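
/* Illustrative sketch (not part of this file): a bonding or virtualization
 * driver announcing the active device after a failover so peers refresh
 * their ARP/FDB entries. The failover handler is hypothetical; note that
 * netdev_notify_peers() takes the RTNL lock itself, so it must not be
 * called with RTNL already held.
 *
 *	static void my_failover_complete(struct net_device *active_dev)
 *	{
 *		netdev_notify_peers(active_dev);
 *	}
 */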
Or Gerlitzc1da4ac2008-06-13 18:12:00 -07001307
Patrick McHardybd380812010-02-26 06:34:53 +00001308static int __dev_open(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001309{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001310 const struct net_device_ops *ops = dev->netdev_ops;
Johannes Berg3b8bcfd2009-05-30 01:39:53 +02001311 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001312
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001313 ASSERT_RTNL();
1314
Linus Torvalds1da177e2005-04-16 15:20:36 -07001315 if (!netif_device_present(dev))
1316 return -ENODEV;
1317
Neil Hormanca99ca12013-02-05 08:05:43 +00001318 /* Block netpoll from trying to do any rx path servicing.
1319	 * If we don't do this there is a chance that ndo_poll_controller
1320	 * or ndo_poll may be running while we open the device.
1321 */
Eric W. Biederman66b55522014-03-27 15:39:03 -07001322 netpoll_poll_disable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001323
Johannes Berg3b8bcfd2009-05-30 01:39:53 +02001324 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1325 ret = notifier_to_errno(ret);
1326 if (ret)
1327 return ret;
1328
Linus Torvalds1da177e2005-04-16 15:20:36 -07001329 set_bit(__LINK_STATE_START, &dev->state);
Jeff Garzikbada3392007-10-23 20:19:37 -07001330
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001331 if (ops->ndo_validate_addr)
1332 ret = ops->ndo_validate_addr(dev);
Jeff Garzikbada3392007-10-23 20:19:37 -07001333
Stephen Hemmingerd3147742008-11-19 21:32:24 -08001334 if (!ret && ops->ndo_open)
1335 ret = ops->ndo_open(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001336
Eric W. Biederman66b55522014-03-27 15:39:03 -07001337 netpoll_poll_enable(dev);
Neil Hormanca99ca12013-02-05 08:05:43 +00001338
Jeff Garzikbada3392007-10-23 20:19:37 -07001339 if (ret)
1340 clear_bit(__LINK_STATE_START, &dev->state);
1341 else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001342 dev->flags |= IFF_UP;
Patrick McHardy4417da62007-06-27 01:28:10 -07001343 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001344 dev_activate(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04001345 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001346 }
Jeff Garzikbada3392007-10-23 20:19:37 -07001347
Linus Torvalds1da177e2005-04-16 15:20:36 -07001348 return ret;
1349}
Patrick McHardybd380812010-02-26 06:34:53 +00001350
1351/**
1352 * dev_open - prepare an interface for use.
1353 * @dev: device to open
1354 *
1355 * Takes a device from down to up state. The device's private open
1356 * function is invoked and then the multicast lists are loaded. Finally
1357 * the device is moved into the up state and a %NETDEV_UP message is
1358 * sent to the netdev notifier chain.
1359 *
1360 * Calling this function on an active interface is a nop. On a failure
1361 * a negative errno code is returned.
1362 */
1363int dev_open(struct net_device *dev)
1364{
1365 int ret;
1366
Patrick McHardybd380812010-02-26 06:34:53 +00001367 if (dev->flags & IFF_UP)
1368 return 0;
1369
Patrick McHardybd380812010-02-26 06:34:53 +00001370 ret = __dev_open(dev);
1371 if (ret < 0)
1372 return ret;
1373
Alexei Starovoitov7f294052013-10-23 16:02:42 -07001374 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
Patrick McHardybd380812010-02-26 06:34:53 +00001375 call_netdevice_notifiers(NETDEV_UP, dev);
1376
1377 return ret;
1378}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001379EXPORT_SYMBOL(dev_open);
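
/* Illustrative sketch (not part of this file): bringing an interface up
 * from kernel code. dev_open() requires the RTNL lock; the error handling
 * shown is a hypothetical caller's choice.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 *	if (err < 0)
 *		pr_warn("%s: failed to open: %d\n", dev->name, err);
 */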
Linus Torvalds1da177e2005-04-16 15:20:36 -07001380
Octavian Purdila44345722010-12-13 12:44:07 +00001381static int __dev_close_many(struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001382{
Octavian Purdila44345722010-12-13 12:44:07 +00001383 struct net_device *dev;
Patrick McHardybd380812010-02-26 06:34:53 +00001384
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001385 ASSERT_RTNL();
David S. Miller9d5010d2007-09-12 14:33:25 +02001386 might_sleep();
1387
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001388 list_for_each_entry(dev, head, close_list) {
Eric W. Biederman3f4df202014-03-27 15:38:17 -07001389 /* Temporarily disable netpoll until the interface is down */
Eric W. Biederman66b55522014-03-27 15:39:03 -07001390 netpoll_poll_disable(dev);
Eric W. Biederman3f4df202014-03-27 15:38:17 -07001391
Octavian Purdila44345722010-12-13 12:44:07 +00001392 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001393
Octavian Purdila44345722010-12-13 12:44:07 +00001394 clear_bit(__LINK_STATE_START, &dev->state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001395
Octavian Purdila44345722010-12-13 12:44:07 +00001396		/* Synchronize to the scheduled poll. We cannot touch the poll
1397		 * list; it may even be on a different CPU. So just clear netif_running().
1398 *
1399		 * dev->stop() will invoke napi_disable() on all of its
1400 * napi_struct instances on this device.
1401 */
Peter Zijlstra4e857c52014-03-17 18:06:10 +01001402 smp_mb__after_atomic(); /* Commit netif_running(). */
Octavian Purdila44345722010-12-13 12:44:07 +00001403 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001404
Octavian Purdila44345722010-12-13 12:44:07 +00001405 dev_deactivate_many(head);
1406
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001407 list_for_each_entry(dev, head, close_list) {
Octavian Purdila44345722010-12-13 12:44:07 +00001408 const struct net_device_ops *ops = dev->netdev_ops;
1409
1410 /*
1411		 *	Call the device specific close. This cannot fail and is
1412		 *	only called if the device is UP.
1413 *
1414 * We allow it to be called even after a DETACH hot-plug
1415 * event.
1416 */
1417 if (ops->ndo_stop)
1418 ops->ndo_stop(dev);
1419
Octavian Purdila44345722010-12-13 12:44:07 +00001420 dev->flags &= ~IFF_UP;
Eric W. Biederman66b55522014-03-27 15:39:03 -07001421 netpoll_poll_enable(dev);
Octavian Purdila44345722010-12-13 12:44:07 +00001422 }
1423
1424 return 0;
1425}
1426
1427static int __dev_close(struct net_device *dev)
1428{
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001429 int retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001430 LIST_HEAD(single);
1431
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001432 list_add(&dev->close_list, &single);
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001433 retval = __dev_close_many(&single);
1434 list_del(&single);
Neil Hormanca99ca12013-02-05 08:05:43 +00001435
Linus Torvaldsf87e6f42011-02-17 22:54:38 +00001436 return retval;
Octavian Purdila44345722010-12-13 12:44:07 +00001437}
1438
David S. Miller99c4a262015-03-18 22:52:33 -04001439int dev_close_many(struct list_head *head, bool unlink)
Octavian Purdila44345722010-12-13 12:44:07 +00001440{
1441 struct net_device *dev, *tmp;
Octavian Purdila44345722010-12-13 12:44:07 +00001442
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001443 /* Remove the devices that don't need to be closed */
1444 list_for_each_entry_safe(dev, tmp, head, close_list)
Octavian Purdila44345722010-12-13 12:44:07 +00001445 if (!(dev->flags & IFF_UP))
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001446 list_del_init(&dev->close_list);
Octavian Purdila44345722010-12-13 12:44:07 +00001447
1448 __dev_close_many(head);
Matti Linnanvuorid8b2a4d2008-02-12 23:10:11 -08001449
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001450 list_for_each_entry_safe(dev, tmp, head, close_list) {
Alexei Starovoitov7f294052013-10-23 16:02:42 -07001451 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
Octavian Purdila44345722010-12-13 12:44:07 +00001452 call_netdevice_notifiers(NETDEV_DOWN, dev);
David S. Miller99c4a262015-03-18 22:52:33 -04001453 if (unlink)
1454 list_del_init(&dev->close_list);
Octavian Purdila44345722010-12-13 12:44:07 +00001455 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001456
Linus Torvalds1da177e2005-04-16 15:20:36 -07001457 return 0;
1458}
David S. Miller99c4a262015-03-18 22:52:33 -04001459EXPORT_SYMBOL(dev_close_many);
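
/* Illustrative sketch (not part of this file): closing a group of devices
 * in one batch via their close_list, much as an unregister path would.
 * The list-building loop and the devices involved are hypothetical; RTNL
 * must be held.
 *
 *	LIST_HEAD(close_head);
 *
 *	list_for_each_entry(dev, &my_dev_list, some_list_member)
 *		list_add_tail(&dev->close_list, &close_head);
 *	dev_close_many(&close_head, true);
 */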
Patrick McHardybd380812010-02-26 06:34:53 +00001460
1461/**
1462 * dev_close - shutdown an interface.
1463 * @dev: device to shutdown
1464 *
1465 * This function moves an active device into down state. A
1466 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1467 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1468 * chain.
1469 */
1470int dev_close(struct net_device *dev)
1471{
Eric Dumazete14a5992011-05-10 12:26:06 -07001472 if (dev->flags & IFF_UP) {
1473 LIST_HEAD(single);
Patrick McHardybd380812010-02-26 06:34:53 +00001474
Eric W. Biederman5cde2822013-10-05 19:26:05 -07001475 list_add(&dev->close_list, &single);
David S. Miller99c4a262015-03-18 22:52:33 -04001476 dev_close_many(&single, true);
Eric Dumazete14a5992011-05-10 12:26:06 -07001477 list_del(&single);
1478 }
dingtianhongda6e3782013-05-27 19:53:31 +00001479 return 0;
Patrick McHardybd380812010-02-26 06:34:53 +00001480}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001481EXPORT_SYMBOL(dev_close);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001482
1483
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001484/**
1485 * dev_disable_lro - disable Large Receive Offload on a device
1486 * @dev: device
1487 *
1488 * Disable Large Receive Offload (LRO) on a net device. Must be
1489 * called under RTNL. This is needed if received packets may be
1490 * forwarded to another interface.
1491 */
1492void dev_disable_lro(struct net_device *dev)
1493{
Michal Kubečekfbe168b2014-11-13 07:54:50 +01001494 struct net_device *lower_dev;
1495 struct list_head *iter;
Michal Kubeček529d0482013-11-15 06:18:50 +01001496
Michał Mirosławbc5787c62011-11-15 15:29:55 +00001497 dev->wanted_features &= ~NETIF_F_LRO;
1498 netdev_update_features(dev);
Michał Mirosław27660512011-03-18 16:56:34 +00001499
Michał Mirosław22d59692011-04-21 12:42:15 +00001500 if (unlikely(dev->features & NETIF_F_LRO))
1501 netdev_WARN(dev, "failed to disable LRO!\n");
Michal Kubečekfbe168b2014-11-13 07:54:50 +01001502
1503 netdev_for_each_lower_dev(dev, lower_dev, iter)
1504 dev_disable_lro(lower_dev);
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001505}
1506EXPORT_SYMBOL(dev_disable_lro);
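
/* Illustrative sketch (not part of this file): disabling LRO before a
 * device starts forwarding, e.g. when it is enslaved or when forwarding is
 * enabled on it. Must run under RTNL; the trigger condition shown is
 * hypothetical.
 *
 *	ASSERT_RTNL();
 *	if (will_forward)
 *		dev_disable_lro(dev);
 */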
1507
Jiri Pirko351638e2013-05-28 01:30:21 +00001508static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1509 struct net_device *dev)
1510{
1511 struct netdev_notifier_info info;
1512
1513 netdev_notifier_info_init(&info, dev);
1514 return nb->notifier_call(nb, val, &info);
1515}
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001516
Eric W. Biederman881d9662007-09-17 11:56:21 -07001517static int dev_boot_phase = 1;
1518
Linus Torvalds1da177e2005-04-16 15:20:36 -07001519/**
1520 * register_netdevice_notifier - register a network notifier block
1521 * @nb: notifier
1522 *
1523 * Register a notifier to be called when network device events occur.
1524 * The notifier passed is linked into the kernel structures and must
1525 * not be reused until it has been unregistered. A negative errno code
1526 * is returned on a failure.
1527 *
1528 * When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001529 * to the new notifier so that it gets a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001530 * view of the network device list.
1531 */
1532
1533int register_netdevice_notifier(struct notifier_block *nb)
1534{
1535 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001536 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001537 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001538 int err;
1539
1540 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001541 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001542 if (err)
1543 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001544 if (dev_boot_phase)
1545 goto unlock;
1546 for_each_net(net) {
1547 for_each_netdev(net, dev) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001548 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001549 err = notifier_to_errno(err);
1550 if (err)
1551 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001552
Eric W. Biederman881d9662007-09-17 11:56:21 -07001553 if (!(dev->flags & IFF_UP))
1554 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001555
Jiri Pirko351638e2013-05-28 01:30:21 +00001556 call_netdevice_notifier(nb, NETDEV_UP, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001557 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001558 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001559
1560unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001561 rtnl_unlock();
1562 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001563
1564rollback:
1565 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001566 for_each_net(net) {
1567 for_each_netdev(net, dev) {
1568 if (dev == last)
RongQing.Li8f891482011-11-30 23:43:07 -05001569 goto outroll;
Herbert Xufcc5a032007-07-30 17:03:38 -07001570
Eric W. Biederman881d9662007-09-17 11:56:21 -07001571 if (dev->flags & IFF_UP) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001572 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1573 dev);
1574 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
Eric W. Biederman881d9662007-09-17 11:56:21 -07001575 }
Jiri Pirko351638e2013-05-28 01:30:21 +00001576 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001577 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001578 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001579
RongQing.Li8f891482011-11-30 23:43:07 -05001580outroll:
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001581 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001582 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001583}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001584EXPORT_SYMBOL(register_netdevice_notifier);
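
/* Illustrative sketch (not part of this file): a module registering a
 * netdevice notifier. Thanks to the replay described above, the callback
 * also sees NETDEV_REGISTER/NETDEV_UP for devices that already exist.
 * The callback body is hypothetical.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			pr_info("%s is up\n", dev->name);
 *			break;
 *		case NETDEV_GOING_DOWN:
 *			pr_info("%s is going down\n", dev->name);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);	in module init
 *	unregister_netdevice_notifier(&my_nb);	in module exit
 */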
Linus Torvalds1da177e2005-04-16 15:20:36 -07001585
1586/**
1587 * unregister_netdevice_notifier - unregister a network notifier block
1588 * @nb: notifier
1589 *
1590 * Unregister a notifier previously registered by
1591 *	register_netdevice_notifier(). The notifier is unlinked from the
1592 * kernel structures and may then be reused. A negative errno code
1593 * is returned on a failure.
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001594 *
1595 *	After unregistering, unregister and down device events are synthesized
1596 *	and delivered to the removed notifier for all devices on the device
1597 *	list, removing the need for special-case cleanup code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598 */
1599
1600int unregister_netdevice_notifier(struct notifier_block *nb)
1601{
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001602 struct net_device *dev;
1603 struct net *net;
Herbert Xu9f514952006-03-25 01:24:25 -08001604 int err;
1605
1606 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001607 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001608 if (err)
1609 goto unlock;
1610
1611 for_each_net(net) {
1612 for_each_netdev(net, dev) {
1613 if (dev->flags & IFF_UP) {
Jiri Pirko351638e2013-05-28 01:30:21 +00001614 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1615 dev);
1616 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001617 }
Jiri Pirko351638e2013-05-28 01:30:21 +00001618 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
Eric W. Biederman7d3d43d2012-04-06 15:33:35 +00001619 }
1620 }
1621unlock:
Herbert Xu9f514952006-03-25 01:24:25 -08001622 rtnl_unlock();
1623 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001624}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001625EXPORT_SYMBOL(unregister_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001626
1627/**
Jiri Pirko351638e2013-05-28 01:30:21 +00001628 * call_netdevice_notifiers_info - call all network notifier blocks
1629 * @val: value passed unmodified to notifier function
1630 * @dev: net_device pointer passed unmodified to notifier function
1631 * @info: notifier information data
1632 *
1633 * Call all network notifier blocks. Parameters and return value
1634 * are as for raw_notifier_call_chain().
1635 */
1636
stephen hemminger1d143d92013-12-29 14:01:29 -08001637static int call_netdevice_notifiers_info(unsigned long val,
1638 struct net_device *dev,
1639 struct netdev_notifier_info *info)
Jiri Pirko351638e2013-05-28 01:30:21 +00001640{
1641 ASSERT_RTNL();
1642 netdev_notifier_info_init(info, dev);
1643 return raw_notifier_call_chain(&netdev_chain, val, info);
1644}
Jiri Pirko351638e2013-05-28 01:30:21 +00001645
1646/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001647 * call_netdevice_notifiers - call all network notifier blocks
1648 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001649 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001650 *
1651 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001652 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001653 */
1654
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001655int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001656{
Jiri Pirko351638e2013-05-28 01:30:21 +00001657 struct netdev_notifier_info info;
1658
1659 return call_netdevice_notifiers_info(val, dev, &info);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001660}
stephen hemmingeredf947f2011-03-24 13:24:01 +00001661EXPORT_SYMBOL(call_netdevice_notifiers);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001662
Pablo Neira1cf519002015-05-13 18:19:37 +02001663#ifdef CONFIG_NET_INGRESS
Daniel Borkmann45771392015-04-10 23:07:54 +02001664static struct static_key ingress_needed __read_mostly;
1665
1666void net_inc_ingress_queue(void)
1667{
1668 static_key_slow_inc(&ingress_needed);
1669}
1670EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1671
1672void net_dec_ingress_queue(void)
1673{
1674 static_key_slow_dec(&ingress_needed);
1675}
1676EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1677#endif
1678
Ingo Molnarc5905af2012-02-24 08:31:31 +01001679static struct static_key netstamp_needed __read_mostly;
Eric Dumazetb90e5792011-11-28 11:16:50 +00001680#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01001681/* We are not allowed to call static_key_slow_dec() from irq context.
Eric Dumazetb90e5792011-11-28 11:16:50 +00001682 * If net_disable_timestamp() is called from irq context, defer the
Ingo Molnarc5905af2012-02-24 08:31:31 +01001683 * static_key_slow_dec() calls.
Eric Dumazetb90e5792011-11-28 11:16:50 +00001684 */
1685static atomic_t netstamp_needed_deferred;
1686#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001687
1688void net_enable_timestamp(void)
1689{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001690#ifdef HAVE_JUMP_LABEL
1691 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1692
1693 if (deferred) {
1694 while (--deferred)
Ingo Molnarc5905af2012-02-24 08:31:31 +01001695 static_key_slow_dec(&netstamp_needed);
Eric Dumazetb90e5792011-11-28 11:16:50 +00001696 return;
1697 }
1698#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001699 static_key_slow_inc(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001700}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001701EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001702
1703void net_disable_timestamp(void)
1704{
Eric Dumazetb90e5792011-11-28 11:16:50 +00001705#ifdef HAVE_JUMP_LABEL
1706 if (in_interrupt()) {
1707 atomic_inc(&netstamp_needed_deferred);
1708 return;
1709 }
1710#endif
Ingo Molnarc5905af2012-02-24 08:31:31 +01001711 static_key_slow_dec(&netstamp_needed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001712}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001713EXPORT_SYMBOL(net_disable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001714
Eric Dumazet3b098e22010-05-15 23:57:10 -07001715static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001716{
Eric Dumazet588f0332011-11-15 04:12:55 +00001717 skb->tstamp.tv64 = 0;
Ingo Molnarc5905af2012-02-24 08:31:31 +01001718 if (static_key_false(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001719 __net_timestamp(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001720}
1721
Eric Dumazet588f0332011-11-15 04:12:55 +00001722#define net_timestamp_check(COND, SKB) \
Ingo Molnarc5905af2012-02-24 08:31:31 +01001723 if (static_key_false(&netstamp_needed)) { \
Eric Dumazet588f0332011-11-15 04:12:55 +00001724 if ((COND) && !(SKB)->tstamp.tv64) \
1725 __net_timestamp(SKB); \
1726 } \
Eric Dumazet3b098e22010-05-15 23:57:10 -07001727
Vlad Yasevich1ee481f2014-03-27 17:32:29 -04001728bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001729{
1730 unsigned int len;
1731
1732 if (!(dev->flags & IFF_UP))
1733 return false;
1734
1735 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1736 if (skb->len <= len)
1737 return true;
1738
1739 /* if TSO is enabled, we don't care about the length as the packet
1740	 * could be forwarded without being segmented first
1741 */
1742 if (skb_is_gso(skb))
1743 return true;
1744
1745 return false;
1746}
Vlad Yasevich1ee481f2014-03-27 17:32:29 -04001747EXPORT_SYMBOL_GPL(is_skb_forwardable);
Daniel Lezcano79b569f2011-03-30 02:42:17 -07001748
Herbert Xua0265d22014-04-17 13:45:03 +08001749int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1750{
Willem de Bruijnbbbf2df2015-06-08 11:53:08 -04001751 if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1752 unlikely(!is_skb_forwardable(dev, skb))) {
Herbert Xua0265d22014-04-17 13:45:03 +08001753 atomic_long_inc(&dev->rx_dropped);
1754 kfree_skb(skb);
1755 return NET_RX_DROP;
1756 }
1757
1758 skb_scrub_packet(skb, true);
WANG Cong08b4b8e2015-03-20 14:29:09 -07001759 skb->priority = 0;
Herbert Xua0265d22014-04-17 13:45:03 +08001760 skb->protocol = eth_type_trans(skb, dev);
Jay Vosburgh2c26d342014-12-19 15:32:00 -08001761 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
Herbert Xua0265d22014-04-17 13:45:03 +08001762
1763 return 0;
1764}
1765EXPORT_SYMBOL_GPL(__dev_forward_skb);
1766
Arnd Bergmann44540962009-11-26 06:07:08 +00001767/**
1768 * dev_forward_skb - loopback an skb to another netif
1769 *
1770 * @dev: destination network device
1771 * @skb: buffer to forward
1772 *
1773 * return values:
1774 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001775 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001776 *
1777 * dev_forward_skb can be used for injecting an skb from the
1778 * start_xmit function of one device into the receive queue
1779 * of another device.
1780 *
1781 * The receiving device may be in another namespace, so
1782 * we have to clear all information in the skb that could
1783 * impact namespace isolation.
1784 */
1785int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1786{
Herbert Xua0265d22014-04-17 13:45:03 +08001787 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001788}
1789EXPORT_SYMBOL_GPL(dev_forward_skb);
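
/* Illustrative sketch (not part of this file): how a veth-like virtual
 * driver might hand a transmitted skb to its peer device from
 * ndo_start_xmit(). The peer lookup helper is hypothetical.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		if (likely(dev_forward_skb(peer, skb) == NET_RX_SUCCESS))
 *			dev->stats.tx_packets++;
 *		else
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */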
1790
Changli Gao71d9dec2010-12-15 19:57:25 +00001791static inline int deliver_skb(struct sk_buff *skb,
1792 struct packet_type *pt_prev,
1793 struct net_device *orig_dev)
1794{
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00001795 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1796 return -ENOMEM;
Changli Gao71d9dec2010-12-15 19:57:25 +00001797 atomic_inc(&skb->users);
1798 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1799}
1800
Salam Noureddine7866a622015-01-27 11:35:48 -08001801static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1802 struct packet_type **pt,
Jiri Pirkofbcb2172015-03-30 16:56:01 +02001803 struct net_device *orig_dev,
1804 __be16 type,
Salam Noureddine7866a622015-01-27 11:35:48 -08001805 struct list_head *ptype_list)
1806{
1807 struct packet_type *ptype, *pt_prev = *pt;
1808
1809 list_for_each_entry_rcu(ptype, ptype_list, list) {
1810 if (ptype->type != type)
1811 continue;
1812 if (pt_prev)
Jiri Pirkofbcb2172015-03-30 16:56:01 +02001813 deliver_skb(skb, pt_prev, orig_dev);
Salam Noureddine7866a622015-01-27 11:35:48 -08001814 pt_prev = ptype;
1815 }
1816 *pt = pt_prev;
1817}
1818
Eric Leblondc0de08d2012-08-16 22:02:58 +00001819static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1820{
Eric Leblonda3d744e2012-11-06 02:10:10 +00001821 if (!ptype->af_packet_priv || !skb->sk)
Eric Leblondc0de08d2012-08-16 22:02:58 +00001822 return false;
1823
1824 if (ptype->id_match)
1825 return ptype->id_match(ptype, skb->sk);
1826 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1827 return true;
1828
1829 return false;
1830}
1831
Linus Torvalds1da177e2005-04-16 15:20:36 -07001832/*
1833 * Support routine. Sends outgoing frames to any network
1834 * taps currently in use.
1835 */
1836
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001837static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001838{
1839 struct packet_type *ptype;
Changli Gao71d9dec2010-12-15 19:57:25 +00001840 struct sk_buff *skb2 = NULL;
1841 struct packet_type *pt_prev = NULL;
Salam Noureddine7866a622015-01-27 11:35:48 -08001842 struct list_head *ptype_list = &ptype_all;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001843
Linus Torvalds1da177e2005-04-16 15:20:36 -07001844 rcu_read_lock();
Salam Noureddine7866a622015-01-27 11:35:48 -08001845again:
1846 list_for_each_entry_rcu(ptype, ptype_list, list) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001847 /* Never send packets back to the socket
1848 * they originated from - MvS (miquels@drinkel.ow.org)
1849 */
Salam Noureddine7866a622015-01-27 11:35:48 -08001850 if (skb_loop_sk(ptype, skb))
1851 continue;
Changli Gao71d9dec2010-12-15 19:57:25 +00001852
Salam Noureddine7866a622015-01-27 11:35:48 -08001853 if (pt_prev) {
1854 deliver_skb(skb2, pt_prev, skb->dev);
Changli Gao71d9dec2010-12-15 19:57:25 +00001855 pt_prev = ptype;
Salam Noureddine7866a622015-01-27 11:35:48 -08001856 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001857 }
Salam Noureddine7866a622015-01-27 11:35:48 -08001858
1859 /* need to clone skb, done only once */
1860 skb2 = skb_clone(skb, GFP_ATOMIC);
1861 if (!skb2)
1862 goto out_unlock;
1863
1864 net_timestamp_set(skb2);
1865
1866		/* skb->nh should already be set correctly
1867		 * by the sender, so the check below is
1868		 * just protection against buggy protocols.
1869 */
1870 skb_reset_mac_header(skb2);
1871
1872 if (skb_network_header(skb2) < skb2->data ||
1873 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1874 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1875 ntohs(skb2->protocol),
1876 dev->name);
1877 skb_reset_network_header(skb2);
1878 }
1879
1880 skb2->transport_header = skb2->network_header;
1881 skb2->pkt_type = PACKET_OUTGOING;
1882 pt_prev = ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001883 }
Salam Noureddine7866a622015-01-27 11:35:48 -08001884
1885 if (ptype_list == &ptype_all) {
1886 ptype_list = &dev->ptype_all;
1887 goto again;
1888 }
1889out_unlock:
Changli Gao71d9dec2010-12-15 19:57:25 +00001890 if (pt_prev)
1891 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001892 rcu_read_unlock();
1893}
1894
Ben Hutchings2c530402012-07-10 10:55:09 +00001895/**
1896 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
John Fastabend4f57c082011-01-17 08:06:04 +00001897 * @dev: Network device
1898 * @txq: number of queues available
1899 *
1900 *	If real_num_tx_queues is changed the tc mappings may no longer be
1901 *	valid. To resolve this verify the tc mapping remains valid and if
1902 *	not, null the mapping. With no priorities mapping to this
1903 *	offset/count pair it will no longer be used. In the worst case, if
1904 *	TC0 is invalid nothing can be done, so priority mappings are disabled
1905 *	entirely. It is expected that drivers will fix this mapping if they
1906 *	can before calling netif_set_real_num_tx_queues.
1907 */
Eric Dumazetbb134d22011-01-20 19:18:08 +00001908static void netif_setup_tc(struct net_device *dev, unsigned int txq)
John Fastabend4f57c082011-01-17 08:06:04 +00001909{
1910 int i;
1911 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1912
1913 /* If TC0 is invalidated disable TC mapping */
1914 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001915 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
John Fastabend4f57c082011-01-17 08:06:04 +00001916 dev->num_tc = 0;
1917 return;
1918 }
1919
1920 /* Invalidated prio to tc mappings set to TC0 */
1921 for (i = 1; i < TC_BITMASK + 1; i++) {
1922 int q = netdev_get_prio_tc_map(dev, i);
1923
1924 tc = &dev->tc_to_txq[q];
1925 if (tc->offset + tc->count > txq) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00001926 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1927 i, q);
John Fastabend4f57c082011-01-17 08:06:04 +00001928 netdev_set_prio_tc_map(dev, i, 0);
1929 }
1930 }
1931}
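
/* Illustrative sketch (not part of this file): how a driver might set up
 * the tc-to-txq mappings that the helper above validates. The queue layout
 * (two traffic classes of four queues each) is hypothetical.
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);	TC0: queues 0-3
 *	netdev_set_tc_queue(dev, 1, 4, 4);	TC1: queues 4-7
 *	netif_set_real_num_tx_queues(dev, 8);
 */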
1932
Alexander Duyck537c00d2013-01-10 08:57:02 +00001933#ifdef CONFIG_XPS
1934static DEFINE_MUTEX(xps_map_mutex);
1935#define xmap_dereference(P) \
1936 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1937
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001938static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1939 int cpu, u16 index)
1940{
1941 struct xps_map *map = NULL;
1942 int pos;
1943
1944 if (dev_maps)
1945 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1946
1947 for (pos = 0; map && pos < map->len; pos++) {
1948 if (map->queues[pos] == index) {
1949 if (map->len > 1) {
1950 map->queues[pos] = map->queues[--map->len];
1951 } else {
1952 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1953 kfree_rcu(map, rcu);
1954 map = NULL;
1955 }
1956 break;
1957 }
1958 }
1959
1960 return map;
1961}
1962
Alexander Duyck024e9672013-01-10 08:57:46 +00001963static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
Alexander Duyck537c00d2013-01-10 08:57:02 +00001964{
1965 struct xps_dev_maps *dev_maps;
Alexander Duyck024e9672013-01-10 08:57:46 +00001966 int cpu, i;
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001967 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001968
1969 mutex_lock(&xps_map_mutex);
1970 dev_maps = xmap_dereference(dev->xps_maps);
1971
1972 if (!dev_maps)
1973 goto out_no_maps;
1974
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001975 for_each_possible_cpu(cpu) {
Alexander Duyck024e9672013-01-10 08:57:46 +00001976 for (i = index; i < dev->num_tx_queues; i++) {
1977 if (!remove_xps_queue(dev_maps, cpu, i))
1978 break;
1979 }
1980 if (i == dev->num_tx_queues)
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001981 active = true;
Alexander Duyck537c00d2013-01-10 08:57:02 +00001982 }
1983
Alexander Duyck10cdc3f2013-01-10 08:57:17 +00001984 if (!active) {
Alexander Duyck537c00d2013-01-10 08:57:02 +00001985 RCU_INIT_POINTER(dev->xps_maps, NULL);
1986 kfree_rcu(dev_maps, rcu);
1987 }
1988
Alexander Duyck024e9672013-01-10 08:57:46 +00001989 for (i = index; i < dev->num_tx_queues; i++)
1990 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1991 NUMA_NO_NODE);
1992
Alexander Duyck537c00d2013-01-10 08:57:02 +00001993out_no_maps:
1994 mutex_unlock(&xps_map_mutex);
1995}
1996
Alexander Duyck01c5f862013-01-10 08:57:35 +00001997static struct xps_map *expand_xps_map(struct xps_map *map,
1998 int cpu, u16 index)
1999{
2000 struct xps_map *new_map;
2001 int alloc_len = XPS_MIN_MAP_ALLOC;
2002 int i, pos;
2003
2004 for (pos = 0; map && pos < map->len; pos++) {
2005 if (map->queues[pos] != index)
2006 continue;
2007 return map;
2008 }
2009
2010 /* Need to add queue to this CPU's existing map */
2011 if (map) {
2012 if (pos < map->alloc_len)
2013 return map;
2014
2015 alloc_len = map->alloc_len * 2;
2016 }
2017
2018 /* Need to allocate new map to store queue on this CPU's map */
2019 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2020 cpu_to_node(cpu));
2021 if (!new_map)
2022 return NULL;
2023
2024 for (i = 0; i < pos; i++)
2025 new_map->queues[i] = map->queues[i];
2026 new_map->alloc_len = alloc_len;
2027 new_map->len = pos;
2028
2029 return new_map;
2030}
2031
Michael S. Tsirkin35735402013-10-02 09:14:06 +03002032int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2033 u16 index)
Alexander Duyck537c00d2013-01-10 08:57:02 +00002034{
Alexander Duyck01c5f862013-01-10 08:57:35 +00002035 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
Alexander Duyck537c00d2013-01-10 08:57:02 +00002036 struct xps_map *map, *new_map;
Alexander Duyck537c00d2013-01-10 08:57:02 +00002037 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
Alexander Duyck01c5f862013-01-10 08:57:35 +00002038 int cpu, numa_node_id = -2;
2039 bool active = false;
Alexander Duyck537c00d2013-01-10 08:57:02 +00002040
2041 mutex_lock(&xps_map_mutex);
2042
2043 dev_maps = xmap_dereference(dev->xps_maps);
2044
Alexander Duyck01c5f862013-01-10 08:57:35 +00002045 /* allocate memory for queue storage */
2046 for_each_online_cpu(cpu) {
2047 if (!cpumask_test_cpu(cpu, mask))
2048 continue;
Alexander Duyck537c00d2013-01-10 08:57:02 +00002049
Alexander Duyck01c5f862013-01-10 08:57:35 +00002050 if (!new_dev_maps)
2051 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00002052 if (!new_dev_maps) {
2053 mutex_unlock(&xps_map_mutex);
Alexander Duyck01c5f862013-01-10 08:57:35 +00002054 return -ENOMEM;
Alexander Duyck2bb60cb2013-02-22 06:38:44 +00002055 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00002056
2057 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2058 NULL;
2059
2060 map = expand_xps_map(map, cpu, index);
2061 if (!map)
2062 goto error;
2063
2064 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2065 }
2066
2067 if (!new_dev_maps)
2068 goto out_no_new_maps;
2069
2070 for_each_possible_cpu(cpu) {
2071 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2072 /* add queue to CPU maps */
2073 int pos = 0;
2074
2075 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2076 while ((pos < map->len) && (map->queues[pos] != index))
2077 pos++;
2078
2079 if (pos == map->len)
2080 map->queues[map->len++] = index;
Alexander Duyck537c00d2013-01-10 08:57:02 +00002081#ifdef CONFIG_NUMA
Alexander Duyck537c00d2013-01-10 08:57:02 +00002082 if (numa_node_id == -2)
2083 numa_node_id = cpu_to_node(cpu);
2084 else if (numa_node_id != cpu_to_node(cpu))
2085 numa_node_id = -1;
Alexander Duyck537c00d2013-01-10 08:57:02 +00002086#endif
Alexander Duyck01c5f862013-01-10 08:57:35 +00002087 } else if (dev_maps) {
2088 /* fill in the new device map from the old device map */
2089 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2090 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
Alexander Duyck537c00d2013-01-10 08:57:02 +00002091 }
Alexander Duyck01c5f862013-01-10 08:57:35 +00002092
Alexander Duyck537c00d2013-01-10 08:57:02 +00002093 }
2094
Alexander Duyck01c5f862013-01-10 08:57:35 +00002095 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2096
Alexander Duyck537c00d2013-01-10 08:57:02 +00002097 /* Cleanup old maps */
Alexander Duyck01c5f862013-01-10 08:57:35 +00002098 if (dev_maps) {
2099 for_each_possible_cpu(cpu) {
2100 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2101 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2102 if (map && map != new_map)
2103 kfree_rcu(map, rcu);
2104 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00002105
Alexander Duyck537c00d2013-01-10 08:57:02 +00002106 kfree_rcu(dev_maps, rcu);
Alexander Duyck01c5f862013-01-10 08:57:35 +00002107 }
Alexander Duyck537c00d2013-01-10 08:57:02 +00002108
Alexander Duyck01c5f862013-01-10 08:57:35 +00002109 dev_maps = new_dev_maps;
2110 active = true;
2111
2112out_no_new_maps:
2113 /* update Tx queue numa node */
Alexander Duyck537c00d2013-01-10 08:57:02 +00002114 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2115 (numa_node_id >= 0) ? numa_node_id :
2116 NUMA_NO_NODE);
2117
Alexander Duyck01c5f862013-01-10 08:57:35 +00002118 if (!dev_maps)
2119 goto out_no_maps;
2120
2121 /* removes queue from unused CPUs */
2122 for_each_possible_cpu(cpu) {
2123 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2124 continue;
2125
2126 if (remove_xps_queue(dev_maps, cpu, index))
2127 active = true;
2128 }
2129
2130 /* free map if not active */
2131 if (!active) {
2132 RCU_INIT_POINTER(dev->xps_maps, NULL);
2133 kfree_rcu(dev_maps, rcu);
2134 }
2135
2136out_no_maps:
Alexander Duyck537c00d2013-01-10 08:57:02 +00002137 mutex_unlock(&xps_map_mutex);
2138
2139 return 0;
2140error:
Alexander Duyck01c5f862013-01-10 08:57:35 +00002141 /* remove any maps that we added */
2142 for_each_possible_cpu(cpu) {
2143 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2144 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2145 NULL;
2146 if (new_map && new_map != map)
2147 kfree(new_map);
2148 }
2149
Alexander Duyck537c00d2013-01-10 08:57:02 +00002150 mutex_unlock(&xps_map_mutex);
2151
Alexander Duyck537c00d2013-01-10 08:57:02 +00002152 kfree(new_dev_maps);
2153 return -ENOMEM;
2154}
2155EXPORT_SYMBOL(netif_set_xps_queue);
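
/* Illustrative sketch (not part of this file): a multiqueue driver pinning
 * each transmit queue to one CPU with XPS, e.g. from its open routine.
 * The one-queue-per-CPU layout is hypothetical.
 *
 *	for (i = 0; i < dev->real_num_tx_queues; i++)
 *		netif_set_xps_queue(dev, cpumask_of(i), i);
 */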
2156
2157#endif
John Fastabendf0796d52010-07-01 13:21:57 +00002158/*
2159 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2160 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2161 */
Tom Herberte6484932010-10-18 18:04:39 +00002162int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
John Fastabendf0796d52010-07-01 13:21:57 +00002163{
Tom Herbert1d24eb42010-11-21 13:17:27 +00002164 int rc;
2165
Tom Herberte6484932010-10-18 18:04:39 +00002166 if (txq < 1 || txq > dev->num_tx_queues)
2167 return -EINVAL;
John Fastabendf0796d52010-07-01 13:21:57 +00002168
Ben Hutchings5c565802011-02-15 19:39:21 +00002169 if (dev->reg_state == NETREG_REGISTERED ||
2170 dev->reg_state == NETREG_UNREGISTERING) {
Tom Herberte6484932010-10-18 18:04:39 +00002171 ASSERT_RTNL();
2172
Tom Herbert1d24eb42010-11-21 13:17:27 +00002173 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2174 txq);
Tom Herbertbf264142010-11-26 08:36:09 +00002175 if (rc)
2176 return rc;
2177
John Fastabend4f57c082011-01-17 08:06:04 +00002178 if (dev->num_tc)
2179 netif_setup_tc(dev, txq);
2180
Alexander Duyck024e9672013-01-10 08:57:46 +00002181 if (txq < dev->real_num_tx_queues) {
Tom Herberte6484932010-10-18 18:04:39 +00002182 qdisc_reset_all_tx_gt(dev, txq);
Alexander Duyck024e9672013-01-10 08:57:46 +00002183#ifdef CONFIG_XPS
2184 netif_reset_xps_queues_gt(dev, txq);
2185#endif
2186 }
John Fastabendf0796d52010-07-01 13:21:57 +00002187 }
Tom Herberte6484932010-10-18 18:04:39 +00002188
2189 dev->real_num_tx_queues = txq;
2190 return 0;
John Fastabendf0796d52010-07-01 13:21:57 +00002191}
2192EXPORT_SYMBOL(netif_set_real_num_tx_queues);
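
/* Illustrative sketch (not part of this file): a driver shrinking or growing
 * its active queue counts at runtime, e.g. after an ethtool channel change.
 * The new count is hypothetical; both calls need RTNL (or must happen before
 * registration).
 *
 *	err = netif_set_real_num_tx_queues(dev, new_count);
 *	if (!err)
 *		err = netif_set_real_num_rx_queues(dev, new_count);
 */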
Denis Vlasenko56079432006-03-29 15:57:29 -08002193
Michael Daltona953be52014-01-16 22:23:28 -08002194#ifdef CONFIG_SYSFS
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002195/**
2196 * netif_set_real_num_rx_queues - set actual number of RX queues used
2197 * @dev: Network device
2198 * @rxq: Actual number of RX queues
2199 *
2200 * This must be called either with the rtnl_lock held or before
2201 * registration of the net device. Returns 0 on success, or a
Ben Hutchings4e7f7952010-10-08 10:33:39 -07002202 * negative error code. If called before registration, it always
2203 * succeeds.
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002204 */
2205int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2206{
2207 int rc;
2208
Tom Herbertbd25fa72010-10-18 18:00:16 +00002209 if (rxq < 1 || rxq > dev->num_rx_queues)
2210 return -EINVAL;
2211
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002212 if (dev->reg_state == NETREG_REGISTERED) {
2213 ASSERT_RTNL();
2214
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002215 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2216 rxq);
2217 if (rc)
2218 return rc;
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002219 }
2220
2221 dev->real_num_rx_queues = rxq;
2222 return 0;
2223}
2224EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2225#endif
2226
Ben Hutchings2c530402012-07-10 10:55:09 +00002227/**
2228 * netif_get_num_default_rss_queues - default number of RSS queues
Yuval Mintz16917b82012-07-01 03:18:50 +00002229 *
2230 * This routine should set an upper limit on the number of RSS queues
2231 * used by default by multiqueue devices.
2232 */
Ben Hutchingsa55b1382012-07-10 10:54:38 +00002233int netif_get_num_default_rss_queues(void)
Yuval Mintz16917b82012-07-01 03:18:50 +00002234{
2235 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2236}
2237EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2238
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002239static inline void __netif_reschedule(struct Qdisc *q)
2240{
2241 struct softnet_data *sd;
2242 unsigned long flags;
2243
2244 local_irq_save(flags);
Christoph Lameter903ceff2014-08-17 12:30:35 -05002245 sd = this_cpu_ptr(&softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00002246 q->next_sched = NULL;
2247 *sd->output_queue_tailp = q;
2248 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002249 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2250 local_irq_restore(flags);
2251}
2252
David S. Miller37437bb2008-07-16 02:15:04 -07002253void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08002254{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002255 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2256 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08002257}
2258EXPORT_SYMBOL(__netif_schedule);
2259
Eric Dumazete6247022013-12-05 04:45:08 -08002260struct dev_kfree_skb_cb {
2261 enum skb_free_reason reason;
2262};
2263
2264static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08002265{
Eric Dumazete6247022013-12-05 04:45:08 -08002266 return (struct dev_kfree_skb_cb *)skb->cb;
Denis Vlasenko56079432006-03-29 15:57:29 -08002267}
Denis Vlasenko56079432006-03-29 15:57:29 -08002268
John Fastabend46e5da42014-09-12 20:04:52 -07002269void netif_schedule_queue(struct netdev_queue *txq)
2270{
2271 rcu_read_lock();
2272 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2273 struct Qdisc *q = rcu_dereference(txq->qdisc);
2274
2275 __netif_schedule(q);
2276 }
2277 rcu_read_unlock();
2278}
2279EXPORT_SYMBOL(netif_schedule_queue);
2280
2281/**
2282 * netif_wake_subqueue - allow sending packets on subqueue
2283 * @dev: network device
2284 * @queue_index: sub queue index
2285 *
2286 * Resume individual transmit queue of a device with multiple transmit queues.
2287 */
2288void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2289{
2290 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2291
2292 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2293 struct Qdisc *q;
2294
2295 rcu_read_lock();
2296 q = rcu_dereference(txq->qdisc);
2297 __netif_schedule(q);
2298 rcu_read_unlock();
2299 }
2300}
2301EXPORT_SYMBOL(netif_wake_subqueue);
2302
2303void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2304{
2305 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2306 struct Qdisc *q;
2307
2308 rcu_read_lock();
2309 q = rcu_dereference(dev_queue->qdisc);
2310 __netif_schedule(q);
2311 rcu_read_unlock();
2312 }
2313}
2314EXPORT_SYMBOL(netif_tx_wake_queue);
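
/* Illustrative sketch (not part of this file): the usual stop/wake pattern
 * in a multiqueue driver. The ring-accounting helpers are hypothetical.
 *
 *	In ndo_start_xmit(), when the ring is (nearly) full:
 *		if (my_ring_full(ring))
 *			netif_stop_subqueue(dev, queue_index);
 *
 *	In the TX completion handler, once descriptors were reclaimed:
 *		if (my_ring_has_room(ring))
 *			netif_wake_subqueue(dev, queue_index);
 */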
2315
Eric Dumazete6247022013-12-05 04:45:08 -08002316void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2317{
2318 unsigned long flags;
2319
2320 if (likely(atomic_read(&skb->users) == 1)) {
2321 smp_rmb();
2322 atomic_set(&skb->users, 0);
2323 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2324 return;
2325 }
2326 get_kfree_skb_cb(skb)->reason = reason;
2327 local_irq_save(flags);
2328 skb->next = __this_cpu_read(softnet_data.completion_queue);
2329 __this_cpu_write(softnet_data.completion_queue, skb);
2330 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2331 local_irq_restore(flags);
2332}
2333EXPORT_SYMBOL(__dev_kfree_skb_irq);
2334
2335void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
Denis Vlasenko56079432006-03-29 15:57:29 -08002336{
2337 if (in_irq() || irqs_disabled())
Eric Dumazete6247022013-12-05 04:45:08 -08002338 __dev_kfree_skb_irq(skb, reason);
Denis Vlasenko56079432006-03-29 15:57:29 -08002339 else
2340 dev_kfree_skb(skb);
2341}
Eric Dumazete6247022013-12-05 04:45:08 -08002342EXPORT_SYMBOL(__dev_kfree_skb_any);
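
/* Illustrative sketch (not part of this file): freeing skbs from a context
 * that may be hardirq, e.g. a TX completion interrupt. dev_kfree_skb_any()
 * and dev_consume_skb_any() wrap the helpers above; the completion loop is
 * hypothetical.
 *
 *	while ((skb = my_reclaim_completed(ring)) != NULL)
 *		dev_consume_skb_any(skb);	sent successfully, not a drop
 */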
Denis Vlasenko56079432006-03-29 15:57:29 -08002343
2344
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002345/**
2346 * netif_device_detach - mark device as removed
2347 * @dev: network device
2348 *
2349 * Mark device as removed from the system and therefore no longer available.
2350 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002351void netif_device_detach(struct net_device *dev)
2352{
2353 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2354 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002355 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002356 }
2357}
2358EXPORT_SYMBOL(netif_device_detach);
2359
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002360/**
2361 * netif_device_attach - mark device as attached
2362 * @dev: network device
2363 *
2364 * Mark device as attached to the system and restart it if needed.
2365 */
Denis Vlasenko56079432006-03-29 15:57:29 -08002366void netif_device_attach(struct net_device *dev)
2367{
2368 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2369 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00002370 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002371 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08002372 }
2373}
2374EXPORT_SYMBOL(netif_device_attach);
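
/* Illustrative sketch (not part of this file): typical use of the
 * detach/attach pair in a driver's suspend and resume callbacks. The
 * hardware quiesce/reinit steps are hypothetical.
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		my_hw_quiesce(dev);
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		my_hw_reinit(dev);
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */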
2375
Jiri Pirko5605c762015-05-12 14:56:12 +02002376/*
2377 * Returns a Tx hash based on the given packet descriptor and the number of
2378 * Tx queues to be used as a distribution range.
2379 */
2380u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2381 unsigned int num_tx_queues)
2382{
2383 u32 hash;
2384 u16 qoffset = 0;
2385 u16 qcount = num_tx_queues;
2386
2387 if (skb_rx_queue_recorded(skb)) {
2388 hash = skb_get_rx_queue(skb);
2389 while (unlikely(hash >= num_tx_queues))
2390 hash -= num_tx_queues;
2391 return hash;
2392 }
2393
2394 if (dev->num_tc) {
2395 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2396 qoffset = dev->tc_to_txq[tc].offset;
2397 qcount = dev->tc_to_txq[tc].count;
2398 }
2399
2400 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2401}
2402EXPORT_SYMBOL(__skb_tx_hash);
2403
Ben Hutchings36c92472012-01-17 07:57:56 +00002404static void skb_warn_bad_offload(const struct sk_buff *skb)
2405{
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002406 static const netdev_features_t null_features = 0;
Ben Hutchings36c92472012-01-17 07:57:56 +00002407 struct net_device *dev = skb->dev;
Bjørn Mork88ad4172015-11-16 19:16:40 +01002408 const char *name = "";
Ben Hutchings36c92472012-01-17 07:57:56 +00002409
Ben Greearc846ad92013-04-19 10:45:52 +00002410 if (!net_ratelimit())
2411 return;
2412
Bjørn Mork88ad4172015-11-16 19:16:40 +01002413 if (dev) {
2414 if (dev->dev.parent)
2415 name = dev_driver_string(dev->dev.parent);
2416 else
2417 name = netdev_name(dev);
2418 }
Ben Hutchings36c92472012-01-17 07:57:56 +00002419 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2420 "gso_type=%d ip_summed=%d\n",
Bjørn Mork88ad4172015-11-16 19:16:40 +01002421 name, dev ? &dev->features : &null_features,
Michał Mirosław65e9d2f2012-01-17 10:00:40 +00002422 skb->sk ? &skb->sk->sk_route_caps : &null_features,
Ben Hutchings36c92472012-01-17 07:57:56 +00002423 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2424 skb_shinfo(skb)->gso_type, skb->ip_summed);
2425}
2426
Linus Torvalds1da177e2005-04-16 15:20:36 -07002427/*
2428 * Invalidate hardware checksum when packet is to be mangled, and
2429 * complete checksum manually on outgoing path.
2430 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07002431int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002432{
Al Virod3bc23e2006-11-14 21:24:49 -08002433 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07002434 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002435
Patrick McHardy84fa7932006-08-29 16:44:56 -07002436 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07002437 goto out_set_summed;
2438
2439 if (unlikely(skb_shinfo(skb)->gso_size)) {
Ben Hutchings36c92472012-01-17 07:57:56 +00002440 skb_warn_bad_offload(skb);
2441 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002442 }
2443
Eric Dumazetcef401d2013-01-25 20:34:37 +00002444 /* Before computing a checksum, we should make sure no frag could
2445 * be modified by an external entity : checksum could be wrong.
2446 */
2447 if (skb_has_shared_frag(skb)) {
2448 ret = __skb_linearize(skb);
2449 if (ret)
2450 goto out;
2451 }
2452
Michał Mirosław55508d62010-12-14 15:24:08 +00002453 offset = skb_checksum_start_offset(skb);
Herbert Xua0308472007-10-15 01:47:15 -07002454 BUG_ON(offset >= skb_headlen(skb));
2455 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2456
2457 offset += skb->csum_offset;
2458 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2459
2460 if (skb_cloned(skb) &&
2461 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002462 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2463 if (ret)
2464 goto out;
2465 }
2466
Herbert Xua0308472007-10-15 01:47:15 -07002467 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07002468out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002469 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002470out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002471 return ret;
2472}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002473EXPORT_SYMBOL(skb_checksum_help);
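
/* Illustrative sketch (not part of this file): a driver falling back to a
 * software checksum when its hardware cannot offload a particular packet.
 * The capability check is hypothetical.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !my_hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 */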
Linus Torvalds1da177e2005-04-16 15:20:36 -07002474
Tom Herbert6ae23ad2015-12-14 11:19:46 -08002475/* __skb_csum_offload_chk - Driver helper function to determine if a device
2476 * with limited checksum offload capabilities is able to offload the checksum
2477 * for a given packet.
2478 *
2479 * Arguments:
2480 * skb - sk_buff for the packet in question
2481 * spec - contains the description of what device can offload
2482 * csum_encapped - returns true if the checksum being offloaded is
2483 *	encapsulated. That is, it is the checksum for the transport header
2484 * in the inner headers.
2485 * checksum_help - when set indicates that helper function should
2486 * call skb_checksum_help if offload checks fail
2487 *
2488 * Returns:
2489 * true: Packet has passed the checksum checks and should be offloadable to
2490 * the device (a driver may still need to check for additional
2491 * restrictions of its device)
2492 * false: Checksum is not offloadable. If checksum_help was set then
2493 * skb_checksum_help was called to resolve checksum for non-GSO
2494 * packets and when IP protocol is not SCTP
2495 */
2496bool __skb_csum_offload_chk(struct sk_buff *skb,
2497 const struct skb_csum_offl_spec *spec,
2498 bool *csum_encapped,
2499 bool csum_help)
2500{
2501 struct iphdr *iph;
2502 struct ipv6hdr *ipv6;
2503 void *nhdr;
2504 int protocol;
2505 u8 ip_proto;
2506
2507 if (skb->protocol == htons(ETH_P_8021Q) ||
2508 skb->protocol == htons(ETH_P_8021AD)) {
2509 if (!spec->vlan_okay)
2510 goto need_help;
2511 }
2512
2513 /* We check whether the checksum refers to a transport layer checksum in
2514 * the outermost header or an encapsulated transport layer checksum that
2515 * corresponds to the inner headers of the skb. If the checksum is for
2516 * something else in the packet we need help.
2517 */
2518 if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2519 /* Non-encapsulated checksum */
2520 protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2521 nhdr = skb_network_header(skb);
2522 *csum_encapped = false;
2523 if (spec->no_not_encapped)
2524 goto need_help;
2525 } else if (skb->encapsulation && spec->encap_okay &&
2526 skb_checksum_start_offset(skb) ==
2527 skb_inner_transport_offset(skb)) {
2528 /* Encapsulated checksum */
2529 *csum_encapped = true;
2530 switch (skb->inner_protocol_type) {
2531 case ENCAP_TYPE_ETHER:
2532 protocol = eproto_to_ipproto(skb->inner_protocol);
2533 break;
2534 case ENCAP_TYPE_IPPROTO:
2535 protocol = skb->inner_protocol;
2536 break;
2537 }
2538 nhdr = skb_inner_network_header(skb);
2539 } else {
2540 goto need_help;
2541 }
2542
2543 switch (protocol) {
2544 case IPPROTO_IP:
2545 if (!spec->ipv4_okay)
2546 goto need_help;
2547 iph = nhdr;
2548 ip_proto = iph->protocol;
2549 if (iph->ihl != 5 && !spec->ip_options_okay)
2550 goto need_help;
2551 break;
2552 case IPPROTO_IPV6:
2553 if (!spec->ipv6_okay)
2554 goto need_help;
2555 if (spec->no_encapped_ipv6 && *csum_encapped)
2556 goto need_help;
2557 ipv6 = nhdr;
2558 nhdr += sizeof(*ipv6);
2559 ip_proto = ipv6->nexthdr;
2560 break;
2561 default:
2562 goto need_help;
2563 }
2564
2565ip_proto_again:
2566 switch (ip_proto) {
2567 case IPPROTO_TCP:
2568 if (!spec->tcp_okay ||
2569 skb->csum_offset != offsetof(struct tcphdr, check))
2570 goto need_help;
2571 break;
2572 case IPPROTO_UDP:
2573 if (!spec->udp_okay ||
2574 skb->csum_offset != offsetof(struct udphdr, check))
2575 goto need_help;
2576 break;
2577 case IPPROTO_SCTP:
2578 if (!spec->sctp_okay ||
2579 skb->csum_offset != offsetof(struct sctphdr, checksum))
2580 goto cant_help;
2581 break;
2582 case NEXTHDR_HOP:
2583 case NEXTHDR_ROUTING:
2584 case NEXTHDR_DEST: {
2585 u8 *opthdr = nhdr;
2586
2587 if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2588 goto need_help;
2589
2590 ip_proto = opthdr[0];
2591 nhdr += (opthdr[1] + 1) << 3;
2592
2593 goto ip_proto_again;
2594 }
2595 default:
2596 goto need_help;
2597 }
2598
2599 /* Passed the tests for offloading checksum */
2600 return true;
2601
2602need_help:
2603 if (csum_help && !skb_shinfo(skb)->gso_size)
2604 skb_checksum_help(skb);
2605cant_help:
2606 return false;
2607}
2608EXPORT_SYMBOL(__skb_csum_offload_chk);
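/*
 * Illustrative usage sketch (not part of the original source): a driver whose
 * hardware can only checksum plain TCP/UDP over IPv4/IPv6 could describe its
 * limits in a struct skb_csum_offl_spec and call the helper from its transmit
 * path.  The names example_csum_spec/example_xmit below are hypothetical and
 * only show the intended calling convention; passing csum_help = true asks
 * the helper to fall back to skb_checksum_help() for non-GSO packets the
 * hardware cannot handle, so the driver can still transmit them.
 *
 *	static const struct skb_csum_offl_spec example_csum_spec = {
 *		.ipv4_okay = 1,
 *		.ipv6_okay = 1,
 *		.vlan_okay = 1,
 *		.tcp_okay  = 1,
 *		.udp_okay  = 1,
 *	};
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		bool csum_encapped;
 *		bool hw_csum = false;
 *
 *		if (skb->ip_summed == CHECKSUM_PARTIAL)
 *			hw_csum = __skb_csum_offload_chk(skb, &example_csum_spec,
 *							 &csum_encapped, true);
 *		... program descriptors, requesting hardware checksumming
 *		    only when hw_csum is true ...
 *	}
 */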
2609
Vlad Yasevich53d64712014-03-27 17:26:18 -04002610__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002611{
2612 __be16 type = skb->protocol;
2613
Pravin B Shelar19acc322013-05-07 20:41:07 +00002614 /* Tunnel gso handlers can set protocol to ethernet. */
2615 if (type == htons(ETH_P_TEB)) {
2616 struct ethhdr *eth;
2617
2618 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2619 return 0;
2620
2621 eth = (struct ethhdr *)skb_mac_header(skb);
2622 type = eth->h_proto;
2623 }
2624
Toshiaki Makitad4bcef32015-01-29 20:37:07 +09002625 return __vlan_get_protocol(skb, type, depth);
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002626}
2627
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002628/**
2629 * skb_mac_gso_segment - mac layer segmentation handler.
2630 * @skb: buffer to segment
2631 * @features: features for the output path (see dev->features)
2632 */
2633struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2634 netdev_features_t features)
2635{
2636 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2637 struct packet_offload *ptype;
Vlad Yasevich53d64712014-03-27 17:26:18 -04002638 int vlan_depth = skb->mac_len;
2639 __be16 type = skb_network_protocol(skb, &vlan_depth);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002640
Pravin B Shelarec5f0612013-03-07 09:28:01 +00002641 if (unlikely(!type))
2642 return ERR_PTR(-EINVAL);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002643
Vlad Yasevich53d64712014-03-27 17:26:18 -04002644 __skb_pull(skb, vlan_depth);
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002645
2646 rcu_read_lock();
2647 list_for_each_entry_rcu(ptype, &offload_base, list) {
2648 if (ptype->type == type && ptype->callbacks.gso_segment) {
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002649 segs = ptype->callbacks.gso_segment(skb, features);
2650 break;
2651 }
2652 }
2653 rcu_read_unlock();
2654
2655 __skb_push(skb, skb->data - skb_mac_header(skb));
2656
2657 return segs;
2658}
2659EXPORT_SYMBOL(skb_mac_gso_segment);
2660
2661
Cong Wang12b00042013-02-05 16:36:38 +00002662/* Open vSwitch calls this on the rx path, so we need a different check.
2663 */
2664static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2665{
2666 if (tx_path)
2667 return skb->ip_summed != CHECKSUM_PARTIAL;
2668 else
2669 return skb->ip_summed == CHECKSUM_NONE;
2670}
2671
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002672/**
Cong Wang12b00042013-02-05 16:36:38 +00002673 * __skb_gso_segment - Perform segmentation on skb.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002674 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07002675 * @features: features for the output path (see dev->features)
Cong Wang12b00042013-02-05 16:36:38 +00002676 * @tx_path: whether it is called in TX path
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002677 *
2678 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07002679 *
2680 * It may return NULL if the skb requires no segmentation. This is
2681 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002682 */
Cong Wang12b00042013-02-05 16:36:38 +00002683struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2684 netdev_features_t features, bool tx_path)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002685{
Cong Wang12b00042013-02-05 16:36:38 +00002686 if (unlikely(skb_needs_check(skb, tx_path))) {
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002687 int err;
2688
Ben Hutchings36c92472012-01-17 07:57:56 +00002689 skb_warn_bad_offload(skb);
Herbert Xu67fd1a72009-01-19 16:26:44 -08002690
françois romieua40e0a62014-07-15 23:55:35 +02002691 err = skb_cow_head(skb, 0);
2692 if (err < 0)
Herbert Xua430a432006-07-08 13:34:56 -07002693 return ERR_PTR(err);
2694 }
2695
Pravin B Shelar68c33162013-02-14 14:02:41 +00002696 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
Eric Dumazet3347c962013-10-19 11:42:56 -07002697 SKB_GSO_CB(skb)->encap_level = 0;
2698
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002699 skb_reset_mac_header(skb);
2700 skb_reset_mac_len(skb);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002701
Pravin B Shelar05e8ef42013-02-14 09:44:55 +00002702 return skb_mac_gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002703}
Cong Wang12b00042013-02-05 16:36:38 +00002704EXPORT_SYMBOL(__skb_gso_segment);
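/*
 * Illustrative sketch (hypothetical caller, not part of this file): software
 * GSO is normally driven through the skb_gso_segment() wrapper, which calls
 * __skb_gso_segment() with tx_path = true.  The caller owns the returned
 * list and transmits it one segment at a time, e.g.:
 *
 *	struct sk_buff *segs, *next;
 *
 *	segs = skb_gso_segment(skb, features);
 *	if (IS_ERR(segs))
 *		goto drop;			... segmentation failed ...
 *	if (segs) {
 *		consume_skb(skb);		... replaced by the list ...
 *		do {
 *			next = segs->next;
 *			segs->next = NULL;
 *			... transmit one segment ...
 *			segs = next;
 *		} while (segs);
 *	}
 */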
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002705
Herbert Xufb286bb2005-11-10 13:01:24 -08002706/* Take action when hardware reception checksum errors are detected. */
2707#ifdef CONFIG_BUG
2708void netdev_rx_csum_fault(struct net_device *dev)
2709{
2710 if (net_ratelimit()) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00002711 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08002712 dump_stack();
2713 }
2714}
2715EXPORT_SYMBOL(netdev_rx_csum_fault);
2716#endif
2717
Linus Torvalds1da177e2005-04-16 15:20:36 -07002718/* Actually, we should eliminate this check as soon as we know that:
 2719 * 1. An IOMMU is present and allows mapping of all the memory.
2720 * 2. No high memory really exists on this machine.
2721 */
2722
Florian Westphalc1e756b2014-05-05 15:00:44 +02002723static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002724{
Herbert Xu3d3a8532006-06-27 13:33:10 -07002725#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07002726 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002727 if (!(dev->features & NETIF_F_HIGHDMA)) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002728 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2729 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2730 if (PageHighMem(skb_frag_page(frag)))
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002731 return 1;
Ian Campbellea2ab692011-08-22 23:44:58 +00002732 }
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002733 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002734
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002735 if (PCI_DMA_BUS_IS_PHYS) {
2736 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002737
Eric Dumazet9092c652010-04-02 13:34:49 -07002738 if (!pdev)
2739 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002740 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
Ian Campbellea2ab692011-08-22 23:44:58 +00002741 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2742 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00002743 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2744 return 1;
2745 }
2746 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07002747#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002748 return 0;
2749}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002750
Simon Horman3b392dd2014-06-04 08:53:17 +09002751/* If MPLS offload request, verify we are testing hardware MPLS features
2752 * instead of standard features for the netdev.
2753 */
Pravin B Shelard0edc7b2014-12-23 16:20:11 -08002754#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
Simon Horman3b392dd2014-06-04 08:53:17 +09002755static netdev_features_t net_mpls_features(struct sk_buff *skb,
2756 netdev_features_t features,
2757 __be16 type)
2758{
Simon Horman25cd9ba2014-10-06 05:05:13 -07002759 if (eth_p_mpls(type))
Simon Horman3b392dd2014-06-04 08:53:17 +09002760 features &= skb->dev->mpls_features;
2761
2762 return features;
2763}
2764#else
2765static netdev_features_t net_mpls_features(struct sk_buff *skb,
2766 netdev_features_t features,
2767 __be16 type)
2768{
2769 return features;
2770}
2771#endif
2772
Michał Mirosławc8f44af2011-11-15 15:29:55 +00002773static netdev_features_t harmonize_features(struct sk_buff *skb,
Florian Westphalc1e756b2014-05-05 15:00:44 +02002774 netdev_features_t features)
Jesse Grossf01a5232011-01-09 06:23:31 +00002775{
Vlad Yasevich53d64712014-03-27 17:26:18 -04002776 int tmp;
Simon Horman3b392dd2014-06-04 08:53:17 +09002777 __be16 type;
2778
2779 type = skb_network_protocol(skb, &tmp);
2780 features = net_mpls_features(skb, features, type);
Vlad Yasevich53d64712014-03-27 17:26:18 -04002781
Ed Cashinc0d680e2012-09-19 15:49:00 +00002782 if (skb->ip_summed != CHECKSUM_NONE &&
Simon Horman3b392dd2014-06-04 08:53:17 +09002783 !can_checksum_protocol(features, type)) {
Tom Herberta1882222015-12-14 11:19:43 -08002784 features &= ~NETIF_F_CSUM_MASK;
Florian Westphalc1e756b2014-05-05 15:00:44 +02002785 } else if (illegal_highdma(skb->dev, skb)) {
Jesse Grossf01a5232011-01-09 06:23:31 +00002786 features &= ~NETIF_F_SG;
2787 }
2788
2789 return features;
2790}
2791
Toshiaki Makitae38f3022015-03-27 14:31:13 +09002792netdev_features_t passthru_features_check(struct sk_buff *skb,
2793 struct net_device *dev,
2794 netdev_features_t features)
2795{
2796 return features;
2797}
2798EXPORT_SYMBOL(passthru_features_check);
2799
Toshiaki Makita8cb65d02015-03-27 14:31:12 +09002800static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2801 struct net_device *dev,
2802 netdev_features_t features)
2803{
2804 return vlan_features_check(skb, features);
2805}
2806
Florian Westphalc1e756b2014-05-05 15:00:44 +02002807netdev_features_t netif_skb_features(struct sk_buff *skb)
Jesse Gross58e998c2010-10-29 12:14:55 +00002808{
Jesse Gross5f352272014-12-23 22:37:26 -08002809 struct net_device *dev = skb->dev;
Eric Dumazetfcbeb972014-10-05 10:11:27 -07002810 netdev_features_t features = dev->features;
2811 u16 gso_segs = skb_shinfo(skb)->gso_segs;
Jesse Gross58e998c2010-10-29 12:14:55 +00002812
Eric Dumazetfcbeb972014-10-05 10:11:27 -07002813 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
Ben Hutchings30b678d2012-07-30 15:57:00 +00002814 features &= ~NETIF_F_GSO_MASK;
2815
Jesse Gross5f352272014-12-23 22:37:26 -08002816 /* If encapsulation offload request, verify we are testing
2817 * hardware encapsulation features instead of standard
2818 * features for the netdev
2819 */
2820 if (skb->encapsulation)
2821 features &= dev->hw_enc_features;
2822
Toshiaki Makitaf5a7fb82015-03-27 14:31:11 +09002823 if (skb_vlan_tagged(skb))
2824 features = netdev_intersect_features(features,
2825 dev->vlan_features |
2826 NETIF_F_HW_VLAN_CTAG_TX |
2827 NETIF_F_HW_VLAN_STAG_TX);
Jesse Gross58e998c2010-10-29 12:14:55 +00002828
Jesse Gross5f352272014-12-23 22:37:26 -08002829 if (dev->netdev_ops->ndo_features_check)
2830 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2831 features);
Toshiaki Makita8cb65d02015-03-27 14:31:12 +09002832 else
2833 features &= dflt_features_check(skb, dev, features);
Jesse Gross5f352272014-12-23 22:37:26 -08002834
Florian Westphalc1e756b2014-05-05 15:00:44 +02002835 return harmonize_features(skb, features);
Jesse Gross58e998c2010-10-29 12:14:55 +00002836}
Florian Westphalc1e756b2014-05-05 15:00:44 +02002837EXPORT_SYMBOL(netif_skb_features);
Jesse Gross58e998c2010-10-29 12:14:55 +00002838
David S. Miller2ea25512014-08-29 21:10:01 -07002839static int xmit_one(struct sk_buff *skb, struct net_device *dev,
David S. Miller95f6b3d2014-08-29 21:57:30 -07002840 struct netdev_queue *txq, bool more)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002841{
David S. Miller2ea25512014-08-29 21:10:01 -07002842 unsigned int len;
2843 int rc;
Stephen Hemminger00829822008-11-20 20:14:53 -08002844
Salam Noureddine7866a622015-01-27 11:35:48 -08002845 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
David S. Miller2ea25512014-08-29 21:10:01 -07002846 dev_queue_xmit_nit(skb, dev);
Jesse Grossfc741212011-01-09 06:23:32 +00002847
David S. Miller2ea25512014-08-29 21:10:01 -07002848 len = skb->len;
2849 trace_net_dev_start_xmit(skb, dev);
David S. Miller95f6b3d2014-08-29 21:57:30 -07002850 rc = netdev_start_xmit(skb, dev, txq, more);
David S. Miller2ea25512014-08-29 21:10:01 -07002851 trace_net_dev_xmit(skb, rc, dev, len);
Eric Dumazetadf30902009-06-02 05:19:30 +00002852
Patrick McHardy572a9d72009-11-10 06:14:14 +00002853 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002854}
David S. Miller2ea25512014-08-29 21:10:01 -07002855
David S. Miller8dcda222014-09-01 15:06:40 -07002856struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2857 struct netdev_queue *txq, int *ret)
David S. Miller7f2e8702014-08-29 21:19:14 -07002858{
2859 struct sk_buff *skb = first;
2860 int rc = NETDEV_TX_OK;
2861
2862 while (skb) {
2863 struct sk_buff *next = skb->next;
2864
2865 skb->next = NULL;
David S. Miller95f6b3d2014-08-29 21:57:30 -07002866 rc = xmit_one(skb, dev, txq, next != NULL);
David S. Miller7f2e8702014-08-29 21:19:14 -07002867 if (unlikely(!dev_xmit_complete(rc))) {
2868 skb->next = next;
2869 goto out;
2870 }
2871
2872 skb = next;
2873 if (netif_xmit_stopped(txq) && skb) {
2874 rc = NETDEV_TX_BUSY;
2875 break;
2876 }
2877 }
2878
2879out:
2880 *ret = rc;
2881 return skb;
2882}
2883
Eric Dumazet1ff0dc92014-10-06 11:26:27 -07002884static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2885 netdev_features_t features)
David S. Millereae3f882014-08-30 15:17:13 -07002886{
Jiri Pirkodf8a39d2015-01-13 17:13:44 +01002887 if (skb_vlan_tag_present(skb) &&
Jiri Pirko59682502014-11-19 14:04:59 +01002888 !vlan_hw_offload_capable(features, skb->vlan_proto))
2889 skb = __vlan_hwaccel_push_inside(skb);
David S. Millereae3f882014-08-30 15:17:13 -07002890 return skb;
2891}
2892
Eric Dumazet55a93b32014-10-03 15:31:07 -07002893static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
David S. Millereae3f882014-08-30 15:17:13 -07002894{
2895 netdev_features_t features;
2896
2897 if (skb->next)
2898 return skb;
2899
David S. Millereae3f882014-08-30 15:17:13 -07002900 features = netif_skb_features(skb);
2901 skb = validate_xmit_vlan(skb, features);
2902 if (unlikely(!skb))
2903 goto out_null;
2904
Johannes Berg8b86a612015-04-17 15:45:04 +02002905 if (netif_needs_gso(skb, features)) {
David S. Millerce937182014-08-30 19:22:20 -07002906 struct sk_buff *segs;
2907
2908 segs = skb_gso_segment(skb, features);
Jason Wangcecda692014-09-19 16:04:38 +08002909 if (IS_ERR(segs)) {
Jason Wangaf6dabc2014-12-19 11:09:13 +08002910 goto out_kfree_skb;
Jason Wangcecda692014-09-19 16:04:38 +08002911 } else if (segs) {
2912 consume_skb(skb);
2913 skb = segs;
2914 }
David S. Millereae3f882014-08-30 15:17:13 -07002915 } else {
2916 if (skb_needs_linearize(skb, features) &&
2917 __skb_linearize(skb))
2918 goto out_kfree_skb;
2919
2920 /* If packet is not checksummed and device does not
2921 * support checksumming for this protocol, complete
2922 * checksumming here.
2923 */
2924 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2925 if (skb->encapsulation)
2926 skb_set_inner_transport_header(skb,
2927 skb_checksum_start_offset(skb));
2928 else
2929 skb_set_transport_header(skb,
2930 skb_checksum_start_offset(skb));
Tom Herberta1882222015-12-14 11:19:43 -08002931 if (!(features & NETIF_F_CSUM_MASK) &&
David S. Millereae3f882014-08-30 15:17:13 -07002932 skb_checksum_help(skb))
2933 goto out_kfree_skb;
2934 }
2935 }
2936
2937 return skb;
2938
2939out_kfree_skb:
2940 kfree_skb(skb);
2941out_null:
2942 return NULL;
2943}
2944
Eric Dumazet55a93b32014-10-03 15:31:07 -07002945struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2946{
2947 struct sk_buff *next, *head = NULL, *tail;
2948
Eric Dumazetbec3cfd2014-10-03 20:59:19 -07002949 for (; skb != NULL; skb = next) {
Eric Dumazet55a93b32014-10-03 15:31:07 -07002950 next = skb->next;
2951 skb->next = NULL;
Eric Dumazet55a93b32014-10-03 15:31:07 -07002952
Eric Dumazetbec3cfd2014-10-03 20:59:19 -07002953 /* In case the skb won't be segmented, make skb->prev point to itself */
2954 skb->prev = skb;
2955
2956 skb = validate_xmit_skb(skb, dev);
2957 if (!skb)
2958 continue;
2959
2960 if (!head)
2961 head = skb;
2962 else
2963 tail->next = skb;
2964 /* If skb was segmented, skb->prev points to
2965 * the last segment. If not, it still contains skb.
2966 */
2967 tail = skb->prev;
Eric Dumazet55a93b32014-10-03 15:31:07 -07002968 }
2969 return head;
2970}
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002971
Eric Dumazet1def9232013-01-10 12:36:42 +00002972static void qdisc_pkt_len_init(struct sk_buff *skb)
2973{
2974 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2975
2976 qdisc_skb_cb(skb)->pkt_len = skb->len;
2977
 2978 /* To get a more precise estimate of the bytes sent on the wire,
 2979 * we add to pkt_len the header size of all segments
2980 */
2981 if (shinfo->gso_size) {
Eric Dumazet757b8b12013-01-15 21:14:21 -08002982 unsigned int hdr_len;
Jason Wang15e5a032013-03-25 20:19:59 +00002983 u16 gso_segs = shinfo->gso_segs;
Eric Dumazet1def9232013-01-10 12:36:42 +00002984
Eric Dumazet757b8b12013-01-15 21:14:21 -08002985 /* mac layer + network layer */
2986 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2987
2988 /* + transport layer */
Eric Dumazet1def9232013-01-10 12:36:42 +00002989 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2990 hdr_len += tcp_hdrlen(skb);
2991 else
2992 hdr_len += sizeof(struct udphdr);
Jason Wang15e5a032013-03-25 20:19:59 +00002993
2994 if (shinfo->gso_type & SKB_GSO_DODGY)
2995 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2996 shinfo->gso_size);
2997
2998 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
Eric Dumazet1def9232013-01-10 12:36:42 +00002999 }
3000}
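/*
 * Worked example (illustrative, assuming typical TSO numbers): for a TCP
 * super-packet with gso_size = 1448, gso_segs = 10 and 66 bytes of headers
 * (14 Ethernet + 20 IPv4 + 32 TCP with timestamps), skb->len is
 * 66 + 10 * 1448 = 14546 and hdr_len is 66, so pkt_len becomes
 * 14546 + (10 - 1) * 66 = 15140, which matches the 10 * (1448 + 66) bytes
 * that actually appear on the wire.
 */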
3001
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00003002static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3003 struct net_device *dev,
3004 struct netdev_queue *txq)
3005{
3006 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazeta2da5702011-01-20 03:48:19 +00003007 bool contended;
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00003008 int rc;
3009
Eric Dumazet1def9232013-01-10 12:36:42 +00003010 qdisc_pkt_len_init(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00003011 qdisc_calculate_pkt_len(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07003012 /*
3013 * Heuristic to force contended enqueues to serialize on a
3014 * separate lock before trying to get qdisc main lock.
Ying Xue9bf2b8c2014-06-26 15:56:31 +08003015 * This permits __QDISC___STATE_RUNNING owner to get the lock more
3016 * often and dequeue packets faster.
Eric Dumazet79640a42010-06-02 05:09:29 -07003017 */
Eric Dumazeta2da5702011-01-20 03:48:19 +00003018 contended = qdisc_is_running(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07003019 if (unlikely(contended))
3020 spin_lock(&q->busylock);
3021
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00003022 spin_lock(root_lock);
3023 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3024 kfree_skb(skb);
3025 rc = NET_XMIT_DROP;
3026 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07003027 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00003028 /*
3029 * This is a work-conserving queue; there are no old skbs
3030 * waiting to be sent out; and the qdisc is not running -
3031 * xmit the skb directly.
3032 */
Eric Dumazetbfe0d022011-01-09 08:30:54 +00003033
Eric Dumazetbfe0d022011-01-09 08:30:54 +00003034 qdisc_bstats_update(q, skb);
3035
Eric Dumazet55a93b32014-10-03 15:31:07 -07003036 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
Eric Dumazet79640a42010-06-02 05:09:29 -07003037 if (unlikely(contended)) {
3038 spin_unlock(&q->busylock);
3039 contended = false;
3040 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00003041 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07003042 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07003043 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00003044
3045 rc = NET_XMIT_SUCCESS;
3046 } else {
Eric Dumazeta2da5702011-01-20 03:48:19 +00003047 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
Eric Dumazet79640a42010-06-02 05:09:29 -07003048 if (qdisc_run_begin(q)) {
3049 if (unlikely(contended)) {
3050 spin_unlock(&q->busylock);
3051 contended = false;
3052 }
3053 __qdisc_run(q);
3054 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00003055 }
3056 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07003057 if (unlikely(contended))
3058 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00003059 return rc;
3060}
3061
Daniel Borkmann86f85152013-12-29 17:27:11 +01003062#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
Neil Horman5bc14212011-11-22 05:10:51 +00003063static void skb_update_prio(struct sk_buff *skb)
3064{
Igor Maravic6977a792011-11-25 07:44:54 +00003065 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
Neil Horman5bc14212011-11-22 05:10:51 +00003066
Eric Dumazet91c68ce2012-07-08 21:45:10 +00003067 if (!skb->priority && skb->sk && map) {
Tejun Heo2a56a1f2015-12-07 17:38:52 -05003068 unsigned int prioidx =
3069 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
Eric Dumazet91c68ce2012-07-08 21:45:10 +00003070
3071 if (prioidx < map->priomap_len)
3072 skb->priority = map->priomap[prioidx];
3073 }
Neil Horman5bc14212011-11-22 05:10:51 +00003074}
3075#else
3076#define skb_update_prio(skb)
3077#endif
3078
hannes@stressinduktion.orgf60e5992015-04-01 17:07:44 +02003079DEFINE_PER_CPU(int, xmit_recursion);
3080EXPORT_SYMBOL(xmit_recursion);
3081
David S. Miller11a766c2010-10-25 12:51:55 -07003082#define RECURSION_LIMIT 10
Eric Dumazet745e20f2010-09-29 13:23:09 -07003083
Dave Jonesd29f7492008-07-22 14:09:06 -07003084/**
Michel Machado95603e22012-06-12 10:16:35 +00003085 * dev_loopback_xmit - loop back @skb
Eric W. Biederman0c4b51f2015-09-15 20:04:18 -05003086 * @net: network namespace this loopback is happening in
 3087 * @sk: the socket; only needed so this function can be used as a netfilter okfn
Michel Machado95603e22012-06-12 10:16:35 +00003088 * @skb: buffer to transmit
3089 */
Eric W. Biederman0c4b51f2015-09-15 20:04:18 -05003090int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
Michel Machado95603e22012-06-12 10:16:35 +00003091{
3092 skb_reset_mac_header(skb);
3093 __skb_pull(skb, skb_network_offset(skb));
3094 skb->pkt_type = PACKET_LOOPBACK;
3095 skb->ip_summed = CHECKSUM_UNNECESSARY;
3096 WARN_ON(!skb_dst(skb));
3097 skb_dst_force(skb);
3098 netif_rx_ni(skb);
3099 return 0;
3100}
3101EXPORT_SYMBOL(dev_loopback_xmit);
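/*
 * Illustrative sketch: dev_loopback_xmit() is intended to be used as the
 * okfn of a netfilter hook when a copy of an outgoing packet has to be
 * looped back to the local stack, roughly (hypothetical call site, modelled
 * on the multicast output path):
 *
 *	NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 *		net, sk, newskb, NULL, newskb->dev,
 *		dev_loopback_xmit);
 */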
3102
Jiri Pirko638b2a62015-05-12 14:56:13 +02003103static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3104{
3105#ifdef CONFIG_XPS
3106 struct xps_dev_maps *dev_maps;
3107 struct xps_map *map;
3108 int queue_index = -1;
3109
3110 rcu_read_lock();
3111 dev_maps = rcu_dereference(dev->xps_maps);
3112 if (dev_maps) {
3113 map = rcu_dereference(
3114 dev_maps->cpu_map[skb->sender_cpu - 1]);
3115 if (map) {
3116 if (map->len == 1)
3117 queue_index = map->queues[0];
3118 else
3119 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3120 map->len)];
3121 if (unlikely(queue_index >= dev->real_num_tx_queues))
3122 queue_index = -1;
3123 }
3124 }
3125 rcu_read_unlock();
3126
3127 return queue_index;
3128#else
3129 return -1;
3130#endif
3131}
3132
3133static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3134{
3135 struct sock *sk = skb->sk;
3136 int queue_index = sk_tx_queue_get(sk);
3137
3138 if (queue_index < 0 || skb->ooo_okay ||
3139 queue_index >= dev->real_num_tx_queues) {
3140 int new_index = get_xps_queue(dev, skb);
3141 if (new_index < 0)
3142 new_index = skb_tx_hash(dev, skb);
3143
3144 if (queue_index != new_index && sk &&
Eric Dumazet004a5d02015-10-04 21:08:10 -07003145 sk_fullsock(sk) &&
Jiri Pirko638b2a62015-05-12 14:56:13 +02003146 rcu_access_pointer(sk->sk_dst_cache))
3147 sk_tx_queue_set(sk, new_index);
3148
3149 queue_index = new_index;
3150 }
3151
3152 return queue_index;
3153}
3154
3155struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3156 struct sk_buff *skb,
3157 void *accel_priv)
3158{
3159 int queue_index = 0;
3160
3161#ifdef CONFIG_XPS
Eric Dumazet52bd2d62015-11-18 06:30:50 -08003162 u32 sender_cpu = skb->sender_cpu - 1;
3163
3164 if (sender_cpu >= (u32)NR_CPUS)
Jiri Pirko638b2a62015-05-12 14:56:13 +02003165 skb->sender_cpu = raw_smp_processor_id() + 1;
3166#endif
3167
3168 if (dev->real_num_tx_queues != 1) {
3169 const struct net_device_ops *ops = dev->netdev_ops;
3170 if (ops->ndo_select_queue)
3171 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3172 __netdev_pick_tx);
3173 else
3174 queue_index = __netdev_pick_tx(dev, skb);
3175
3176 if (!accel_priv)
3177 queue_index = netdev_cap_txqueue(dev, queue_index);
3178 }
3179
3180 skb_set_queue_mapping(skb, queue_index);
3181 return netdev_get_tx_queue(dev, queue_index);
3182}
3183
Michel Machado95603e22012-06-12 10:16:35 +00003184/**
Jason Wang9d08dd32014-01-20 11:25:13 +08003185 * __dev_queue_xmit - transmit a buffer
Dave Jonesd29f7492008-07-22 14:09:06 -07003186 * @skb: buffer to transmit
Jason Wang9d08dd32014-01-20 11:25:13 +08003187 * @accel_priv: private data used for L2 forwarding offload
Dave Jonesd29f7492008-07-22 14:09:06 -07003188 *
3189 * Queue a buffer for transmission to a network device. The caller must
3190 * have set the device and priority and built the buffer before calling
3191 * this function. The function can be called from an interrupt.
3192 *
3193 * A negative errno code is returned on a failure. A success does not
3194 * guarantee the frame will be transmitted as it may be dropped due
3195 * to congestion or traffic shaping.
3196 *
3197 * -----------------------------------------------------------------------------------
3198 * I notice this method can also return errors from the queue disciplines,
3199 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3200 * be positive.
3201 *
3202 * Regardless of the return value, the skb is consumed, so it is currently
3203 * difficult to retry a send to this method. (You can bump the ref count
3204 * before sending to hold a reference for retry if you are careful.)
3205 *
3206 * When calling this method, interrupts MUST be enabled. This is because
3207 * the BH enable code must have IRQs enabled so that it will not deadlock.
3208 * --BLG
3209 */
Rashika Kheria0a59f3a2014-02-09 20:26:25 +05303210static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003211{
3212 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07003213 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003214 struct Qdisc *q;
3215 int rc = -ENOMEM;
3216
Eric Dumazet6d1ccff2013-02-05 20:22:20 +00003217 skb_reset_mac_header(skb);
3218
Willem de Bruijne7fd2882014-08-04 22:11:48 -04003219 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3220 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3221
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003222 /* Disable soft irqs for various locks below. Also
3223 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003224 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003225 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003226
Neil Horman5bc14212011-11-22 05:10:51 +00003227 skb_update_prio(skb);
3228
Eric Dumazet02875872014-10-05 18:38:35 -07003229 /* If the device/qdisc doesn't need skb->dst, release it right now while
 3230 * it's hot in this CPU's cache.
3231 */
3232 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3233 skb_dst_drop(skb);
3234 else
3235 skb_dst_force(skb);
3236
Scott Feldman0c4f6912015-07-18 18:24:48 -07003237#ifdef CONFIG_NET_SWITCHDEV
3238 /* Don't forward if offload device already forwarded */
3239 if (skb->offload_fwd_mark &&
3240 skb->offload_fwd_mark == dev->offload_fwd_mark) {
3241 consume_skb(skb);
3242 rc = NET_XMIT_SUCCESS;
3243 goto out;
3244 }
3245#endif
3246
Jason Wangf663dd92014-01-10 16:18:26 +08003247 txq = netdev_pick_tx(dev, skb, accel_priv);
Paul E. McKenneya898def2010-02-22 17:04:49 -08003248 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07003249
Linus Torvalds1da177e2005-04-16 15:20:36 -07003250#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003251 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003252#endif
Koki Sanagicf66ba52010-08-23 18:45:02 +09003253 trace_net_dev_queue(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003254 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00003255 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07003256 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003257 }
3258
3259 /* The device has no queue. Common case for software devices:
 3260 loopback, all sorts of tunnels...
 3261
Herbert Xu932ff272006-06-09 12:20:56 -07003262 Really, it is unlikely that netif_tx_lock protection is necessary
 3263 here. (E.g. loopback and IP tunnels are clean if we ignore statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07003264 counters.)
 3265 However, it is possible that they rely on the protection
 3266 we provide here.
 3267
 3268 Check this and take the lock. It is not prone to deadlocks.
 3269 Or take the noqueue qdisc; it is even simpler 8)
3270 */
3271 if (dev->flags & IFF_UP) {
3272 int cpu = smp_processor_id(); /* ok because BHs are off */
3273
David S. Millerc773e842008-07-08 23:13:53 -07003274 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003275
Eric Dumazet745e20f2010-09-29 13:23:09 -07003276 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3277 goto recursion_alert;
3278
Jesper Dangaard Brouer1f595332014-09-03 17:56:09 +02003279 skb = validate_xmit_skb(skb, dev);
3280 if (!skb)
3281 goto drop;
3282
David S. Millerc773e842008-07-08 23:13:53 -07003283 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003284
Tom Herbert734664982011-11-28 16:32:44 +00003285 if (!netif_xmit_stopped(txq)) {
Eric Dumazet745e20f2010-09-29 13:23:09 -07003286 __this_cpu_inc(xmit_recursion);
David S. Millerce937182014-08-30 19:22:20 -07003287 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
Eric Dumazet745e20f2010-09-29 13:23:09 -07003288 __this_cpu_dec(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00003289 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07003290 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003291 goto out;
3292 }
3293 }
David S. Millerc773e842008-07-08 23:13:53 -07003294 HARD_TX_UNLOCK(dev, txq);
Joe Perchese87cc472012-05-13 21:56:26 +00003295 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3296 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003297 } else {
3298 /* Recursion is detected! It is possible,
Eric Dumazet745e20f2010-09-29 13:23:09 -07003299 * unfortunately
3300 */
3301recursion_alert:
Joe Perchese87cc472012-05-13 21:56:26 +00003302 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3303 dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003304 }
3305 }
3306
3307 rc = -ENETDOWN;
Jesper Dangaard Brouer1f595332014-09-03 17:56:09 +02003308drop:
Herbert Xud4828d82006-06-22 02:28:18 -07003309 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003310
Eric Dumazet015f0682014-03-27 08:45:56 -07003311 atomic_long_inc(&dev->tx_dropped);
Jesper Dangaard Brouer1f595332014-09-03 17:56:09 +02003312 kfree_skb_list(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003313 return rc;
3314out:
Herbert Xud4828d82006-06-22 02:28:18 -07003315 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003316 return rc;
3317}
Jason Wangf663dd92014-01-10 16:18:26 +08003318
Eric W. Biederman2b4aa3c2015-09-15 20:04:07 -05003319int dev_queue_xmit(struct sk_buff *skb)
Jason Wangf663dd92014-01-10 16:18:26 +08003320{
3321 return __dev_queue_xmit(skb, NULL);
3322}
Eric W. Biederman2b4aa3c2015-09-15 20:04:07 -05003323EXPORT_SYMBOL(dev_queue_xmit);
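/*
 * Illustrative sketch (hypothetical caller): a kernel user that builds its
 * own frames sets skb->dev and the headers before handing the buffer to
 * dev_queue_xmit().  Remember that the skb is consumed on both success and
 * failure, and that positive NET_XMIT_* codes may be returned:
 *
 *	skb->dev = dev;
 *	skb->priority = TC_PRIO_CONTROL;
 *	skb_reset_network_header(skb);
 *	... fill in protocol headers, then the link-layer header ...
 *
 *	ret = dev_queue_xmit(skb);
 *	if (ret != NET_XMIT_SUCCESS)
 *		... count the drop; do not touch skb again ...
 */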
Linus Torvalds1da177e2005-04-16 15:20:36 -07003324
Jason Wangf663dd92014-01-10 16:18:26 +08003325int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3326{
3327 return __dev_queue_xmit(skb, accel_priv);
3328}
3329EXPORT_SYMBOL(dev_queue_xmit_accel);
3330
Linus Torvalds1da177e2005-04-16 15:20:36 -07003331
3332/*=======================================================================
3333 Receiver routines
3334 =======================================================================*/
3335
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07003336int netdev_max_backlog __read_mostly = 1000;
Eric Dumazetc9e6bc62012-09-27 19:29:05 +00003337EXPORT_SYMBOL(netdev_max_backlog);
3338
Eric Dumazet3b098e22010-05-15 23:57:10 -07003339int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07003340int netdev_budget __read_mostly = 300;
3341int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003342
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003343/* Called with irq disabled */
3344static inline void ____napi_schedule(struct softnet_data *sd,
3345 struct napi_struct *napi)
3346{
3347 list_add_tail(&napi->poll_list, &sd->poll_list);
3348 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3349}
3350
Eric Dumazetdf334542010-03-24 19:13:54 +00003351#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07003352
3353/* One global table that all flow-based protocols share. */
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00003354struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07003355EXPORT_SYMBOL(rps_sock_flow_table);
Eric Dumazet567e4b72015-02-06 12:59:01 -08003356u32 rps_cpu_mask __read_mostly;
3357EXPORT_SYMBOL(rps_cpu_mask);
Tom Herbertfec5e652010-04-16 16:01:27 -07003358
Ingo Molnarc5905af2012-02-24 08:31:31 +01003359struct static_key rps_needed __read_mostly;
Eric Dumazetadc93002011-11-17 03:13:26 +00003360
Ben Hutchingsc4454772011-01-19 11:03:53 +00003361static struct rps_dev_flow *
3362set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3363 struct rps_dev_flow *rflow, u16 next_cpu)
3364{
Eric Dumazeta31196b2015-04-25 09:35:24 -07003365 if (next_cpu < nr_cpu_ids) {
Ben Hutchingsc4454772011-01-19 11:03:53 +00003366#ifdef CONFIG_RFS_ACCEL
3367 struct netdev_rx_queue *rxqueue;
3368 struct rps_dev_flow_table *flow_table;
3369 struct rps_dev_flow *old_rflow;
3370 u32 flow_id;
3371 u16 rxq_index;
3372 int rc;
3373
3374 /* Should we steer this flow to a different hardware queue? */
Ben Hutchings69a19ee2011-02-15 20:32:04 +00003375 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3376 !(dev->features & NETIF_F_NTUPLE))
Ben Hutchingsc4454772011-01-19 11:03:53 +00003377 goto out;
3378 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3379 if (rxq_index == skb_get_rx_queue(skb))
3380 goto out;
3381
3382 rxqueue = dev->_rx + rxq_index;
3383 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3384 if (!flow_table)
3385 goto out;
Tom Herbert61b905d2014-03-24 15:34:47 -07003386 flow_id = skb_get_hash(skb) & flow_table->mask;
Ben Hutchingsc4454772011-01-19 11:03:53 +00003387 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3388 rxq_index, flow_id);
3389 if (rc < 0)
3390 goto out;
3391 old_rflow = rflow;
3392 rflow = &flow_table->flows[flow_id];
Ben Hutchingsc4454772011-01-19 11:03:53 +00003393 rflow->filter = rc;
3394 if (old_rflow->filter == rflow->filter)
3395 old_rflow->filter = RPS_NO_FILTER;
3396 out:
3397#endif
3398 rflow->last_qtail =
Ben Hutchings09994d12011-10-03 04:42:46 +00003399 per_cpu(softnet_data, next_cpu).input_queue_head;
Ben Hutchingsc4454772011-01-19 11:03:53 +00003400 }
3401
Ben Hutchings09994d12011-10-03 04:42:46 +00003402 rflow->cpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00003403 return rflow;
3404}
3405
Tom Herbert0a9627f2010-03-16 08:03:29 +00003406/*
3407 * get_rps_cpu is called from netif_receive_skb and returns the target
3408 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003409 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00003410 */
Tom Herbertfec5e652010-04-16 16:01:27 -07003411static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3412 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003413{
Eric Dumazet567e4b72015-02-06 12:59:01 -08003414 const struct rps_sock_flow_table *sock_flow_table;
3415 struct netdev_rx_queue *rxqueue = dev->_rx;
Tom Herbertfec5e652010-04-16 16:01:27 -07003416 struct rps_dev_flow_table *flow_table;
Eric Dumazet567e4b72015-02-06 12:59:01 -08003417 struct rps_map *map;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003418 int cpu = -1;
Eric Dumazet567e4b72015-02-06 12:59:01 -08003419 u32 tcpu;
Tom Herbert61b905d2014-03-24 15:34:47 -07003420 u32 hash;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003421
Tom Herbert0a9627f2010-03-16 08:03:29 +00003422 if (skb_rx_queue_recorded(skb)) {
3423 u16 index = skb_get_rx_queue(skb);
Eric Dumazet567e4b72015-02-06 12:59:01 -08003424
Ben Hutchings62fe0b42010-09-27 08:24:33 +00003425 if (unlikely(index >= dev->real_num_rx_queues)) {
3426 WARN_ONCE(dev->real_num_rx_queues > 1,
3427 "%s received packet on queue %u, but number "
3428 "of RX queues is %u\n",
3429 dev->name, index, dev->real_num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003430 goto done;
3431 }
Eric Dumazet567e4b72015-02-06 12:59:01 -08003432 rxqueue += index;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003433 }
3434
Eric Dumazet567e4b72015-02-06 12:59:01 -08003435 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3436
3437 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3438 map = rcu_dereference(rxqueue->rps_map);
3439 if (!flow_table && !map)
3440 goto done;
3441
Changli Gao2d47b452010-08-17 19:00:56 +00003442 skb_reset_network_header(skb);
Tom Herbert61b905d2014-03-24 15:34:47 -07003443 hash = skb_get_hash(skb);
3444 if (!hash)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003445 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003446
Tom Herbertfec5e652010-04-16 16:01:27 -07003447 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3448 if (flow_table && sock_flow_table) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003449 struct rps_dev_flow *rflow;
Eric Dumazet567e4b72015-02-06 12:59:01 -08003450 u32 next_cpu;
3451 u32 ident;
Tom Herbertfec5e652010-04-16 16:01:27 -07003452
Eric Dumazet567e4b72015-02-06 12:59:01 -08003453 /* First check the global flow table for a match */
3454 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3455 if ((ident ^ hash) & ~rps_cpu_mask)
3456 goto try_rps;
3457
3458 next_cpu = ident & rps_cpu_mask;
3459
3460 /* OK, now we know there is a match,
3461 * we can look at the local (per receive queue) flow table
3462 */
Tom Herbert61b905d2014-03-24 15:34:47 -07003463 rflow = &flow_table->flows[hash & flow_table->mask];
Tom Herbertfec5e652010-04-16 16:01:27 -07003464 tcpu = rflow->cpu;
3465
Tom Herbertfec5e652010-04-16 16:01:27 -07003466 /*
3467 * If the desired CPU (where last recvmsg was done) is
3468 * different from current CPU (one in the rx-queue flow
3469 * table entry), switch if one of the following holds:
Eric Dumazeta31196b2015-04-25 09:35:24 -07003470 * - Current CPU is unset (>= nr_cpu_ids).
Tom Herbertfec5e652010-04-16 16:01:27 -07003471 * - Current CPU is offline.
3472 * - The current CPU's queue tail has advanced beyond the
3473 * last packet that was enqueued using this table entry.
3474 * This guarantees that all previous packets for the flow
3475 * have been dequeued, thus preserving in order delivery.
3476 */
3477 if (unlikely(tcpu != next_cpu) &&
Eric Dumazeta31196b2015-04-25 09:35:24 -07003478 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
Tom Herbertfec5e652010-04-16 16:01:27 -07003479 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
Tom Herbertbaefa312012-11-16 09:04:15 +00003480 rflow->last_qtail)) >= 0)) {
3481 tcpu = next_cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00003482 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
Tom Herbertbaefa312012-11-16 09:04:15 +00003483 }
Ben Hutchingsc4454772011-01-19 11:03:53 +00003484
Eric Dumazeta31196b2015-04-25 09:35:24 -07003485 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003486 *rflowp = rflow;
3487 cpu = tcpu;
3488 goto done;
3489 }
3490 }
3491
Eric Dumazet567e4b72015-02-06 12:59:01 -08003492try_rps:
3493
Tom Herbert0a9627f2010-03-16 08:03:29 +00003494 if (map) {
Daniel Borkmann8fc54f62014-08-23 20:58:54 +02003495 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
Tom Herbert0a9627f2010-03-16 08:03:29 +00003496 if (cpu_online(tcpu)) {
3497 cpu = tcpu;
3498 goto done;
3499 }
3500 }
3501
3502done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00003503 return cpu;
3504}
3505
Ben Hutchingsc4454772011-01-19 11:03:53 +00003506#ifdef CONFIG_RFS_ACCEL
3507
3508/**
3509 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3510 * @dev: Device on which the filter was set
3511 * @rxq_index: RX queue index
3512 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3513 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3514 *
3515 * Drivers that implement ndo_rx_flow_steer() should periodically call
3516 * this function for each installed filter and remove the filters for
3517 * which it returns %true.
3518 */
3519bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3520 u32 flow_id, u16 filter_id)
3521{
3522 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3523 struct rps_dev_flow_table *flow_table;
3524 struct rps_dev_flow *rflow;
3525 bool expire = true;
Eric Dumazeta31196b2015-04-25 09:35:24 -07003526 unsigned int cpu;
Ben Hutchingsc4454772011-01-19 11:03:53 +00003527
3528 rcu_read_lock();
3529 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3530 if (flow_table && flow_id <= flow_table->mask) {
3531 rflow = &flow_table->flows[flow_id];
3532 cpu = ACCESS_ONCE(rflow->cpu);
Eric Dumazeta31196b2015-04-25 09:35:24 -07003533 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
Ben Hutchingsc4454772011-01-19 11:03:53 +00003534 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3535 rflow->last_qtail) <
3536 (int)(10 * flow_table->mask)))
3537 expire = false;
3538 }
3539 rcu_read_unlock();
3540 return expire;
3541}
3542EXPORT_SYMBOL(rps_may_expire_flow);
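/*
 * Illustrative sketch (hypothetical driver code): a driver implementing
 * ndo_rx_flow_steer() would periodically scan its installed filters and ask
 * the stack whether each one may be removed, e.g. from a workqueue:
 *
 *	for (id = 0; id < example_nfilters; id++) {
 *		struct example_filter *f = &example_filters[id];
 *
 *		if (!f->installed)
 *			continue;
 *		if (rps_may_expire_flow(netdev, f->rxq_index,
 *					f->flow_id, id))
 *			example_remove_hw_filter(f);
 *	}
 */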
3543
3544#endif /* CONFIG_RFS_ACCEL */
3545
Tom Herbert0a9627f2010-03-16 08:03:29 +00003546/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003547static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003548{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003549 struct softnet_data *sd = data;
3550
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003551 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00003552 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003553}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003554
Tom Herbertfec5e652010-04-16 16:01:27 -07003555#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00003556
3557/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003558 * Check if this softnet_data structure belongs to another CPU
3559 * If yes, queue it to our IPI list and return 1
3560 * If no, return 0
3561 */
3562static int rps_ipi_queued(struct softnet_data *sd)
3563{
3564#ifdef CONFIG_RPS
Christoph Lameter903ceff2014-08-17 12:30:35 -05003565 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003566
3567 if (sd != mysd) {
3568 sd->rps_ipi_next = mysd->rps_ipi_list;
3569 mysd->rps_ipi_list = sd;
3570
3571 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3572 return 1;
3573 }
3574#endif /* CONFIG_RPS */
3575 return 0;
3576}
3577
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003578#ifdef CONFIG_NET_FLOW_LIMIT
3579int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3580#endif
3581
3582static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3583{
3584#ifdef CONFIG_NET_FLOW_LIMIT
3585 struct sd_flow_limit *fl;
3586 struct softnet_data *sd;
3587 unsigned int old_flow, new_flow;
3588
3589 if (qlen < (netdev_max_backlog >> 1))
3590 return false;
3591
Christoph Lameter903ceff2014-08-17 12:30:35 -05003592 sd = this_cpu_ptr(&softnet_data);
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003593
3594 rcu_read_lock();
3595 fl = rcu_dereference(sd->flow_limit);
3596 if (fl) {
Tom Herbert3958afa1b2013-12-15 22:12:06 -08003597 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003598 old_flow = fl->history[fl->history_head];
3599 fl->history[fl->history_head] = new_flow;
3600
3601 fl->history_head++;
3602 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3603
3604 if (likely(fl->buckets[old_flow]))
3605 fl->buckets[old_flow]--;
3606
3607 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3608 fl->count++;
3609 rcu_read_unlock();
3610 return true;
3611 }
3612 }
3613 rcu_read_unlock();
3614#endif
3615 return false;
3616}
3617
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003618/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00003619 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3620 * queue (may be a remote CPU queue).
3621 */
Tom Herbertfec5e652010-04-16 16:01:27 -07003622static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3623 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00003624{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003625 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003626 unsigned long flags;
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003627 unsigned int qlen;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003628
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003629 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003630
3631 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003632
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003633 rps_lock(sd);
Julian Anastasove9e4dd32015-07-09 09:59:09 +03003634 if (!netif_running(skb->dev))
3635 goto drop;
Willem de Bruijn99bbc702013-05-20 04:02:32 +00003636 qlen = skb_queue_len(&sd->input_pkt_queue);
3637 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
Li RongQinge008f3f2014-12-08 09:42:55 +08003638 if (qlen) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00003639enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003640 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003641 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003642 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00003643 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003644 return NET_RX_SUCCESS;
3645 }
3646
Eric Dumazetebda37c22010-05-06 23:51:21 +00003647 /* Schedule NAPI for backlog device
 3648 * We can use a non-atomic operation since we own the queue lock
3649 */
3650 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003651 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003652 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003653 }
3654 goto enqueue;
3655 }
3656
Julian Anastasove9e4dd32015-07-09 09:59:09 +03003657drop:
Changli Gaodee42872010-05-02 05:42:16 +00003658 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003659 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003660
Tom Herbert0a9627f2010-03-16 08:03:29 +00003661 local_irq_restore(flags);
3662
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003663 atomic_long_inc(&skb->dev->rx_dropped);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003664 kfree_skb(skb);
3665 return NET_RX_DROP;
3666}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003667
Ben Hutchingsae78dbf2014-01-10 22:17:24 +00003668static int netif_rx_internal(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003669{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003670 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003671
Eric Dumazet588f0332011-11-15 04:12:55 +00003672 net_timestamp_check(netdev_tstamp_prequeue, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003673
Koki Sanagicf66ba52010-08-23 18:45:02 +09003674 trace_netif_rx(skb);
Eric Dumazetdf334542010-03-24 19:13:54 +00003675#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01003676 if (static_key_false(&rps_needed)) {
Tom Herbertfec5e652010-04-16 16:01:27 -07003677 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003678 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003679
Changli Gaocece1942010-08-07 20:35:43 -07003680 preempt_disable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003681 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07003682
3683 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003684 if (cpu < 0)
3685 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07003686
3687 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3688
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003689 rcu_read_unlock();
Changli Gaocece1942010-08-07 20:35:43 -07003690 preempt_enable();
Eric Dumazetadc93002011-11-17 03:13:26 +00003691 } else
3692#endif
Tom Herbertfec5e652010-04-16 16:01:27 -07003693 {
3694 unsigned int qtail;
3695 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3696 put_cpu();
3697 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07003698 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003699}
Ben Hutchingsae78dbf2014-01-10 22:17:24 +00003700
3701/**
3702 * netif_rx - post buffer to the network code
3703 * @skb: buffer to post
3704 *
3705 * This function receives a packet from a device driver and queues it for
3706 * the upper (protocol) levels to process. It always succeeds. The buffer
3707 * may be dropped during processing for congestion control or by the
3708 * protocol layers.
3709 *
3710 * return values:
3711 * NET_RX_SUCCESS (no congestion)
3712 * NET_RX_DROP (packet was dropped)
3713 *
3714 */
3715
3716int netif_rx(struct sk_buff *skb)
3717{
3718 trace_netif_rx_entry(skb);
3719
3720 return netif_rx_internal(skb);
3721}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003722EXPORT_SYMBOL(netif_rx);
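/*
 * Illustrative sketch (hypothetical, non-NAPI driver receive path; rx_buf
 * and pkt_len are made-up names): copy the frame out of the hardware, set
 * the protocol with eth_type_trans() and hand the buffer to the stack:
 *
 *	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	memcpy(skb_put(skb, pkt_len), rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */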
Linus Torvalds1da177e2005-04-16 15:20:36 -07003723
3724int netif_rx_ni(struct sk_buff *skb)
3725{
3726 int err;
3727
Ben Hutchingsae78dbf2014-01-10 22:17:24 +00003728 trace_netif_rx_ni_entry(skb);
3729
Linus Torvalds1da177e2005-04-16 15:20:36 -07003730 preempt_disable();
Ben Hutchingsae78dbf2014-01-10 22:17:24 +00003731 err = netif_rx_internal(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003732 if (local_softirq_pending())
3733 do_softirq();
3734 preempt_enable();
3735
3736 return err;
3737}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003738EXPORT_SYMBOL(netif_rx_ni);
3739
Linus Torvalds1da177e2005-04-16 15:20:36 -07003740static void net_tx_action(struct softirq_action *h)
3741{
Christoph Lameter903ceff2014-08-17 12:30:35 -05003742 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003743
3744 if (sd->completion_queue) {
3745 struct sk_buff *clist;
3746
3747 local_irq_disable();
3748 clist = sd->completion_queue;
3749 sd->completion_queue = NULL;
3750 local_irq_enable();
3751
3752 while (clist) {
3753 struct sk_buff *skb = clist;
3754 clist = clist->next;
3755
Ilpo Järvinen547b7922008-07-25 21:43:18 -07003756 WARN_ON(atomic_read(&skb->users));
Eric Dumazete6247022013-12-05 04:45:08 -08003757 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3758 trace_consume_skb(skb);
3759 else
3760 trace_kfree_skb(skb, net_tx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003761 __kfree_skb(skb);
3762 }
3763 }
3764
3765 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07003766 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003767
3768 local_irq_disable();
3769 head = sd->output_queue;
3770 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00003771 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003772 local_irq_enable();
3773
3774 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07003775 struct Qdisc *q = head;
3776 spinlock_t *root_lock;
3777
Linus Torvalds1da177e2005-04-16 15:20:36 -07003778 head = head->next_sched;
3779
David S. Miller5fb66222008-08-02 20:02:43 -07003780 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07003781 if (spin_trylock(root_lock)) {
Peter Zijlstra4e857c52014-03-17 18:06:10 +01003782 smp_mb__before_atomic();
Jarek Poplawskidef82a12008-08-17 21:54:43 -07003783 clear_bit(__QDISC_STATE_SCHED,
3784 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07003785 qdisc_run(q);
3786 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003787 } else {
David S. Miller195648b2008-08-19 04:00:36 -07003788 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003789 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07003790 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003791 } else {
Peter Zijlstra4e857c52014-03-17 18:06:10 +01003792 smp_mb__before_atomic();
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07003793 clear_bit(__QDISC_STATE_SCHED,
3794 &q->state);
3795 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003796 }
3797 }
3798 }
3799}
3800
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003801#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3802 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
Michał Mirosławda678292009-06-05 05:35:28 +00003803/* This hook is defined here for ATM LANE */
3804int (*br_fdb_test_addr_hook)(struct net_device *dev,
3805 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07003806EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00003807#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003808
Herbert Xuf697c3e2007-10-14 00:38:47 -07003809static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3810 struct packet_type **pt_prev,
3811 int *ret, struct net_device *orig_dev)
3812{
Daniel Borkmanne7582ba2015-05-19 22:33:25 +02003813#ifdef CONFIG_NET_CLS_ACT
Daniel Borkmannd2788d32015-05-09 22:51:32 +02003814 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3815 struct tcf_result cl_res;
Eric Dumazet24824a02010-10-02 06:11:55 +00003816
Daniel Borkmannc9e99fd2015-05-09 22:51:31 +02003817 /* If there's at least one ingress present somewhere (so
 3818 * we get here via the enabled static key), remaining devices
3819 * that are not configured with an ingress qdisc will bail
Daniel Borkmannd2788d32015-05-09 22:51:32 +02003820 * out here.
Daniel Borkmannc9e99fd2015-05-09 22:51:31 +02003821 */
Daniel Borkmannd2788d32015-05-09 22:51:32 +02003822 if (!cl)
Daniel Borkmann45771392015-04-10 23:07:54 +02003823 return skb;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003824 if (*pt_prev) {
3825 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3826 *pt_prev = NULL;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003827 }
3828
Florian Westphal33654952015-05-14 00:36:28 +02003829 qdisc_skb_cb(skb)->pkt_len = skb->len;
Daniel Borkmannc9e99fd2015-05-09 22:51:31 +02003830 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
Eric Dumazet24ea5912015-07-06 05:18:03 -07003831 qdisc_bstats_cpu_update(cl->q, skb);
Daniel Borkmannc9e99fd2015-05-09 22:51:31 +02003832
Daniel Borkmann3b3ae882015-08-26 23:00:06 +02003833 switch (tc_classify(skb, cl, &cl_res, false)) {
Daniel Borkmannd2788d32015-05-09 22:51:32 +02003834 case TC_ACT_OK:
3835 case TC_ACT_RECLASSIFY:
3836 skb->tc_index = TC_H_MIN(cl_res.classid);
3837 break;
3838 case TC_ACT_SHOT:
Eric Dumazet24ea5912015-07-06 05:18:03 -07003839 qdisc_qstats_cpu_drop(cl->q);
Daniel Borkmannd2788d32015-05-09 22:51:32 +02003840 case TC_ACT_STOLEN:
3841 case TC_ACT_QUEUED:
3842 kfree_skb(skb);
3843 return NULL;
Alexei Starovoitov27b29f62015-09-15 23:05:43 -07003844 case TC_ACT_REDIRECT:
3845 /* skb_mac_header check was done by cls/act_bpf, so
3846 * we can safely push the L2 header back before
3847 * redirecting to another netdev
3848 */
3849 __skb_push(skb, skb->mac_len);
3850 skb_do_redirect(skb);
3851 return NULL;
Daniel Borkmannd2788d32015-05-09 22:51:32 +02003852 default:
3853 break;
Herbert Xuf697c3e2007-10-14 00:38:47 -07003854 }
Daniel Borkmanne7582ba2015-05-19 22:33:25 +02003855#endif /* CONFIG_NET_CLS_ACT */
Herbert Xuf697c3e2007-10-14 00:38:47 -07003856 return skb;
3857}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003858
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003859/**
3860 * netdev_rx_handler_register - register receive handler
3861 * @dev: device to register a handler for
3862 * @rx_handler: receive handler to register
Jiri Pirko93e2c322010-06-10 03:34:59 +00003863 * @rx_handler_data: data pointer that is used by rx handler
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003864 *
Masanari Iidae2278672014-02-18 22:54:36 +09003865 * Register a receive handler for a device. This handler will then be
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003866 * called from __netif_receive_skb. A negative errno code is returned
3867 * on a failure.
3868 *
3869 * The caller must hold the rtnl_mutex.
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003870 *
3871 * For a general description of rx_handler, see enum rx_handler_result.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003872 */
3873int netdev_rx_handler_register(struct net_device *dev,
Jiri Pirko93e2c322010-06-10 03:34:59 +00003874 rx_handler_func_t *rx_handler,
3875 void *rx_handler_data)
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003876{
3877 ASSERT_RTNL();
3878
3879 if (dev->rx_handler)
3880 return -EBUSY;
3881
Eric Dumazet00cfec32013-03-29 03:01:22 +00003882 /* Note: rx_handler_data must be set before rx_handler */
Jiri Pirko93e2c322010-06-10 03:34:59 +00003883 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003884 rcu_assign_pointer(dev->rx_handler, rx_handler);
3885
3886 return 0;
3887}
3888EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
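/* Illustrative sketch (not part of the original file): how a master-type
 * driver such as a bridge, bonding or team port might use this API.  The
 * handler name, the "myport" private structure and the myport_wants()/
 * myport_enqueue()/myport_attach() helpers are assumptions made only to
 * show the calling convention; the real users live in their own drivers.
 *
 *	static rx_handler_result_t myport_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct myport *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		if (!port || !myport_wants(port, skb))
 *			return RX_HANDLER_PASS;
 *		myport_enqueue(port, skb);
 *		return RX_HANDLER_CONSUMED;
 *	}
 *
 *	static int myport_attach(struct net_device *slave, struct myport *port)
 *	{
 *		ASSERT_RTNL();
 *		return netdev_rx_handler_register(slave, myport_handle_frame, port);
 *	}
 *
 * Teardown is the mirror image: netdev_rx_handler_unregister(slave) under
 * RTNL (see below), which also guarantees readers no longer see the handler
 * once a grace period has elapsed.
 */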
3889
3890/**
3891 * netdev_rx_handler_unregister - unregister receive handler
3892 * @dev: device to unregister a handler from
3893 *
Kusanagi Kouichi166ec362013-03-18 02:59:52 +00003894 * Unregister a receive handler from a device.
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003895 *
3896 * The caller must hold the rtnl_mutex.
3897 */
3898void netdev_rx_handler_unregister(struct net_device *dev)
3899{
3900
3901 ASSERT_RTNL();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003902 RCU_INIT_POINTER(dev->rx_handler, NULL);
Eric Dumazet00cfec32013-03-29 03:01:22 +00003903	/* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3904	 * section is guaranteed to see a non-NULL rx_handler_data
3905 * as well.
3906 */
3907 synchronize_net();
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00003908 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003909}
3910EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3911
Mel Gormanb4b9e352012-07-31 16:44:26 -07003912/*
3913 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3914 * the special handling of PFMEMALLOC skbs.
3915 */
3916static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3917{
3918 switch (skb->protocol) {
Joe Perches2b8837a2014-03-12 10:04:17 -07003919 case htons(ETH_P_ARP):
3920 case htons(ETH_P_IP):
3921 case htons(ETH_P_IPV6):
3922 case htons(ETH_P_8021Q):
3923 case htons(ETH_P_8021AD):
Mel Gormanb4b9e352012-07-31 16:44:26 -07003924 return true;
3925 default:
3926 return false;
3927 }
3928}
3929
Pablo Neirae687ad62015-05-13 18:19:38 +02003930static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3931 int *ret, struct net_device *orig_dev)
3932{
Daniel Borkmanne7582ba2015-05-19 22:33:25 +02003933#ifdef CONFIG_NETFILTER_INGRESS
Pablo Neirae687ad62015-05-13 18:19:38 +02003934 if (nf_hook_ingress_active(skb)) {
3935 if (*pt_prev) {
3936 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3937 *pt_prev = NULL;
3938 }
3939
3940 return nf_hook_ingress(skb);
3941 }
Daniel Borkmanne7582ba2015-05-19 22:33:25 +02003942#endif /* CONFIG_NETFILTER_INGRESS */
Pablo Neirae687ad62015-05-13 18:19:38 +02003943 return 0;
3944}
Pablo Neirae687ad62015-05-13 18:19:38 +02003945
David S. Miller9754e292013-02-14 15:57:38 -05003946static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003947{
3948 struct packet_type *ptype, *pt_prev;
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003949 rx_handler_func_t *rx_handler;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003950 struct net_device *orig_dev;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00003951 bool deliver_exact = false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003952 int ret = NET_RX_DROP;
Al Viro252e3342006-11-14 20:48:11 -08003953 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003954
Eric Dumazet588f0332011-11-15 04:12:55 +00003955 net_timestamp_check(!netdev_tstamp_prequeue, skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07003956
Koki Sanagicf66ba52010-08-23 18:45:02 +09003957 trace_netif_receive_skb(skb);
Patrick McHardy9b22ea52008-11-04 14:49:57 -08003958
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07003959 orig_dev = skb->dev;
Jiri Pirko1765a572011-02-12 06:48:36 +00003960
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003961 skb_reset_network_header(skb);
Eric Dumazetfda55ec2013-01-07 09:28:21 +00003962 if (!skb_transport_header_was_set(skb))
3963 skb_reset_transport_header(skb);
Jiri Pirko0b5c9db2011-06-10 06:56:58 +00003964 skb_reset_mac_len(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003965
3966 pt_prev = NULL;
3967
David S. Miller63d8ea72011-02-28 10:48:59 -08003968another_round:
David S. Millerb6858172012-07-23 16:27:54 -07003969 skb->skb_iif = skb->dev->ifindex;
David S. Miller63d8ea72011-02-28 10:48:59 -08003970
3971 __this_cpu_inc(softnet_data.processed);
3972
Patrick McHardy8ad227f2013-04-19 02:04:31 +00003973 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3974 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
Vlad Yasevich0d5501c2014-08-08 14:42:13 -04003975 skb = skb_vlan_untag(skb);
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003976 if (unlikely(!skb))
Julian Anastasov2c17d272015-07-09 09:59:10 +03003977 goto out;
Jiri Pirkobcc6d472011-04-07 19:48:33 +00003978 }
3979
Linus Torvalds1da177e2005-04-16 15:20:36 -07003980#ifdef CONFIG_NET_CLS_ACT
3981 if (skb->tc_verd & TC_NCLS) {
3982 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3983 goto ncls;
3984 }
3985#endif
3986
David S. Miller9754e292013-02-14 15:57:38 -05003987 if (pfmemalloc)
Mel Gormanb4b9e352012-07-31 16:44:26 -07003988 goto skip_taps;
3989
Linus Torvalds1da177e2005-04-16 15:20:36 -07003990 list_for_each_entry_rcu(ptype, &ptype_all, list) {
Salam Noureddine7866a622015-01-27 11:35:48 -08003991 if (pt_prev)
3992 ret = deliver_skb(skb, pt_prev, orig_dev);
3993 pt_prev = ptype;
3994 }
3995
3996 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3997 if (pt_prev)
3998 ret = deliver_skb(skb, pt_prev, orig_dev);
3999 pt_prev = ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004000 }
4001
Mel Gormanb4b9e352012-07-31 16:44:26 -07004002skip_taps:
Pablo Neira1cf519002015-05-13 18:19:37 +02004003#ifdef CONFIG_NET_INGRESS
Daniel Borkmann45771392015-04-10 23:07:54 +02004004 if (static_key_false(&ingress_needed)) {
4005 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
4006 if (!skb)
Julian Anastasov2c17d272015-07-09 09:59:10 +03004007 goto out;
Pablo Neirae687ad62015-05-13 18:19:38 +02004008
4009 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
Julian Anastasov2c17d272015-07-09 09:59:10 +03004010 goto out;
Daniel Borkmann45771392015-04-10 23:07:54 +02004011 }
Pablo Neira1cf519002015-05-13 18:19:37 +02004012#endif
4013#ifdef CONFIG_NET_CLS_ACT
Daniel Borkmann45771392015-04-10 23:07:54 +02004014 skb->tc_verd = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004015ncls:
4016#endif
David S. Miller9754e292013-02-14 15:57:38 -05004017 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
Mel Gormanb4b9e352012-07-31 16:44:26 -07004018 goto drop;
4019
Jiri Pirkodf8a39d2015-01-13 17:13:44 +01004020 if (skb_vlan_tag_present(skb)) {
John Fastabend24257172011-10-10 09:16:41 +00004021 if (pt_prev) {
4022 ret = deliver_skb(skb, pt_prev, orig_dev);
4023 pt_prev = NULL;
4024 }
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00004025 if (vlan_do_receive(&skb))
John Fastabend24257172011-10-10 09:16:41 +00004026 goto another_round;
4027 else if (unlikely(!skb))
Julian Anastasov2c17d272015-07-09 09:59:10 +03004028 goto out;
John Fastabend24257172011-10-10 09:16:41 +00004029 }
4030
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00004031 rx_handler = rcu_dereference(skb->dev->rx_handler);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00004032 if (rx_handler) {
4033 if (pt_prev) {
4034 ret = deliver_skb(skb, pt_prev, orig_dev);
4035 pt_prev = NULL;
4036 }
Jiri Pirko8a4eb572011-03-12 03:14:39 +00004037 switch (rx_handler(&skb)) {
4038 case RX_HANDLER_CONSUMED:
Cristian Bercaru3bc1b1a2013-03-08 07:03:38 +00004039 ret = NET_RX_SUCCESS;
Julian Anastasov2c17d272015-07-09 09:59:10 +03004040 goto out;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00004041 case RX_HANDLER_ANOTHER:
David S. Miller63d8ea72011-02-28 10:48:59 -08004042 goto another_round;
Jiri Pirko8a4eb572011-03-12 03:14:39 +00004043 case RX_HANDLER_EXACT:
4044 deliver_exact = true;
4045 case RX_HANDLER_PASS:
4046 break;
4047 default:
4048 BUG();
4049 }
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00004050 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004051
Jiri Pirkodf8a39d2015-01-13 17:13:44 +01004052 if (unlikely(skb_vlan_tag_present(skb))) {
4053 if (skb_vlan_tag_get_id(skb))
Eric Dumazetd4b812d2013-07-18 07:19:26 -07004054 skb->pkt_type = PACKET_OTHERHOST;
4055 /* Note: we might in the future use prio bits
4056 * and set skb->priority like in vlan_do_receive()
4057 * For the time being, just ignore Priority Code Point
4058 */
4059 skb->vlan_tci = 0;
4060 }
Florian Zumbiehl48cc32d32012-10-07 15:51:58 +00004061
Linus Torvalds1da177e2005-04-16 15:20:36 -07004062 type = skb->protocol;
Salam Noureddine7866a622015-01-27 11:35:48 -08004063
4064 /* deliver only exact match when indicated */
4065 if (likely(!deliver_exact)) {
4066 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4067 &ptype_base[ntohs(type) &
4068 PTYPE_HASH_MASK]);
4069 }
4070
4071 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4072 &orig_dev->ptype_specific);
4073
4074 if (unlikely(skb->dev != orig_dev)) {
4075 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4076 &skb->dev->ptype_specific);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004077 }
4078
4079 if (pt_prev) {
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00004080 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
Michael S. Tsirkin0e698bf2012-09-15 22:44:16 +00004081 goto drop;
Michael S. Tsirkin1080e512012-07-20 09:23:17 +00004082 else
4083 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004084 } else {
Mel Gormanb4b9e352012-07-31 16:44:26 -07004085drop:
Eric Dumazetcaf586e2010-09-30 21:06:55 +00004086 atomic_long_inc(&skb->dev->rx_dropped);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004087 kfree_skb(skb);
4088		/* Jamal, now you will not be able to escape explaining
4089		 * to me how you were going to use this. :-)
4090 */
4091 ret = NET_RX_DROP;
4092 }
4093
Julian Anastasov2c17d272015-07-09 09:59:10 +03004094out:
David S. Miller9754e292013-02-14 15:57:38 -05004095 return ret;
4096}
4097
4098static int __netif_receive_skb(struct sk_buff *skb)
4099{
4100 int ret;
4101
4102 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4103 unsigned long pflags = current->flags;
4104
4105 /*
4106 * PFMEMALLOC skbs are special, they should
4107 * - be delivered to SOCK_MEMALLOC sockets only
4108 * - stay away from userspace
4109 * - have bounded memory usage
4110 *
4111 * Use PF_MEMALLOC as this saves us from propagating the allocation
4112 * context down to all allocation sites.
4113 */
4114 current->flags |= PF_MEMALLOC;
4115 ret = __netif_receive_skb_core(skb, true);
4116 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4117 } else
4118 ret = __netif_receive_skb_core(skb, false);
4119
Linus Torvalds1da177e2005-04-16 15:20:36 -07004120 return ret;
4121}
Tom Herbert0a9627f2010-03-16 08:03:29 +00004122
Ben Hutchingsae78dbf2014-01-10 22:17:24 +00004123static int netif_receive_skb_internal(struct sk_buff *skb)
Tom Herbert0a9627f2010-03-16 08:03:29 +00004124{
Julian Anastasov2c17d272015-07-09 09:59:10 +03004125 int ret;
4126
Eric Dumazet588f0332011-11-15 04:12:55 +00004127 net_timestamp_check(netdev_tstamp_prequeue, skb);
Eric Dumazet3b098e22010-05-15 23:57:10 -07004128
Richard Cochranc1f19b52010-07-17 08:49:36 +00004129 if (skb_defer_rx_timestamp(skb))
4130 return NET_RX_SUCCESS;
4131
Julian Anastasov2c17d272015-07-09 09:59:10 +03004132 rcu_read_lock();
4133
Eric Dumazetdf334542010-03-24 19:13:54 +00004134#ifdef CONFIG_RPS
Ingo Molnarc5905af2012-02-24 08:31:31 +01004135 if (static_key_false(&rps_needed)) {
Eric Dumazet3b098e22010-05-15 23:57:10 -07004136 struct rps_dev_flow voidflow, *rflow = &voidflow;
Julian Anastasov2c17d272015-07-09 09:59:10 +03004137 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07004138
Eric Dumazet3b098e22010-05-15 23:57:10 -07004139 if (cpu >= 0) {
4140 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4141 rcu_read_unlock();
Eric Dumazetadc93002011-11-17 03:13:26 +00004142 return ret;
Eric Dumazet3b098e22010-05-15 23:57:10 -07004143 }
Tom Herbertfec5e652010-04-16 16:01:27 -07004144 }
Tom Herbert1e94d722010-03-18 17:45:44 -07004145#endif
Julian Anastasov2c17d272015-07-09 09:59:10 +03004146 ret = __netif_receive_skb(skb);
4147 rcu_read_unlock();
4148 return ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00004149}
Ben Hutchingsae78dbf2014-01-10 22:17:24 +00004150
4151/**
4152 * netif_receive_skb - process receive buffer from network
4153 * @skb: buffer to process
4154 *
4155 * netif_receive_skb() is the main receive data processing function.
4156 * It always succeeds. The buffer may be dropped during processing
4157 * for congestion control or by the protocol layers.
4158 *
4159 * This function may only be called from softirq context and interrupts
4160 * should be enabled.
4161 *
4162 * Return values (usually ignored):
4163 * NET_RX_SUCCESS: no congestion
4164 * NET_RX_DROP: packet was dropped
4165 */
Eric W. Biederman04eb4482015-09-15 20:04:15 -05004166int netif_receive_skb(struct sk_buff *skb)
Ben Hutchingsae78dbf2014-01-10 22:17:24 +00004167{
4168 trace_netif_receive_skb_entry(skb);
4169
4170 return netif_receive_skb_internal(skb);
4171}
Eric W. Biederman04eb4482015-09-15 20:04:15 -05004172EXPORT_SYMBOL(netif_receive_skb);
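/* Illustrative sketch (not part of the original file): the classic way a
 * driver hands a received frame to the stack from softirq context (for
 * example from its NAPI poll loop).  mydev_rx_one() and its arguments are
 * assumed names used only for the example.
 *
 *	static void mydev_rx_one(struct net_device *dev, void *data, int len)
 *	{
 *		struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);
 *
 *		if (!skb) {
 *			dev->stats.rx_dropped++;
 *			return;
 *		}
 *		memcpy(skb_put(skb, len), data, len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_receive_skb(skb);
 *	}
 *
 * Drivers whose receive path runs in hard interrupt context use netif_rx()
 * instead, and NAPI drivers usually prefer napi_gro_receive() (see below).
 */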
Linus Torvalds1da177e2005-04-16 15:20:36 -07004173
Eric Dumazet88751272010-04-19 05:07:33 +00004174/* Network device is going away; flush any packets still pending.
4175 * Called with irqs disabled.
4176 */
Changli Gao152102c2010-03-30 20:16:22 +00004177static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07004178{
Changli Gao152102c2010-03-30 20:16:22 +00004179 struct net_device *dev = arg;
Christoph Lameter903ceff2014-08-17 12:30:35 -05004180 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07004181 struct sk_buff *skb, *tmp;
4182
Eric Dumazete36fa2f2010-04-19 21:17:14 +00004183 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07004184 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07004185 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00004186 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07004187 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004188 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07004189 }
Changli Gao6e7676c2010-04-27 15:07:33 -07004190 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00004191 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07004192
4193 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4194 if (skb->dev == dev) {
4195 __skb_unlink(skb, &sd->process_queue);
4196 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00004197 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07004198 }
4199 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07004200}
4201
Herbert Xud565b0a2008-12-15 23:38:52 -08004202static int napi_gro_complete(struct sk_buff *skb)
4203{
Vlad Yasevich22061d82012-11-15 08:49:11 +00004204 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08004205 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00004206 struct list_head *head = &offload_base;
Herbert Xud565b0a2008-12-15 23:38:52 -08004207 int err = -ENOENT;
4208
Eric Dumazetc3c7c252012-12-06 13:54:59 +00004209 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4210
Herbert Xufc59f9a2009-04-14 15:11:06 -07004211 if (NAPI_GRO_CB(skb)->count == 1) {
4212 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004213 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07004214 }
Herbert Xud565b0a2008-12-15 23:38:52 -08004215
4216 rcu_read_lock();
4217 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00004218 if (ptype->type != type || !ptype->callbacks.gro_complete)
Herbert Xud565b0a2008-12-15 23:38:52 -08004219 continue;
4220
Jerry Chu299603e82013-12-11 20:53:45 -08004221 err = ptype->callbacks.gro_complete(skb, 0);
Herbert Xud565b0a2008-12-15 23:38:52 -08004222 break;
4223 }
4224 rcu_read_unlock();
4225
4226 if (err) {
4227 WARN_ON(&ptype->list == head);
4228 kfree_skb(skb);
4229 return NET_RX_SUCCESS;
4230 }
4231
4232out:
Ben Hutchingsae78dbf2014-01-10 22:17:24 +00004233 return netif_receive_skb_internal(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08004234}
4235
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004236/* napi->gro_list contains packets ordered by age;
4237 * the youngest packets are at the head of it.
4238 * Complete skbs in reverse order to reduce latencies.
4239 */
4240void napi_gro_flush(struct napi_struct *napi, bool flush_old)
Herbert Xud565b0a2008-12-15 23:38:52 -08004241{
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004242 struct sk_buff *skb, *prev = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08004243
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004244 /* scan list and build reverse chain */
4245 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4246 skb->prev = prev;
4247 prev = skb;
Herbert Xud565b0a2008-12-15 23:38:52 -08004248 }
4249
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004250 for (skb = prev; skb; skb = prev) {
4251 skb->next = NULL;
4252
4253 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4254 return;
4255
4256 prev = skb->prev;
4257 napi_gro_complete(skb);
4258 napi->gro_count--;
4259 }
4260
Herbert Xud565b0a2008-12-15 23:38:52 -08004261 napi->gro_list = NULL;
4262}
Eric Dumazet86cac582010-08-31 18:25:32 +00004263EXPORT_SYMBOL(napi_gro_flush);
Herbert Xud565b0a2008-12-15 23:38:52 -08004264
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004265static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4266{
4267 struct sk_buff *p;
4268 unsigned int maclen = skb->dev->hard_header_len;
Tom Herbert0b4cec82014-01-15 08:58:06 -08004269 u32 hash = skb_get_hash_raw(skb);
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004270
4271 for (p = napi->gro_list; p; p = p->next) {
4272 unsigned long diffs;
4273
Tom Herbert0b4cec82014-01-15 08:58:06 -08004274 NAPI_GRO_CB(p)->flush = 0;
4275
4276 if (hash != skb_get_hash_raw(p)) {
4277 NAPI_GRO_CB(p)->same_flow = 0;
4278 continue;
4279 }
4280
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004281 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4282 diffs |= p->vlan_tci ^ skb->vlan_tci;
4283 if (maclen == ETH_HLEN)
4284 diffs |= compare_ether_header(skb_mac_header(p),
Eric Dumazeta50e2332014-03-29 21:28:21 -07004285 skb_mac_header(skb));
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004286 else if (!diffs)
4287 diffs = memcmp(skb_mac_header(p),
Eric Dumazeta50e2332014-03-29 21:28:21 -07004288 skb_mac_header(skb),
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004289 maclen);
4290 NAPI_GRO_CB(p)->same_flow = !diffs;
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004291 }
4292}
4293
Jerry Chu299603e82013-12-11 20:53:45 -08004294static void skb_gro_reset_offset(struct sk_buff *skb)
4295{
4296 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4297 const skb_frag_t *frag0 = &pinfo->frags[0];
4298
4299 NAPI_GRO_CB(skb)->data_offset = 0;
4300 NAPI_GRO_CB(skb)->frag0 = NULL;
4301 NAPI_GRO_CB(skb)->frag0_len = 0;
4302
4303 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4304 pinfo->nr_frags &&
4305 !PageHighMem(skb_frag_page(frag0))) {
4306 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4307 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
Herbert Xud565b0a2008-12-15 23:38:52 -08004308 }
4309}
4310
Eric Dumazeta50e2332014-03-29 21:28:21 -07004311static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4312{
4313 struct skb_shared_info *pinfo = skb_shinfo(skb);
4314
4315 BUG_ON(skb->end - skb->tail < grow);
4316
4317 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4318
4319 skb->data_len -= grow;
4320 skb->tail += grow;
4321
4322 pinfo->frags[0].page_offset += grow;
4323 skb_frag_size_sub(&pinfo->frags[0], grow);
4324
4325 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4326 skb_frag_unref(skb, 0);
4327 memmove(pinfo->frags, pinfo->frags + 1,
4328 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4329 }
4330}
4331
Rami Rosenbb728822012-11-28 21:55:25 +00004332static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08004333{
4334 struct sk_buff **pp = NULL;
Vlad Yasevich22061d82012-11-15 08:49:11 +00004335 struct packet_offload *ptype;
Herbert Xud565b0a2008-12-15 23:38:52 -08004336 __be16 type = skb->protocol;
Vlad Yasevich22061d82012-11-15 08:49:11 +00004337 struct list_head *head = &offload_base;
Herbert Xu0da2afd52008-12-26 14:57:42 -08004338 int same_flow;
Ben Hutchings5b252f02009-10-29 07:17:09 +00004339 enum gro_result ret;
Eric Dumazeta50e2332014-03-29 21:28:21 -07004340 int grow;
Herbert Xud565b0a2008-12-15 23:38:52 -08004341
Eric W. Biederman9c62a682014-03-14 20:51:52 -07004342 if (!(skb->dev->features & NETIF_F_GRO))
Herbert Xud565b0a2008-12-15 23:38:52 -08004343 goto normal;
4344
Tom Herbert5a212322014-08-31 15:12:41 -07004345 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
Herbert Xuf17f5c92009-01-14 14:36:12 -08004346 goto normal;
4347
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004348 gro_list_prepare(napi, skb);
4349
Herbert Xud565b0a2008-12-15 23:38:52 -08004350 rcu_read_lock();
4351 list_for_each_entry_rcu(ptype, head, list) {
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00004352 if (ptype->type != type || !ptype->callbacks.gro_receive)
Herbert Xud565b0a2008-12-15 23:38:52 -08004353 continue;
4354
Herbert Xu86911732009-01-29 14:19:50 +00004355 skb_set_network_header(skb, skb_gro_offset(skb));
Eric Dumazetefd94502013-02-14 17:31:48 +00004356 skb_reset_mac_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08004357 NAPI_GRO_CB(skb)->same_flow = 0;
4358 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08004359 NAPI_GRO_CB(skb)->free = 0;
Or Gerlitzb582ef02014-01-20 13:59:19 +02004360 NAPI_GRO_CB(skb)->udp_mark = 0;
Tom Herbert15e23962015-02-10 16:30:31 -08004361 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004362
Tom Herbert662880f2014-08-27 21:26:56 -07004363 /* Setup for GRO checksum validation */
4364 switch (skb->ip_summed) {
4365 case CHECKSUM_COMPLETE:
4366 NAPI_GRO_CB(skb)->csum = skb->csum;
4367 NAPI_GRO_CB(skb)->csum_valid = 1;
4368 NAPI_GRO_CB(skb)->csum_cnt = 0;
4369 break;
4370 case CHECKSUM_UNNECESSARY:
4371 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4372 NAPI_GRO_CB(skb)->csum_valid = 0;
4373 break;
4374 default:
4375 NAPI_GRO_CB(skb)->csum_cnt = 0;
4376 NAPI_GRO_CB(skb)->csum_valid = 0;
4377 }
Herbert Xud565b0a2008-12-15 23:38:52 -08004378
Vlad Yasevichf191a1d2012-11-15 08:49:23 +00004379 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08004380 break;
4381 }
4382 rcu_read_unlock();
4383
4384 if (&ptype->list == head)
4385 goto normal;
4386
Herbert Xu0da2afd52008-12-26 14:57:42 -08004387 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004388 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08004389
Herbert Xud565b0a2008-12-15 23:38:52 -08004390 if (pp) {
4391 struct sk_buff *nskb = *pp;
4392
4393 *pp = nskb->next;
4394 nskb->next = NULL;
4395 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00004396 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08004397 }
4398
Herbert Xu0da2afd52008-12-26 14:57:42 -08004399 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08004400 goto ok;
4401
Eric Dumazet600adc12014-01-09 14:12:19 -08004402 if (NAPI_GRO_CB(skb)->flush)
Herbert Xud565b0a2008-12-15 23:38:52 -08004403 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08004404
Eric Dumazet600adc12014-01-09 14:12:19 -08004405 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4406 struct sk_buff *nskb = napi->gro_list;
4407
4408 /* locate the end of the list to select the 'oldest' flow */
4409 while (nskb->next) {
4410 pp = &nskb->next;
4411 nskb = *pp;
4412 }
4413 *pp = NULL;
4414 nskb->next = NULL;
4415 napi_gro_complete(nskb);
4416 } else {
4417 napi->gro_count++;
4418 }
Herbert Xud565b0a2008-12-15 23:38:52 -08004419 NAPI_GRO_CB(skb)->count = 1;
Eric Dumazet2e71a6f2012-10-06 08:08:49 +00004420 NAPI_GRO_CB(skb)->age = jiffies;
Eric Dumazet29e98242014-05-16 11:34:37 -07004421 NAPI_GRO_CB(skb)->last = skb;
Herbert Xu86911732009-01-29 14:19:50 +00004422 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08004423 skb->next = napi->gro_list;
4424 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004425 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08004426
Herbert Xuad0f9902009-02-01 01:24:55 -08004427pull:
Eric Dumazeta50e2332014-03-29 21:28:21 -07004428 grow = skb_gro_offset(skb) - skb_headlen(skb);
4429 if (grow > 0)
4430 gro_pull_from_frag0(skb, grow);
Herbert Xud565b0a2008-12-15 23:38:52 -08004431ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004432 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08004433
4434normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08004435 ret = GRO_NORMAL;
4436 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08004437}
Herbert Xu96e93ea2009-01-06 10:49:34 -08004438
Jerry Chubf5a7552014-01-07 10:23:19 -08004439struct packet_offload *gro_find_receive_by_type(__be16 type)
4440{
4441 struct list_head *offload_head = &offload_base;
4442 struct packet_offload *ptype;
4443
4444 list_for_each_entry_rcu(ptype, offload_head, list) {
4445 if (ptype->type != type || !ptype->callbacks.gro_receive)
4446 continue;
4447 return ptype;
4448 }
4449 return NULL;
4450}
Or Gerlitze27a2f82014-01-20 13:59:20 +02004451EXPORT_SYMBOL(gro_find_receive_by_type);
Jerry Chubf5a7552014-01-07 10:23:19 -08004452
4453struct packet_offload *gro_find_complete_by_type(__be16 type)
4454{
4455 struct list_head *offload_head = &offload_base;
4456 struct packet_offload *ptype;
4457
4458 list_for_each_entry_rcu(ptype, offload_head, list) {
4459 if (ptype->type != type || !ptype->callbacks.gro_complete)
4460 continue;
4461 return ptype;
4462 }
4463 return NULL;
4464}
Or Gerlitze27a2f82014-01-20 13:59:20 +02004465EXPORT_SYMBOL(gro_find_complete_by_type);
Herbert Xu96e93ea2009-01-06 10:49:34 -08004466
Rami Rosenbb728822012-11-28 21:55:25 +00004467static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08004468{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004469 switch (ret) {
4470 case GRO_NORMAL:
Ben Hutchingsae78dbf2014-01-10 22:17:24 +00004471 if (netif_receive_skb_internal(skb))
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004472 ret = GRO_DROP;
4473 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08004474
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004475 case GRO_DROP:
Herbert Xu5d38a072009-01-04 16:13:40 -08004476 kfree_skb(skb);
4477 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00004478
Eric Dumazetdaa86542012-04-19 07:07:40 +00004479 case GRO_MERGED_FREE:
Eric Dumazetd7e88832012-04-30 08:10:34 +00004480 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4481 kmem_cache_free(skbuff_head_cache, skb);
4482 else
4483 __kfree_skb(skb);
Eric Dumazetdaa86542012-04-19 07:07:40 +00004484 break;
4485
Ben Hutchings5b252f02009-10-29 07:17:09 +00004486 case GRO_HELD:
4487 case GRO_MERGED:
4488 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08004489 }
4490
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004491 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004492}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004493
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004494gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004495{
Eric Dumazet93f93a42015-11-18 06:30:59 -08004496 skb_mark_napi_id(skb, napi);
Ben Hutchingsae78dbf2014-01-10 22:17:24 +00004497 trace_napi_gro_receive_entry(skb);
Herbert Xu86911732009-01-29 14:19:50 +00004498
Eric Dumazeta50e2332014-03-29 21:28:21 -07004499 skb_gro_reset_offset(skb);
4500
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004501 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08004502}
4503EXPORT_SYMBOL(napi_gro_receive);
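/* Illustrative sketch (not part of the original file): a GRO-aware NAPI
 * driver hands each completed skb to napi_gro_receive() from its poll
 * loop instead of calling netif_receive_skb() directly, so packets of the
 * same flow can be merged before they hit the stack.  "priv" and "netdev"
 * are assumed driver-private names.
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	napi_gro_receive(&priv->napi, skb);
 */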
4504
stephen hemmingerd0c2b0d2010-10-19 07:12:10 +00004505static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08004506{
Eric Dumazet93a35f52014-10-23 06:30:30 -07004507 if (unlikely(skb->pfmemalloc)) {
4508 consume_skb(skb);
4509 return;
4510 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08004511 __skb_pull(skb, skb_headlen(skb));
Eric Dumazet2a2a4592012-03-21 06:58:03 +00004512 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4513 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
Jesse Gross3701e512010-10-20 13:56:06 +00004514 skb->vlan_tci = 0;
Herbert Xu66c46d72011-01-29 20:44:54 -08004515 skb->dev = napi->dev;
Andy Gospodarek6d152e22011-02-02 14:53:25 -08004516 skb->skb_iif = 0;
Jerry Chuc3caf112014-07-14 15:54:46 -07004517 skb->encapsulation = 0;
4518 skb_shinfo(skb)->gso_type = 0;
Eric Dumazete33d0ba2014-04-03 09:28:10 -07004519 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
Herbert Xu96e93ea2009-01-06 10:49:34 -08004520
4521 napi->skb = skb;
4522}
Herbert Xu96e93ea2009-01-06 10:49:34 -08004523
Herbert Xu76620aa2009-04-16 02:02:07 -07004524struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08004525{
Herbert Xu5d38a072009-01-04 16:13:40 -08004526 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08004527
4528 if (!skb) {
Alexander Duyckfd11a832014-12-09 19:40:49 -08004529 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
Eric Dumazete2f9dc32015-11-19 12:11:23 -08004530 if (skb) {
4531 napi->skb = skb;
4532 skb_mark_napi_id(skb, napi);
4533 }
Herbert Xu5d38a072009-01-04 16:13:40 -08004534 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08004535 return skb;
4536}
Herbert Xu76620aa2009-04-16 02:02:07 -07004537EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08004538
Eric Dumazeta50e2332014-03-29 21:28:21 -07004539static gro_result_t napi_frags_finish(struct napi_struct *napi,
4540 struct sk_buff *skb,
4541 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004542{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004543 switch (ret) {
4544 case GRO_NORMAL:
Eric Dumazeta50e2332014-03-29 21:28:21 -07004545 case GRO_HELD:
4546 __skb_push(skb, ETH_HLEN);
4547 skb->protocol = eth_type_trans(skb, skb->dev);
4548 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004549 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00004550 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004551
4552 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004553 case GRO_MERGED_FREE:
4554 napi_reuse_skb(napi, skb);
4555 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00004556
4557 case GRO_MERGED:
4558 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004559 }
4560
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004561 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004562}
Herbert Xu5d0d9be2009-01-29 14:19:48 +00004563
Eric Dumazeta50e2332014-03-29 21:28:21 -07004564/* The upper GRO stack assumes the network header starts at gro_offset=0.
4565 * Drivers could call both napi_gro_frags() and napi_gro_receive(),
4566 * so we copy the ethernet header into skb->data to have a common layout.
4567 */
Eric Dumazet4adb9c42012-05-18 20:49:06 +00004568static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08004569{
Herbert Xu76620aa2009-04-16 02:02:07 -07004570 struct sk_buff *skb = napi->skb;
Eric Dumazeta50e2332014-03-29 21:28:21 -07004571 const struct ethhdr *eth;
4572 unsigned int hlen = sizeof(*eth);
Herbert Xu76620aa2009-04-16 02:02:07 -07004573
4574 napi->skb = NULL;
4575
Eric Dumazeta50e2332014-03-29 21:28:21 -07004576 skb_reset_mac_header(skb);
4577 skb_gro_reset_offset(skb);
4578
4579 eth = skb_gro_header_fast(skb, 0);
4580 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4581 eth = skb_gro_header_slow(skb, hlen, 0);
4582 if (unlikely(!eth)) {
4583 napi_reuse_skb(napi, skb);
4584 return NULL;
4585 }
4586 } else {
4587 gro_pull_from_frag0(skb, hlen);
4588 NAPI_GRO_CB(skb)->frag0 += hlen;
4589 NAPI_GRO_CB(skb)->frag0_len -= hlen;
Herbert Xu76620aa2009-04-16 02:02:07 -07004590 }
Eric Dumazeta50e2332014-03-29 21:28:21 -07004591 __skb_pull(skb, hlen);
4592
4593 /*
4594 * This works because the only protocols we care about don't require
4595 * special handling.
4596 * We'll fix it up properly in napi_frags_finish()
4597 */
4598 skb->protocol = eth->h_proto;
Herbert Xu76620aa2009-04-16 02:02:07 -07004599
Herbert Xu76620aa2009-04-16 02:02:07 -07004600 return skb;
4601}
Herbert Xu76620aa2009-04-16 02:02:07 -07004602
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004603gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07004604{
4605 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08004606
4607 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07004608 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08004609
Ben Hutchingsae78dbf2014-01-10 22:17:24 +00004610 trace_napi_gro_frags_entry(skb);
4611
Eric Dumazet89c5fa32012-12-10 13:28:16 +00004612 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08004613}
4614EXPORT_SYMBOL(napi_gro_frags);
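/* Illustrative sketch (not part of the original file): the frag-based GRO
 * entry point used by drivers that receive directly into pages.  The
 * driver borrows a shell skb from the NAPI context, attaches its page
 * fragment, and lets napi_gro_frags() pull and parse the ethernet header
 * itself.  "desc" stands for a hypothetical RX descriptor providing the
 * page, offset and length.
 *
 *	struct sk_buff *skb = napi_get_frags(&priv->napi);
 *
 *	if (!skb)
 *		return;
 *	skb_fill_page_desc(skb, 0, desc->page, desc->offset, desc->len);
 *	skb->len += desc->len;
 *	skb->data_len += desc->len;
 *	skb->truesize += PAGE_SIZE;
 *	napi_gro_frags(&priv->napi);
 */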
4615
Tom Herbert573e8fc2014-08-22 13:33:47 -07004616/* Compute the checksum from gro_offset and return the folded value
4617 * after adding in any pseudo checksum.
4618 */
4619__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4620{
4621 __wsum wsum;
4622 __sum16 sum;
4623
4624 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4625
4626 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4627 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4628 if (likely(!sum)) {
4629 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4630 !skb->csum_complete_sw)
4631 netdev_rx_csum_fault(skb->dev);
4632 }
4633
4634 NAPI_GRO_CB(skb)->csum = wsum;
4635 NAPI_GRO_CB(skb)->csum_valid = 1;
4636
4637 return sum;
4638}
4639EXPORT_SYMBOL(__skb_gro_checksum_complete);
4640
Eric Dumazete326bed2010-04-22 00:22:45 -07004641/*
Zhi Yong Wu855abcf2014-01-01 04:34:50 +08004642 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
Eric Dumazete326bed2010-04-22 00:22:45 -07004643 * Note: called with local irq disabled, but exits with local irq enabled.
4644 */
4645static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4646{
4647#ifdef CONFIG_RPS
4648 struct softnet_data *remsd = sd->rps_ipi_list;
4649
4650 if (remsd) {
4651 sd->rps_ipi_list = NULL;
4652
4653 local_irq_enable();
4654
4655		/* Send pending IPIs to kick RPS processing on remote cpus. */
4656 while (remsd) {
4657 struct softnet_data *next = remsd->rps_ipi_next;
4658
4659 if (cpu_online(remsd->cpu))
Frederic Weisbeckerc46fff22014-02-24 16:40:02 +01004660 smp_call_function_single_async(remsd->cpu,
Frederic Weisbeckerfce8ad12014-02-24 16:40:01 +01004661 &remsd->csd);
Eric Dumazete326bed2010-04-22 00:22:45 -07004662 remsd = next;
4663 }
4664 } else
4665#endif
4666 local_irq_enable();
4667}
4668
Eric Dumazetd75b1ad2014-11-02 06:19:33 -08004669static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4670{
4671#ifdef CONFIG_RPS
4672 return sd->rps_ipi_list != NULL;
4673#else
4674 return false;
4675#endif
4676}
4677
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004678static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004679{
4680 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004681 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004682
Eric Dumazete326bed2010-04-22 00:22:45 -07004683	/* Check if we have pending IPIs; it's better to send them now
4684	 * rather than waiting for net_rx_action() to end.
4685 */
Eric Dumazetd75b1ad2014-11-02 06:19:33 -08004686 if (sd_has_rps_ipi_waiting(sd)) {
Eric Dumazete326bed2010-04-22 00:22:45 -07004687 local_irq_disable();
4688 net_rps_action_and_irq_enable(sd);
4689 }
Eric Dumazetd75b1ad2014-11-02 06:19:33 -08004690
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004691 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07004692 local_irq_disable();
Tom Herbert11ef7a82014-06-30 09:50:40 -07004693 while (1) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004694 struct sk_buff *skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004695
Changli Gao6e7676c2010-04-27 15:07:33 -07004696 while ((skb = __skb_dequeue(&sd->process_queue))) {
Julian Anastasov2c17d272015-07-09 09:59:10 +03004697 rcu_read_lock();
Eric Dumazete4008272010-04-05 15:42:39 -07004698 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07004699 __netif_receive_skb(skb);
Julian Anastasov2c17d272015-07-09 09:59:10 +03004700 rcu_read_unlock();
Changli Gao6e7676c2010-04-27 15:07:33 -07004701 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00004702 input_queue_head_incr(sd);
4703 if (++work >= quota) {
4704 local_irq_enable();
4705 return work;
4706 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004707 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004708
Changli Gao6e7676c2010-04-27 15:07:33 -07004709 rps_lock(sd);
Tom Herbert11ef7a82014-06-30 09:50:40 -07004710 if (skb_queue_empty(&sd->input_pkt_queue)) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004711 /*
4712 * Inline a custom version of __napi_complete().
4713			 * Only the current cpu owns and manipulates this napi,
Tom Herbert11ef7a82014-06-30 09:50:40 -07004714 * and NAPI_STATE_SCHED is the only possible flag set
4715 * on backlog.
4716 * We can use a plain write instead of clear_bit(),
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004717			 * and we don't need an smp_mb() memory barrier.
4718 */
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004719 napi->state = 0;
Tom Herbert11ef7a82014-06-30 09:50:40 -07004720 rps_unlock(sd);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07004721
Tom Herbert11ef7a82014-06-30 09:50:40 -07004722 break;
Changli Gao6e7676c2010-04-27 15:07:33 -07004723 }
Tom Herbert11ef7a82014-06-30 09:50:40 -07004724
4725 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4726 &sd->process_queue);
Changli Gao6e7676c2010-04-27 15:07:33 -07004727 rps_unlock(sd);
4728 }
4729 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004730
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004731 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004732}
4733
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004734/**
4735 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004736 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004737 *
Eric Dumazetbc9ad162014-10-28 18:05:13 -07004738 * The entry's receive function will be scheduled to run.
4739 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004740 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08004741void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004742{
4743 unsigned long flags;
4744
4745 local_irq_save(flags);
Christoph Lameter903ceff2014-08-17 12:30:35 -05004746 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004747 local_irq_restore(flags);
4748}
4749EXPORT_SYMBOL(__napi_schedule);
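/* Illustrative sketch (not part of the original file): the usual shape of
 * a driver interrupt handler that defers RX work to NAPI.  mydev_priv and
 * mydev_mask_rx_irq() are assumed names; napi_schedule_prep() and
 * __napi_schedule() are the real entry points.
 *
 *	static irqreturn_t mydev_interrupt(int irq, void *dev_id)
 *	{
 *		struct mydev_priv *priv = dev_id;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			mydev_mask_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */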
4750
Eric Dumazetbc9ad162014-10-28 18:05:13 -07004751/**
4752 * __napi_schedule_irqoff - schedule for receive
4753 * @n: entry to schedule
4754 *
4755 * Variant of __napi_schedule() assuming hard irqs are masked
4756 */
4757void __napi_schedule_irqoff(struct napi_struct *n)
4758{
4759 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4760}
4761EXPORT_SYMBOL(__napi_schedule_irqoff);
4762
Herbert Xud565b0a2008-12-15 23:38:52 -08004763void __napi_complete(struct napi_struct *n)
4764{
4765 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
Herbert Xud565b0a2008-12-15 23:38:52 -08004766
Eric Dumazetd75b1ad2014-11-02 06:19:33 -08004767 list_del_init(&n->poll_list);
Peter Zijlstra4e857c52014-03-17 18:06:10 +01004768 smp_mb__before_atomic();
Herbert Xud565b0a2008-12-15 23:38:52 -08004769 clear_bit(NAPI_STATE_SCHED, &n->state);
4770}
4771EXPORT_SYMBOL(__napi_complete);
4772
Eric Dumazet3b47d302014-11-06 21:09:44 -08004773void napi_complete_done(struct napi_struct *n, int work_done)
Herbert Xud565b0a2008-12-15 23:38:52 -08004774{
4775 unsigned long flags;
4776
4777 /*
4778 * don't let napi dequeue from the cpu poll list
4779	 * just in case it's running on a different cpu
4780 */
4781 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4782 return;
4783
Eric Dumazet3b47d302014-11-06 21:09:44 -08004784 if (n->gro_list) {
4785 unsigned long timeout = 0;
Eric Dumazetd75b1ad2014-11-02 06:19:33 -08004786
Eric Dumazet3b47d302014-11-06 21:09:44 -08004787 if (work_done)
4788 timeout = n->dev->gro_flush_timeout;
4789
4790 if (timeout)
4791 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4792 HRTIMER_MODE_REL_PINNED);
4793 else
4794 napi_gro_flush(n, false);
4795 }
Eric Dumazetd75b1ad2014-11-02 06:19:33 -08004796 if (likely(list_empty(&n->poll_list))) {
4797 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4798 } else {
4799 /* If n->poll_list is not empty, we need to mask irqs */
4800 local_irq_save(flags);
4801 __napi_complete(n);
4802 local_irq_restore(flags);
4803 }
Herbert Xud565b0a2008-12-15 23:38:52 -08004804}
Eric Dumazet3b47d302014-11-06 21:09:44 -08004805EXPORT_SYMBOL(napi_complete_done);
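/* Illustrative sketch (not part of the original file): a minimal NAPI poll
 * callback that respects its budget and only completes (and re-enables the
 * device interrupt) once the ring is drained.  mydev_clean_rx() and
 * mydev_unmask_rx_irq() are assumed driver helpers.
 *
 *	static int mydev_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct mydev_priv *priv = container_of(napi, struct mydev_priv, napi);
 *		int work_done = mydev_clean_rx(priv, budget);
 *
 *		if (work_done < budget) {
 *			napi_complete_done(napi, work_done);
 *			mydev_unmask_rx_irq(priv);
 *		}
 *		return work_done;
 *	}
 */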
Herbert Xud565b0a2008-12-15 23:38:52 -08004806
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004807/* must be called under rcu_read_lock(), as we dont take a reference */
Eric Dumazet02d62e82015-11-18 06:30:52 -08004808static struct napi_struct *napi_by_id(unsigned int napi_id)
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004809{
4810 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4811 struct napi_struct *napi;
4812
4813 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4814 if (napi->napi_id == napi_id)
4815 return napi;
4816
4817 return NULL;
4818}
Eric Dumazet02d62e82015-11-18 06:30:52 -08004819
4820#if defined(CONFIG_NET_RX_BUSY_POLL)
Eric Dumazetce6aea92015-11-18 06:30:54 -08004821#define BUSY_POLL_BUDGET 8
Eric Dumazet02d62e82015-11-18 06:30:52 -08004822bool sk_busy_loop(struct sock *sk, int nonblock)
4823{
4824 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
Eric Dumazetce6aea92015-11-18 06:30:54 -08004825 int (*busy_poll)(struct napi_struct *dev);
Eric Dumazet02d62e82015-11-18 06:30:52 -08004826 struct napi_struct *napi;
4827 int rc = false;
4828
Eric Dumazet2a028ec2015-11-18 06:30:53 -08004829 rcu_read_lock();
Eric Dumazet02d62e82015-11-18 06:30:52 -08004830
4831 napi = napi_by_id(sk->sk_napi_id);
4832 if (!napi)
4833 goto out;
4834
Eric Dumazetce6aea92015-11-18 06:30:54 -08004835 /* Note: ndo_busy_poll method is optional in linux-4.5 */
4836 busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
Eric Dumazet02d62e82015-11-18 06:30:52 -08004837
4838 do {
Eric Dumazetce6aea92015-11-18 06:30:54 -08004839 rc = 0;
Eric Dumazet2a028ec2015-11-18 06:30:53 -08004840 local_bh_disable();
Eric Dumazetce6aea92015-11-18 06:30:54 -08004841 if (busy_poll) {
4842 rc = busy_poll(napi);
4843 } else if (napi_schedule_prep(napi)) {
4844 void *have = netpoll_poll_lock(napi);
4845
4846 if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4847 rc = napi->poll(napi, BUSY_POLL_BUDGET);
4848 trace_napi_poll(napi);
4849 if (rc == BUSY_POLL_BUDGET) {
4850 napi_complete_done(napi, rc);
4851 napi_schedule(napi);
4852 }
4853 }
4854 netpoll_poll_unlock(have);
4855 }
Eric Dumazet2a028ec2015-11-18 06:30:53 -08004856 if (rc > 0)
4857 NET_ADD_STATS_BH(sock_net(sk),
4858 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
4859 local_bh_enable();
Eric Dumazet02d62e82015-11-18 06:30:52 -08004860
4861 if (rc == LL_FLUSH_FAILED)
4862 break; /* permanent failure */
4863
Eric Dumazet02d62e82015-11-18 06:30:52 -08004864 cpu_relax();
Eric Dumazet02d62e82015-11-18 06:30:52 -08004865 } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
4866 !need_resched() && !busy_loop_timeout(end_time));
4867
4868 rc = !skb_queue_empty(&sk->sk_receive_queue);
4869out:
Eric Dumazet2a028ec2015-11-18 06:30:53 -08004870 rcu_read_unlock();
Eric Dumazet02d62e82015-11-18 06:30:52 -08004871 return rc;
4872}
4873EXPORT_SYMBOL(sk_busy_loop);
4874
4875#endif /* CONFIG_NET_RX_BUSY_POLL */
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004876
4877void napi_hash_add(struct napi_struct *napi)
4878{
Eric Dumazetd64b5e82015-11-18 06:31:00 -08004879 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
4880 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
Eric Dumazet52bd2d62015-11-18 06:30:50 -08004881 return;
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004882
Eric Dumazet52bd2d62015-11-18 06:30:50 -08004883 spin_lock(&napi_hash_lock);
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004884
Eric Dumazet52bd2d62015-11-18 06:30:50 -08004885 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
4886 do {
4887 if (unlikely(++napi_gen_id < NR_CPUS + 1))
4888 napi_gen_id = NR_CPUS + 1;
4889 } while (napi_by_id(napi_gen_id));
4890 napi->napi_id = napi_gen_id;
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004891
Eric Dumazet52bd2d62015-11-18 06:30:50 -08004892 hlist_add_head_rcu(&napi->napi_hash_node,
4893 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004894
Eric Dumazet52bd2d62015-11-18 06:30:50 -08004895 spin_unlock(&napi_hash_lock);
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004896}
4897EXPORT_SYMBOL_GPL(napi_hash_add);
4898
4899/* Warning: the caller is responsible for making sure the rcu grace period
4900 * is respected before freeing the memory containing @napi
4901 */
Eric Dumazet34cbe272015-11-18 06:31:02 -08004902bool napi_hash_del(struct napi_struct *napi)
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004903{
Eric Dumazet34cbe272015-11-18 06:31:02 -08004904 bool rcu_sync_needed = false;
4905
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004906 spin_lock(&napi_hash_lock);
4907
Eric Dumazet34cbe272015-11-18 06:31:02 -08004908 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
4909 rcu_sync_needed = true;
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004910 hlist_del_rcu(&napi->napi_hash_node);
Eric Dumazet34cbe272015-11-18 06:31:02 -08004911 }
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004912 spin_unlock(&napi_hash_lock);
Eric Dumazet34cbe272015-11-18 06:31:02 -08004913 return rcu_sync_needed;
Eliezer Tamiraf12fa62013-06-10 11:39:41 +03004914}
4915EXPORT_SYMBOL_GPL(napi_hash_del);
4916
Eric Dumazet3b47d302014-11-06 21:09:44 -08004917static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4918{
4919 struct napi_struct *napi;
4920
4921 napi = container_of(timer, struct napi_struct, timer);
4922 if (napi->gro_list)
4923 napi_schedule(napi);
4924
4925 return HRTIMER_NORESTART;
4926}
4927
Herbert Xud565b0a2008-12-15 23:38:52 -08004928void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4929 int (*poll)(struct napi_struct *, int), int weight)
4930{
4931 INIT_LIST_HEAD(&napi->poll_list);
Eric Dumazet3b47d302014-11-06 21:09:44 -08004932 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4933 napi->timer.function = napi_watchdog;
Herbert Xu4ae55442009-02-08 18:00:36 +00004934 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004935 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08004936 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08004937 napi->poll = poll;
Eric Dumazet82dc3c62013-03-05 15:57:22 +00004938 if (weight > NAPI_POLL_WEIGHT)
4939 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4940 weight, dev->name);
Herbert Xud565b0a2008-12-15 23:38:52 -08004941 napi->weight = weight;
4942 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08004943 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08004944#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08004945 spin_lock_init(&napi->poll_lock);
4946 napi->poll_owner = -1;
4947#endif
4948 set_bit(NAPI_STATE_SCHED, &napi->state);
Eric Dumazet93d05d42015-11-18 06:31:03 -08004949 napi_hash_add(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08004950}
4951EXPORT_SYMBOL(netif_napi_add);
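/* Illustrative sketch (not part of the original file): typical NAPI
 * lifecycle in a driver, assuming the mydev_poll() callback sketched
 * above; "netdev" and "priv" are assumed names.
 *
 *	netif_napi_add(netdev, &priv->napi, mydev_poll, NAPI_POLL_WEIGHT);
 *	napi_enable(&priv->napi);	(usually in ndo_open)
 *	...
 *	napi_disable(&priv->napi);	(in ndo_stop, before freeing the rings)
 *	netif_napi_del(&priv->napi);	(in remove/teardown)
 */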
4952
Eric Dumazet3b47d302014-11-06 21:09:44 -08004953void napi_disable(struct napi_struct *n)
4954{
4955 might_sleep();
4956 set_bit(NAPI_STATE_DISABLE, &n->state);
4957
4958 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4959 msleep(1);
Neil Horman2d8bff12015-09-23 14:57:58 -04004960 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
4961 msleep(1);
Eric Dumazet3b47d302014-11-06 21:09:44 -08004962
4963 hrtimer_cancel(&n->timer);
4964
4965 clear_bit(NAPI_STATE_DISABLE, &n->state);
4966}
4967EXPORT_SYMBOL(napi_disable);
4968
Eric Dumazet93d05d42015-11-18 06:31:03 -08004969/* Must be called in process context */
Herbert Xud565b0a2008-12-15 23:38:52 -08004970void netif_napi_del(struct napi_struct *napi)
4971{
Eric Dumazet93d05d42015-11-18 06:31:03 -08004972 might_sleep();
4973 if (napi_hash_del(napi))
4974 synchronize_net();
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08004975 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07004976 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08004977
Eric Dumazet289dccb2013-12-20 14:29:08 -08004978 kfree_skb_list(napi->gro_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08004979 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00004980 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08004981}
4982EXPORT_SYMBOL(netif_napi_del);
4983
Herbert Xu726ce702014-12-21 07:16:21 +11004984static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4985{
4986 void *have;
4987 int work, weight;
4988
4989 list_del_init(&n->poll_list);
4990
4991 have = netpoll_poll_lock(n);
4992
4993 weight = n->weight;
4994
4995 /* This NAPI_STATE_SCHED test is for avoiding a race
4996 * with netpoll's poll_napi(). Only the entity which
4997 * obtains the lock and sees NAPI_STATE_SCHED set will
4998 * actually make the ->poll() call. Therefore we avoid
4999 * accidentally calling ->poll() when NAPI is not scheduled.
5000 */
5001 work = 0;
5002 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5003 work = n->poll(n, weight);
5004 trace_napi_poll(n);
5005 }
5006
5007 WARN_ON_ONCE(work > weight);
5008
5009 if (likely(work < weight))
5010 goto out_unlock;
5011
5012 /* Drivers must not modify the NAPI state if they
5013 * consume the entire weight. In such cases this code
5014 * still "owns" the NAPI instance and therefore can
5015 * move the instance around on the list at-will.
5016 */
5017 if (unlikely(napi_disable_pending(n))) {
5018 napi_complete(n);
5019 goto out_unlock;
5020 }
5021
5022 if (n->gro_list) {
5023 /* flush too old packets
5024 * If HZ < 1000, flush all packets.
5025 */
5026 napi_gro_flush(n, HZ >= 1000);
5027 }
5028
Herbert Xu001ce542014-12-21 07:16:22 +11005029 /* Some drivers may have called napi_schedule
5030 * prior to exhausting their budget.
5031 */
5032 if (unlikely(!list_empty(&n->poll_list))) {
5033 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5034 n->dev ? n->dev->name : "backlog");
5035 goto out_unlock;
5036 }
5037
Herbert Xu726ce702014-12-21 07:16:21 +11005038 list_add_tail(&n->poll_list, repoll);
5039
5040out_unlock:
5041 netpoll_poll_unlock(have);
5042
5043 return work;
5044}
5045
Linus Torvalds1da177e2005-04-16 15:20:36 -07005046static void net_rx_action(struct softirq_action *h)
5047{
Christoph Lameter903ceff2014-08-17 12:30:35 -05005048 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08005049 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07005050 int budget = netdev_budget;
Eric Dumazetd75b1ad2014-11-02 06:19:33 -08005051 LIST_HEAD(list);
5052 LIST_HEAD(repoll);
Matt Mackall53fb95d2005-08-11 19:27:43 -07005053
Linus Torvalds1da177e2005-04-16 15:20:36 -07005054 local_irq_disable();
Eric Dumazetd75b1ad2014-11-02 06:19:33 -08005055 list_splice_init(&sd->poll_list, &list);
5056 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005057
Herbert Xuceb8d5b2014-12-21 07:16:25 +11005058 for (;;) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07005059 struct napi_struct *n;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005060
Herbert Xuceb8d5b2014-12-21 07:16:25 +11005061 if (list_empty(&list)) {
5062 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5063 return;
5064 break;
5065 }
5066
Herbert Xu6bd373e2014-12-21 07:16:24 +11005067 n = list_first_entry(&list, struct napi_struct, poll_list);
5068 budget -= napi_poll(n, &repoll);
5069
Eric Dumazetd75b1ad2014-11-02 06:19:33 -08005070 /* If softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08005071		 * Allow this to run for 2 jiffies, which allows
5072 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07005073 */
Herbert Xuceb8d5b2014-12-21 07:16:25 +11005074 if (unlikely(budget <= 0 ||
5075 time_after_eq(jiffies, time_limit))) {
5076 sd->time_squeeze++;
5077 break;
5078 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005079 }
Eric Dumazetd75b1ad2014-11-02 06:19:33 -08005080
Eric Dumazetd75b1ad2014-11-02 06:19:33 -08005081 local_irq_disable();
5082
5083 list_splice_tail_init(&sd->poll_list, &list);
5084 list_splice_tail(&repoll, &list);
5085 list_splice(&list, &sd->poll_list);
5086 if (!list_empty(&sd->poll_list))
5087 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5088
Eric Dumazete326bed2010-04-22 00:22:45 -07005089 net_rps_action_and_irq_enable(sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005090}
5091
Veaceslav Falicoaa9d8562013-08-28 23:25:04 +02005092struct netdev_adjacent {
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005093 struct net_device *dev;
Veaceslav Falico5d261912013-08-28 23:25:05 +02005094
5095 /* upper master flag, there can only be one master device per list */
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005096 bool master;
Veaceslav Falico5d261912013-08-28 23:25:05 +02005097
Veaceslav Falico5d261912013-08-28 23:25:05 +02005098 /* counter for the number of times this device was added to us */
5099 u16 ref_nr;
5100
Veaceslav Falico402dae92013-09-25 09:20:09 +02005101 /* private field for the users */
5102 void *private;
5103
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005104 struct list_head list;
5105 struct rcu_head rcu;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005106};
5107
Michal Kubeček6ea29da2015-09-24 10:59:05 +02005108static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005109 struct list_head *adj_list)
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005110{
Veaceslav Falico5d261912013-08-28 23:25:05 +02005111 struct netdev_adjacent *adj;
Veaceslav Falico5d261912013-08-28 23:25:05 +02005112
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005113 list_for_each_entry(adj, adj_list, list) {
Veaceslav Falico5d261912013-08-28 23:25:05 +02005114 if (adj->dev == adj_dev)
5115 return adj;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005116 }
5117 return NULL;
5118}
5119
5120/**
5121 * netdev_has_upper_dev - Check if device is linked to an upper device
5122 * @dev: device
5123 * @upper_dev: upper device to check
5124 *
5125 * Find out if a device is linked to the specified upper device and return true
5126 * if it is. Note that this checks only the immediate upper device,
5127 * not the complete stack of devices. The caller must hold the RTNL lock.
5128 */
5129bool netdev_has_upper_dev(struct net_device *dev,
5130 struct net_device *upper_dev)
5131{
5132 ASSERT_RTNL();
5133
Michal Kubeček6ea29da2015-09-24 10:59:05 +02005134 return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005135}
5136EXPORT_SYMBOL(netdev_has_upper_dev);
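/* Illustrative sketch (not part of the original file): a typical caller
 * checks for an existing link before creating a new one, under RTNL as
 * required; the -EEXIST error choice is an assumption for the example.
 *
 *	ASSERT_RTNL();
 *	if (netdev_has_upper_dev(dev, upper_dev))
 *		return -EEXIST;
 */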
5137
5138/**
5139 * netdev_has_any_upper_dev - Check if device is linked to some device
5140 * @dev: device
5141 *
5142 * Find out if a device is linked to an upper device and return true in case
5143 * it is. The caller must hold the RTNL lock.
5144 */
stephen hemminger1d143d92013-12-29 14:01:29 -08005145static bool netdev_has_any_upper_dev(struct net_device *dev)
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005146{
5147 ASSERT_RTNL();
5148
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005149 return !list_empty(&dev->all_adj_list.upper);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005150}
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005151
5152/**
5153 * netdev_master_upper_dev_get - Get master upper device
5154 * @dev: device
5155 *
5156 * Find a master upper device and return pointer to it or NULL in case
5157 * it's not there. The caller must hold the RTNL lock.
5158 */
5159struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5160{
Veaceslav Falicoaa9d8562013-08-28 23:25:04 +02005161 struct netdev_adjacent *upper;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005162
5163 ASSERT_RTNL();
5164
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005165 if (list_empty(&dev->adj_list.upper))
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005166 return NULL;
5167
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005168 upper = list_first_entry(&dev->adj_list.upper,
Veaceslav Falicoaa9d8562013-08-28 23:25:04 +02005169 struct netdev_adjacent, list);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005170 if (likely(upper->master))
5171 return upper->dev;
5172 return NULL;
5173}
5174EXPORT_SYMBOL(netdev_master_upper_dev_get);
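
/* Example sketch: looking up the master (e.g. a bond or bridge) that a port
 * is enslaved to. The helper name and the printout are hypothetical; only
 * netdev_master_upper_dev_get() is the real API, and RTNL must be held.
 */
static void example_report_master(struct net_device *port)
{
	struct net_device *master;

	ASSERT_RTNL();
	master = netdev_master_upper_dev_get(port);
	if (master)
		netdev_info(port, "enslaved to %s\n", master->name);
	else
		netdev_info(port, "no master\n");
}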
5175
Veaceslav Falicob6ccba42013-09-25 09:20:23 +02005176void *netdev_adjacent_get_private(struct list_head *adj_list)
5177{
5178 struct netdev_adjacent *adj;
5179
5180 adj = list_entry(adj_list, struct netdev_adjacent, list);
5181
5182 return adj->private;
5183}
5184EXPORT_SYMBOL(netdev_adjacent_get_private);
5185
Veaceslav Falico31088a12013-09-25 09:20:12 +02005186/**
Vlad Yasevich44a40852014-05-16 17:20:38 -04005187 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5188 * @dev: device
5189 * @iter: list_head ** of the current position
5190 *
5191 * Gets the next device from the dev's upper list, starting from iter
5192 * position. The caller must hold RCU read lock.
5193 */
5194struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5195 struct list_head **iter)
5196{
5197 struct netdev_adjacent *upper;
5198
5199 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5200
5201 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5202
5203 if (&upper->list == &dev->adj_list.upper)
5204 return NULL;
5205
5206 *iter = &upper->list;
5207
5208 return upper->dev;
5209}
5210EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
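
/* Example sketch of walking the immediate upper devices under RCU with
 * netdev_upper_get_next_dev_rcu(). The iterator starts at the list head
 * (dev->adj_list.upper); the netdev_for_each_upper_dev_rcu() helper in
 * netdevice.h wraps this same pattern. The function name is illustrative.
 */
static void example_walk_uppers(struct net_device *dev)
{
	struct list_head *iter = &dev->adj_list.upper;
	struct net_device *upper;

	rcu_read_lock();
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
		netdev_dbg(dev, "upper device: %s\n", upper->name);
	rcu_read_unlock();
}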
5211
5212/**
Veaceslav Falico31088a12013-09-25 09:20:12 +02005213 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
Veaceslav Falico48311f42013-08-28 23:25:07 +02005214 * @dev: device
5215 * @iter: list_head ** of the current position
5216 *
5217 * Gets the next device from the dev's upper list, starting from iter
5218 * position. The caller must hold RCU read lock.
5219 */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005220struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5221 struct list_head **iter)
Veaceslav Falico48311f42013-08-28 23:25:07 +02005222{
5223 struct netdev_adjacent *upper;
5224
John Fastabend85328242013-11-26 06:33:52 +00005225 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
Veaceslav Falico48311f42013-08-28 23:25:07 +02005226
5227 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5228
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005229 if (&upper->list == &dev->all_adj_list.upper)
Veaceslav Falico48311f42013-08-28 23:25:07 +02005230 return NULL;
5231
5232 *iter = &upper->list;
5233
5234 return upper->dev;
5235}
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005236EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
Veaceslav Falico48311f42013-08-28 23:25:07 +02005237
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005238/**
Veaceslav Falico31088a12013-09-25 09:20:12 +02005239 * netdev_lower_get_next_private - Get the next ->private from the
5240 * lower neighbour list
5241 * @dev: device
5242 * @iter: list_head ** of the current position
5243 *
5244 * Gets the next netdev_adjacent->private from the dev's lower neighbour
	5245	 * list, starting from iter position. The caller must either hold the
5246 * RTNL lock or its own locking that guarantees that the neighbour lower
subashab@codeaurora.orgb4691392015-07-24 03:03:29 +00005247 * list will remain unchanged.
Veaceslav Falico31088a12013-09-25 09:20:12 +02005248 */
5249void *netdev_lower_get_next_private(struct net_device *dev,
5250 struct list_head **iter)
5251{
5252 struct netdev_adjacent *lower;
5253
5254 lower = list_entry(*iter, struct netdev_adjacent, list);
5255
5256 if (&lower->list == &dev->adj_list.lower)
5257 return NULL;
5258
Veaceslav Falico6859e7d2014-04-07 11:25:12 +02005259 *iter = lower->list.next;
Veaceslav Falico31088a12013-09-25 09:20:12 +02005260
5261 return lower->private;
5262}
5263EXPORT_SYMBOL(netdev_lower_get_next_private);
5264
5265/**
5266 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5267 * lower neighbour list, RCU
5268 * variant
5269 * @dev: device
5270 * @iter: list_head ** of the current position
5271 *
5272 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5273 * list, starting from iter position. The caller must hold RCU read lock.
5274 */
5275void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5276 struct list_head **iter)
5277{
5278 struct netdev_adjacent *lower;
5279
5280 WARN_ON_ONCE(!rcu_read_lock_held());
5281
5282 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5283
5284 if (&lower->list == &dev->adj_list.lower)
5285 return NULL;
5286
Veaceslav Falico6859e7d2014-04-07 11:25:12 +02005287 *iter = &lower->list;
Veaceslav Falico31088a12013-09-25 09:20:12 +02005288
5289 return lower->private;
5290}
5291EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5292
5293/**
Vlad Yasevich4085ebe2014-05-16 17:04:53 -04005294 * netdev_lower_get_next - Get the next device from the lower neighbour
5295 * list
5296 * @dev: device
5297 * @iter: list_head ** of the current position
5298 *
	5299	 * Gets the next device from the dev's lower neighbour
	5300	 * list, starting from iter position. The caller must hold the RTNL lock or
5301 * its own locking that guarantees that the neighbour lower
subashab@codeaurora.orgb4691392015-07-24 03:03:29 +00005302 * list will remain unchanged.
Vlad Yasevich4085ebe2014-05-16 17:04:53 -04005303 */
5304void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5305{
5306 struct netdev_adjacent *lower;
5307
5308 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5309
5310 if (&lower->list == &dev->adj_list.lower)
5311 return NULL;
5312
5313 *iter = &lower->list;
5314
5315 return lower->dev;
5316}
5317EXPORT_SYMBOL(netdev_lower_get_next);
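
/* Example sketch: counting immediate lower devices with the
 * netdev_for_each_lower_dev() iterator (built on netdev_lower_get_next(),
 * as dev_get_nest_level() below also shows). The function name is
 * hypothetical; RTNL or equivalent locking is assumed.
 */
static unsigned int example_count_lowers(struct net_device *dev)
{
	struct net_device *lower;
	struct list_head *iter;
	unsigned int n = 0;

	ASSERT_RTNL();
	netdev_for_each_lower_dev(dev, lower, iter)
		n++;
	return n;
}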
5318
5319/**
dingtianhonge001bfa2013-12-13 10:19:55 +08005320 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5321 * lower neighbour list, RCU
5322 * variant
5323 * @dev: device
5324 *
5325 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5326 * list. The caller must hold RCU read lock.
5327 */
5328void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5329{
5330 struct netdev_adjacent *lower;
5331
5332 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5333 struct netdev_adjacent, list);
5334 if (lower)
5335 return lower->private;
5336 return NULL;
5337}
5338EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
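
/* Example sketch: fetching the ->private attached to the first lower device
 * (the way a bond/team style master might reach its first slave's per-link
 * state) under RCU. Purely illustrative; whatever the pointer refers to must
 * itself be protected by an RCU grace period to be used after unlock.
 */
static void *example_first_slave_priv(struct net_device *master)
{
	void *priv;

	rcu_read_lock();
	priv = netdev_lower_get_first_private_rcu(master);
	rcu_read_unlock();

	return priv;
}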
5339
5340/**
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005341 * netdev_master_upper_dev_get_rcu - Get master upper device
5342 * @dev: device
5343 *
5344 * Find a master upper device and return pointer to it or NULL in case
5345 * it's not there. The caller must hold the RCU read lock.
5346 */
5347struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5348{
Veaceslav Falicoaa9d8562013-08-28 23:25:04 +02005349 struct netdev_adjacent *upper;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005350
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005351 upper = list_first_or_null_rcu(&dev->adj_list.upper,
Veaceslav Falicoaa9d8562013-08-28 23:25:04 +02005352 struct netdev_adjacent, list);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005353 if (upper && likely(upper->master))
5354 return upper->dev;
5355 return NULL;
5356}
5357EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
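
/* Example sketch: resolving the "effective" device on a fast path where only
 * the RCU read lock is held, falling back to the device itself when it has
 * no master. Function name and fallback policy are not from this file.
 */
static struct net_device *example_effective_dev(struct net_device *dev)
{
	struct net_device *master;

	WARN_ON_ONCE(!rcu_read_lock_held());
	master = netdev_master_upper_dev_get_rcu(dev);
	return master ? : dev;
}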
5358
Rashika Kheria0a59f3a2014-02-09 20:26:25 +05305359static int netdev_adjacent_sysfs_add(struct net_device *dev,
Veaceslav Falico3ee32702014-01-14 21:58:50 +01005360 struct net_device *adj_dev,
5361 struct list_head *dev_list)
5362{
5363 char linkname[IFNAMSIZ+7];
5364 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5365 "upper_%s" : "lower_%s", adj_dev->name);
5366 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5367 linkname);
5368}
Rashika Kheria0a59f3a2014-02-09 20:26:25 +05305369static void netdev_adjacent_sysfs_del(struct net_device *dev,
Veaceslav Falico3ee32702014-01-14 21:58:50 +01005370 char *name,
5371 struct list_head *dev_list)
5372{
5373 char linkname[IFNAMSIZ+7];
5374 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5375 "upper_%s" : "lower_%s", name);
5376 sysfs_remove_link(&(dev->dev.kobj), linkname);
5377}
5378
Alexander Y. Fomichev7ce64c72014-09-15 14:22:35 +04005379static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5380 struct net_device *adj_dev,
5381 struct list_head *dev_list)
5382{
5383 return (dev_list == &dev->adj_list.upper ||
5384 dev_list == &dev->adj_list.lower) &&
5385 net_eq(dev_net(dev), dev_net(adj_dev));
5386}
Veaceslav Falico3ee32702014-01-14 21:58:50 +01005387
Veaceslav Falico5d261912013-08-28 23:25:05 +02005388static int __netdev_adjacent_dev_insert(struct net_device *dev,
5389 struct net_device *adj_dev,
Veaceslav Falico7863c052013-09-25 09:20:06 +02005390 struct list_head *dev_list,
Veaceslav Falico402dae92013-09-25 09:20:09 +02005391 void *private, bool master)
Veaceslav Falico5d261912013-08-28 23:25:05 +02005392{
5393 struct netdev_adjacent *adj;
Veaceslav Falico842d67a2013-09-25 09:20:31 +02005394 int ret;
Veaceslav Falico5d261912013-08-28 23:25:05 +02005395
Michal Kubeček6ea29da2015-09-24 10:59:05 +02005396 adj = __netdev_find_adj(adj_dev, dev_list);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005397
5398 if (adj) {
Veaceslav Falico5d261912013-08-28 23:25:05 +02005399 adj->ref_nr++;
5400 return 0;
5401 }
5402
5403 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5404 if (!adj)
5405 return -ENOMEM;
5406
5407 adj->dev = adj_dev;
5408 adj->master = master;
Veaceslav Falico5d261912013-08-28 23:25:05 +02005409 adj->ref_nr = 1;
Veaceslav Falico402dae92013-09-25 09:20:09 +02005410 adj->private = private;
Veaceslav Falico5d261912013-08-28 23:25:05 +02005411 dev_hold(adj_dev);
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005412
5413 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5414 adj_dev->name, dev->name, adj_dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005415
Alexander Y. Fomichev7ce64c72014-09-15 14:22:35 +04005416 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
Veaceslav Falico3ee32702014-01-14 21:58:50 +01005417 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
Veaceslav Falico5831d66e2013-09-25 09:20:32 +02005418 if (ret)
5419 goto free_adj;
5420 }
5421
Veaceslav Falico7863c052013-09-25 09:20:06 +02005422 /* Ensure that master link is always the first item in list. */
Veaceslav Falico842d67a2013-09-25 09:20:31 +02005423 if (master) {
5424 ret = sysfs_create_link(&(dev->dev.kobj),
5425 &(adj_dev->dev.kobj), "master");
5426 if (ret)
Veaceslav Falico5831d66e2013-09-25 09:20:32 +02005427 goto remove_symlinks;
Veaceslav Falico842d67a2013-09-25 09:20:31 +02005428
Veaceslav Falico7863c052013-09-25 09:20:06 +02005429 list_add_rcu(&adj->list, dev_list);
Veaceslav Falico842d67a2013-09-25 09:20:31 +02005430 } else {
Veaceslav Falico7863c052013-09-25 09:20:06 +02005431 list_add_tail_rcu(&adj->list, dev_list);
Veaceslav Falico842d67a2013-09-25 09:20:31 +02005432 }
Veaceslav Falico5d261912013-08-28 23:25:05 +02005433
5434 return 0;
Veaceslav Falico842d67a2013-09-25 09:20:31 +02005435
Veaceslav Falico5831d66e2013-09-25 09:20:32 +02005436remove_symlinks:
Alexander Y. Fomichev7ce64c72014-09-15 14:22:35 +04005437 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
Veaceslav Falico3ee32702014-01-14 21:58:50 +01005438 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
Veaceslav Falico842d67a2013-09-25 09:20:31 +02005439free_adj:
5440 kfree(adj);
Nikolay Aleksandrov974daef2013-10-23 15:28:56 +02005441 dev_put(adj_dev);
Veaceslav Falico842d67a2013-09-25 09:20:31 +02005442
5443 return ret;
Veaceslav Falico5d261912013-08-28 23:25:05 +02005444}
5445
stephen hemminger1d143d92013-12-29 14:01:29 -08005446static void __netdev_adjacent_dev_remove(struct net_device *dev,
5447 struct net_device *adj_dev,
5448 struct list_head *dev_list)
Veaceslav Falico5d261912013-08-28 23:25:05 +02005449{
5450 struct netdev_adjacent *adj;
5451
Michal Kubeček6ea29da2015-09-24 10:59:05 +02005452 adj = __netdev_find_adj(adj_dev, dev_list);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005453
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005454 if (!adj) {
5455 pr_err("tried to remove device %s from %s\n",
5456 dev->name, adj_dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005457 BUG();
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005458 }
Veaceslav Falico5d261912013-08-28 23:25:05 +02005459
5460 if (adj->ref_nr > 1) {
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005461 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5462 adj->ref_nr-1);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005463 adj->ref_nr--;
5464 return;
5465 }
5466
Veaceslav Falico842d67a2013-09-25 09:20:31 +02005467 if (adj->master)
5468 sysfs_remove_link(&(dev->dev.kobj), "master");
5469
Alexander Y. Fomichev7ce64c72014-09-15 14:22:35 +04005470 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
Veaceslav Falico3ee32702014-01-14 21:58:50 +01005471 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
Veaceslav Falico5831d66e2013-09-25 09:20:32 +02005472
Veaceslav Falico5d261912013-08-28 23:25:05 +02005473 list_del_rcu(&adj->list);
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005474 pr_debug("dev_put for %s, because link removed from %s to %s\n",
5475 adj_dev->name, dev->name, adj_dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005476 dev_put(adj_dev);
5477 kfree_rcu(adj, rcu);
5478}
5479
stephen hemminger1d143d92013-12-29 14:01:29 -08005480static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5481 struct net_device *upper_dev,
5482 struct list_head *up_list,
5483 struct list_head *down_list,
5484 void *private, bool master)
Veaceslav Falico5d261912013-08-28 23:25:05 +02005485{
5486 int ret;
5487
Veaceslav Falico402dae92013-09-25 09:20:09 +02005488 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5489 master);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005490 if (ret)
5491 return ret;
5492
Veaceslav Falico402dae92013-09-25 09:20:09 +02005493 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5494 false);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005495 if (ret) {
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005496 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005497 return ret;
5498 }
5499
5500 return 0;
5501}
5502
stephen hemminger1d143d92013-12-29 14:01:29 -08005503static int __netdev_adjacent_dev_link(struct net_device *dev,
5504 struct net_device *upper_dev)
Veaceslav Falico5d261912013-08-28 23:25:05 +02005505{
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005506 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5507 &dev->all_adj_list.upper,
5508 &upper_dev->all_adj_list.lower,
Veaceslav Falico402dae92013-09-25 09:20:09 +02005509 NULL, false);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005510}
5511
stephen hemminger1d143d92013-12-29 14:01:29 -08005512static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5513 struct net_device *upper_dev,
5514 struct list_head *up_list,
5515 struct list_head *down_list)
Veaceslav Falico5d261912013-08-28 23:25:05 +02005516{
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005517 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5518 __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005519}
5520
stephen hemminger1d143d92013-12-29 14:01:29 -08005521static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5522 struct net_device *upper_dev)
Veaceslav Falico5d261912013-08-28 23:25:05 +02005523{
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005524 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5525 &dev->all_adj_list.upper,
5526 &upper_dev->all_adj_list.lower);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005527}
5528
stephen hemminger1d143d92013-12-29 14:01:29 -08005529static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5530 struct net_device *upper_dev,
5531 void *private, bool master)
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005532{
5533 int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5534
5535 if (ret)
5536 return ret;
5537
5538 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5539 &dev->adj_list.upper,
5540 &upper_dev->adj_list.lower,
Veaceslav Falico402dae92013-09-25 09:20:09 +02005541 private, master);
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005542 if (ret) {
5543 __netdev_adjacent_dev_unlink(dev, upper_dev);
5544 return ret;
5545 }
5546
5547 return 0;
5548}
5549
stephen hemminger1d143d92013-12-29 14:01:29 -08005550static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5551 struct net_device *upper_dev)
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005552{
5553 __netdev_adjacent_dev_unlink(dev, upper_dev);
5554 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5555 &dev->adj_list.upper,
5556 &upper_dev->adj_list.lower);
5557}
Veaceslav Falico5d261912013-08-28 23:25:05 +02005558
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005559static int __netdev_upper_dev_link(struct net_device *dev,
Veaceslav Falico402dae92013-09-25 09:20:09 +02005560 struct net_device *upper_dev, bool master,
Jiri Pirko29bf24a2015-12-03 12:12:11 +01005561 void *upper_priv, void *upper_info)
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005562{
Jiri Pirko0e4ead92015-08-27 09:31:18 +02005563 struct netdev_notifier_changeupper_info changeupper_info;
Veaceslav Falico5d261912013-08-28 23:25:05 +02005564 struct netdev_adjacent *i, *j, *to_i, *to_j;
5565 int ret = 0;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005566
5567 ASSERT_RTNL();
5568
5569 if (dev == upper_dev)
5570 return -EBUSY;
5571
	5572	 /* To prevent loops, check that dev is not already an upper device of upper_dev. */
Michal Kubeček6ea29da2015-09-24 10:59:05 +02005573 if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005574 return -EBUSY;
5575
Michal Kubeček6ea29da2015-09-24 10:59:05 +02005576 if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005577 return -EEXIST;
5578
5579 if (master && netdev_master_upper_dev_get(dev))
5580 return -EBUSY;
5581
Jiri Pirko0e4ead92015-08-27 09:31:18 +02005582 changeupper_info.upper_dev = upper_dev;
5583 changeupper_info.master = master;
5584 changeupper_info.linking = true;
Jiri Pirko29bf24a2015-12-03 12:12:11 +01005585 changeupper_info.upper_info = upper_info;
Jiri Pirko0e4ead92015-08-27 09:31:18 +02005586
Jiri Pirko573c7ba2015-10-16 14:01:22 +02005587 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5588 &changeupper_info.info);
5589 ret = notifier_to_errno(ret);
5590 if (ret)
5591 return ret;
5592
Jiri Pirko6dffb042015-12-03 12:12:10 +01005593 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
Veaceslav Falico402dae92013-09-25 09:20:09 +02005594 master);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005595 if (ret)
5596 return ret;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005597
Veaceslav Falico5d261912013-08-28 23:25:05 +02005598 /* Now that we linked these devs, make all the upper_dev's
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005599	 * all_adj_list.upper visible to every dev's all_adj_list.lower and vice
Veaceslav Falico5d261912013-08-28 23:25:05 +02005600	 * versa, and don't forget the devices themselves. All of these
5601 * links are non-neighbours.
5602 */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005603 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5604 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5605 pr_debug("Interlinking %s with %s, non-neighbour\n",
5606 i->dev->name, j->dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005607 ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5608 if (ret)
5609 goto rollback_mesh;
5610 }
5611 }
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005612
Veaceslav Falico5d261912013-08-28 23:25:05 +02005613 /* add dev to every upper_dev's upper device */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005614 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5615 pr_debug("linking %s's upper device %s with %s\n",
5616 upper_dev->name, i->dev->name, dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005617 ret = __netdev_adjacent_dev_link(dev, i->dev);
5618 if (ret)
5619 goto rollback_upper_mesh;
5620 }
5621
5622 /* add upper_dev to every dev's lower device */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005623 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5624 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5625 i->dev->name, upper_dev->name);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005626 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5627 if (ret)
5628 goto rollback_lower_mesh;
5629 }
5630
Ido Schimmelb03804e2015-12-03 12:12:03 +01005631 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5632 &changeupper_info.info);
5633 ret = notifier_to_errno(ret);
5634 if (ret)
5635 goto rollback_lower_mesh;
5636
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005637 return 0;
Veaceslav Falico5d261912013-08-28 23:25:05 +02005638
5639rollback_lower_mesh:
5640 to_i = i;
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005641 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
Veaceslav Falico5d261912013-08-28 23:25:05 +02005642 if (i == to_i)
5643 break;
5644 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5645 }
5646
5647 i = NULL;
5648
5649rollback_upper_mesh:
5650 to_i = i;
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005651 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
Veaceslav Falico5d261912013-08-28 23:25:05 +02005652 if (i == to_i)
5653 break;
5654 __netdev_adjacent_dev_unlink(dev, i->dev);
5655 }
5656
5657 i = j = NULL;
5658
5659rollback_mesh:
5660 to_i = i;
5661 to_j = j;
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005662 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5663 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
Veaceslav Falico5d261912013-08-28 23:25:05 +02005664 if (i == to_i && j == to_j)
5665 break;
5666 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5667 }
5668 if (i == to_i)
5669 break;
5670 }
5671
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005672 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005673
5674 return ret;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005675}
5676
5677/**
5678 * netdev_upper_dev_link - Add a link to the upper device
5679 * @dev: device
5680 * @upper_dev: new upper device
5681 *
5682 * Adds a link to device which is upper to this one. The caller must hold
5683 * the RTNL lock. On a failure a negative errno code is returned.
5684 * On success the reference counts are adjusted and the function
5685 * returns zero.
5686 */
5687int netdev_upper_dev_link(struct net_device *dev,
5688 struct net_device *upper_dev)
5689{
Jiri Pirko29bf24a2015-12-03 12:12:11 +01005690 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005691}
5692EXPORT_SYMBOL(netdev_upper_dev_link);
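
/* Example sketch: a hypothetical virtual device (vlan/macvlan style) linking
 * itself as a non-master upper of its real lower device during setup.
 * Device names and the error report are illustrative only.
 */
static int example_stack_on(struct net_device *virt_dev,
			    struct net_device *lower_dev)
{
	int err;

	ASSERT_RTNL();
	err = netdev_upper_dev_link(lower_dev, virt_dev);
	if (err)
		netdev_err(lower_dev, "failed to link %s as upper: %d\n",
			   virt_dev->name, err);
	return err;
}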
5693
5694/**
5695 * netdev_master_upper_dev_link - Add a master link to the upper device
5696 * @dev: device
5697 * @upper_dev: new upper device
Jiri Pirko6dffb042015-12-03 12:12:10 +01005698 * @upper_priv: upper device private
Jiri Pirko29bf24a2015-12-03 12:12:11 +01005699 * @upper_info: upper info to be passed down via notifier
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005700 *
5701 * Adds a link to device which is upper to this one. In this case, only
5702 * one master upper device can be linked, although other non-master devices
5703 * might be linked as well. The caller must hold the RTNL lock.
5704 * On a failure a negative errno code is returned. On success the reference
5705 * counts are adjusted and the function returns zero.
5706 */
5707int netdev_master_upper_dev_link(struct net_device *dev,
Jiri Pirko6dffb042015-12-03 12:12:10 +01005708 struct net_device *upper_dev,
Jiri Pirko29bf24a2015-12-03 12:12:11 +01005709 void *upper_priv, void *upper_info)
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005710{
Jiri Pirko29bf24a2015-12-03 12:12:11 +01005711 return __netdev_upper_dev_link(dev, upper_dev, true,
5712 upper_priv, upper_info);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005713}
5714EXPORT_SYMBOL(netdev_master_upper_dev_link);
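
/* Example sketch of enslaving a port to a master (bond/team/bridge style)
 * with a per-port private pointer and no extra notifier payload. The
 * surrounding driver structures and names are hypothetical.
 */
static int example_enslave(struct net_device *master, struct net_device *port,
			   void *port_priv)
{
	ASSERT_RTNL();
	/* "port" gains "master" as its one and only master upper device */
	return netdev_master_upper_dev_link(port, master, port_priv, NULL);
}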
5715
5716/**
5717 * netdev_upper_dev_unlink - Removes a link to upper device
5718 * @dev: device
5719 * @upper_dev: new upper device
5720 *
5721 * Removes a link to device which is upper to this one. The caller must hold
5722 * the RTNL lock.
5723 */
5724void netdev_upper_dev_unlink(struct net_device *dev,
5725 struct net_device *upper_dev)
5726{
Jiri Pirko0e4ead92015-08-27 09:31:18 +02005727 struct netdev_notifier_changeupper_info changeupper_info;
Veaceslav Falico5d261912013-08-28 23:25:05 +02005728 struct netdev_adjacent *i, *j;
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005729 ASSERT_RTNL();
5730
Jiri Pirko0e4ead92015-08-27 09:31:18 +02005731 changeupper_info.upper_dev = upper_dev;
5732 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5733 changeupper_info.linking = false;
5734
Jiri Pirko573c7ba2015-10-16 14:01:22 +02005735 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5736 &changeupper_info.info);
5737
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005738 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
Veaceslav Falico5d261912013-08-28 23:25:05 +02005739
5740 /* Here is the tricky part. We must remove all dev's lower
5741 * devices from all upper_dev's upper devices and vice
5742 * versa, to maintain the graph relationship.
5743 */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005744 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5745 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
Veaceslav Falico5d261912013-08-28 23:25:05 +02005746 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5747
	5748	 /* also remove the devices themselves from the lower/upper device
	5749	 * lists
5750 */
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005751 list_for_each_entry(i, &dev->all_adj_list.lower, list)
Veaceslav Falico5d261912013-08-28 23:25:05 +02005752 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5753
Veaceslav Falico2f268f12013-09-25 09:20:07 +02005754 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
Veaceslav Falico5d261912013-08-28 23:25:05 +02005755 __netdev_adjacent_dev_unlink(dev, i->dev);
5756
Jiri Pirko0e4ead92015-08-27 09:31:18 +02005757 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5758 &changeupper_info.info);
Jiri Pirko9ff162a2013-01-03 22:48:49 +00005759}
5760EXPORT_SYMBOL(netdev_upper_dev_unlink);
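
/* Example sketch: the teardown counterpart to the linking examples above.
 * A hypothetical release path undoes the upper/lower relationship before the
 * virtual device is unregistered; names are illustrative only.
 */
static void example_unstack(struct net_device *virt_dev,
			    struct net_device *lower_dev)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(lower_dev, virt_dev);
}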
5761
Moni Shoua61bd3852015-02-03 16:48:29 +02005762/**
5763 * netdev_bonding_info_change - Dispatch event about slave change
5764 * @dev: device
Masanari Iida4a26e4532015-02-14 22:26:34 +09005765 * @bonding_info: info to dispatch
Moni Shoua61bd3852015-02-03 16:48:29 +02005766 *
5767 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5768 * The caller must hold the RTNL lock.
5769 */
5770void netdev_bonding_info_change(struct net_device *dev,
5771 struct netdev_bonding_info *bonding_info)
5772{
5773 struct netdev_notifier_bonding_info info;
5774
5775 memcpy(&info.bonding_info, bonding_info,
5776 sizeof(struct netdev_bonding_info));
5777 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5778 &info.info);
5779}
5780EXPORT_SYMBOL(netdev_bonding_info_change);
5781
Eric Dumazet2ce1ee12015-02-04 13:37:44 -08005782static void netdev_adjacent_add_links(struct net_device *dev)
Alexander Y. Fomichev4c754312014-08-25 16:26:45 +04005783{
5784 struct netdev_adjacent *iter;
5785
5786 struct net *net = dev_net(dev);
5787
5788 list_for_each_entry(iter, &dev->adj_list.upper, list) {
	5789	 if (!net_eq(net, dev_net(iter->dev)))
5790 continue;
5791 netdev_adjacent_sysfs_add(iter->dev, dev,
5792 &iter->dev->adj_list.lower);
5793 netdev_adjacent_sysfs_add(dev, iter->dev,
5794 &dev->adj_list.upper);
5795 }
5796
5797 list_for_each_entry(iter, &dev->adj_list.lower, list) {
	5798	 if (!net_eq(net, dev_net(iter->dev)))
5799 continue;
5800 netdev_adjacent_sysfs_add(iter->dev, dev,
5801 &iter->dev->adj_list.upper);
5802 netdev_adjacent_sysfs_add(dev, iter->dev,
5803 &dev->adj_list.lower);
5804 }
5805}
5806
Eric Dumazet2ce1ee12015-02-04 13:37:44 -08005807static void netdev_adjacent_del_links(struct net_device *dev)
Alexander Y. Fomichev4c754312014-08-25 16:26:45 +04005808{
5809 struct netdev_adjacent *iter;
5810
5811 struct net *net = dev_net(dev);
5812
5813 list_for_each_entry(iter, &dev->adj_list.upper, list) {
	5814	 if (!net_eq(net, dev_net(iter->dev)))
5815 continue;
5816 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5817 &iter->dev->adj_list.lower);
5818 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5819 &dev->adj_list.upper);
5820 }
5821
5822 list_for_each_entry(iter, &dev->adj_list.lower, list) {
	5823	 if (!net_eq(net, dev_net(iter->dev)))
5824 continue;
5825 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5826 &iter->dev->adj_list.upper);
5827 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5828 &dev->adj_list.lower);
5829 }
5830}
5831
Veaceslav Falico5bb025f2014-01-14 21:58:51 +01005832void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
Veaceslav Falico402dae92013-09-25 09:20:09 +02005833{
Veaceslav Falico5bb025f2014-01-14 21:58:51 +01005834 struct netdev_adjacent *iter;
Veaceslav Falico402dae92013-09-25 09:20:09 +02005835
Alexander Y. Fomichev4c754312014-08-25 16:26:45 +04005836 struct net *net = dev_net(dev);
5837
Veaceslav Falico5bb025f2014-01-14 21:58:51 +01005838 list_for_each_entry(iter, &dev->adj_list.upper, list) {
Alexander Y. Fomichev4c754312014-08-25 16:26:45 +04005839	 if (!net_eq(net, dev_net(iter->dev)))
5840 continue;
Veaceslav Falico5bb025f2014-01-14 21:58:51 +01005841 netdev_adjacent_sysfs_del(iter->dev, oldname,
5842 &iter->dev->adj_list.lower);
5843 netdev_adjacent_sysfs_add(iter->dev, dev,
5844 &iter->dev->adj_list.lower);
5845 }
Veaceslav Falico402dae92013-09-25 09:20:09 +02005846
Veaceslav Falico5bb025f2014-01-14 21:58:51 +01005847 list_for_each_entry(iter, &dev->adj_list.lower, list) {
Alexander Y. Fomichev4c754312014-08-25 16:26:45 +04005848	 if (!net_eq(net, dev_net(iter->dev)))
5849 continue;
Veaceslav Falico5bb025f2014-01-14 21:58:51 +01005850 netdev_adjacent_sysfs_del(iter->dev, oldname,
5851 &iter->dev->adj_list.upper);
5852 netdev_adjacent_sysfs_add(iter->dev, dev,
5853 &iter->dev->adj_list.upper);
5854 }
Veaceslav Falico402dae92013-09-25 09:20:09 +02005855}
Veaceslav Falico402dae92013-09-25 09:20:09 +02005856
5857void *netdev_lower_dev_get_private(struct net_device *dev,
5858 struct net_device *lower_dev)
5859{
5860 struct netdev_adjacent *lower;
5861
5862 if (!lower_dev)
5863 return NULL;
Michal Kubeček6ea29da2015-09-24 10:59:05 +02005864 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
Veaceslav Falico402dae92013-09-25 09:20:09 +02005865 if (!lower)
5866 return NULL;
5867
5868 return lower->private;
5869}
5870EXPORT_SYMBOL(netdev_lower_dev_get_private);
5871
Vlad Yasevich4085ebe2014-05-16 17:04:53 -04005872
5873int dev_get_nest_level(struct net_device *dev,
Jiri Pirkob618aaa2015-12-04 15:01:31 +01005874 bool (*type_check)(const struct net_device *dev))
Vlad Yasevich4085ebe2014-05-16 17:04:53 -04005875{
5876 struct net_device *lower = NULL;
5877 struct list_head *iter;
5878 int max_nest = -1;
5879 int nest;
5880
5881 ASSERT_RTNL();
5882
5883 netdev_for_each_lower_dev(dev, lower, iter) {
5884 nest = dev_get_nest_level(lower, type_check);
5885 if (max_nest < nest)
5886 max_nest = nest;
5887 }
5888
5889 if (type_check(dev))
5890 max_nest++;
5891
5892 return max_nest;
5893}
5894EXPORT_SYMBOL(dev_get_nest_level);
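
/* Example sketch: computing the nesting depth of a stackable device type so
 * its driver can size a lockdep subclass. The predicate is a stand-in
 * (here keyed on IFF_MACVLAN just for illustration); real callers pass their
 * own type check.
 */
static bool example_is_stacked_dev(const struct net_device *dev)
{
	return dev->priv_flags & IFF_MACVLAN;	/* stand-in predicate */
}

static int example_nest_level(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_get_nest_level(dev, example_is_stacked_dev);
}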
5895
Jiri Pirko04d48262015-12-03 12:12:15 +01005896/**
	5897	 * netdev_lower_state_changed - Dispatch event about lower device state change
5898 * @lower_dev: device
5899 * @lower_state_info: state to dispatch
5900 *
5901 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
5902 * The caller must hold the RTNL lock.
5903 */
5904void netdev_lower_state_changed(struct net_device *lower_dev,
5905 void *lower_state_info)
5906{
5907 struct netdev_notifier_changelowerstate_info changelowerstate_info;
5908
5909 ASSERT_RTNL();
5910 changelowerstate_info.lower_state_info = lower_state_info;
5911 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
5912 &changelowerstate_info.info);
5913}
5914EXPORT_SYMBOL(netdev_lower_state_changed);
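
/* Example sketch: a master driver pushing a private, driver-defined state
 * blob down to NETDEV_CHANGELOWERSTATE listeners (the way bonding reports a
 * slave's link/tx state). "struct example_port_state" is hypothetical; only
 * the call itself is the real API.
 */
struct example_port_state {
	bool link_up;
	bool tx_enabled;
};

static void example_report_port_state(struct net_device *port_dev,
				      struct example_port_state *state)
{
	ASSERT_RTNL();
	netdev_lower_state_changed(port_dev, state);
}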
5915
Patrick McHardyb6c40d62008-10-07 15:26:48 -07005916static void dev_change_rx_flags(struct net_device *dev, int flags)
5917{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005918 const struct net_device_ops *ops = dev->netdev_ops;
5919
Vlad Yasevichd2615bf2013-11-19 20:47:15 -05005920 if (ops->ndo_change_rx_flags)
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005921 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07005922}
5923
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005924static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
Patrick McHardy4417da62007-06-27 01:28:10 -07005925{
Eric Dumazetb536db92011-11-30 21:42:26 +00005926 unsigned int old_flags = dev->flags;
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06005927 kuid_t uid;
5928 kgid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07005929
Patrick McHardy24023452007-07-14 18:51:31 -07005930 ASSERT_RTNL();
5931
Wang Chendad9b332008-06-18 01:48:28 -07005932 dev->flags |= IFF_PROMISC;
5933 dev->promiscuity += inc;
5934 if (dev->promiscuity == 0) {
5935 /*
5936 * Avoid overflow.
5937 * If inc causes overflow, untouch promisc and return error.
5938 */
5939 if (inc < 0)
5940 dev->flags &= ~IFF_PROMISC;
5941 else {
5942 dev->promiscuity -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005943 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5944 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07005945 return -EOVERFLOW;
5946 }
5947 }
Patrick McHardy4417da62007-06-27 01:28:10 -07005948 if (dev->flags != old_flags) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00005949 pr_info("device %s %s promiscuous mode\n",
5950 dev->name,
5951 dev->flags & IFF_PROMISC ? "entered" : "left");
David Howells8192b0c2008-11-14 10:39:10 +11005952 if (audit_enabled) {
5953 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05005954 audit_log(current->audit_context, GFP_ATOMIC,
5955 AUDIT_ANOM_PROMISCUOUS,
5956 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5957 dev->name, (dev->flags & IFF_PROMISC),
5958 (old_flags & IFF_PROMISC),
Eric W. Biedermane1760bd2012-09-10 22:39:43 -07005959 from_kuid(&init_user_ns, audit_get_loginuid(current)),
Eric W. Biedermand04a48b2012-05-23 17:01:57 -06005960 from_kuid(&init_user_ns, uid),
5961 from_kgid(&init_user_ns, gid),
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05005962 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11005963 }
Patrick McHardy24023452007-07-14 18:51:31 -07005964
Patrick McHardyb6c40d62008-10-07 15:26:48 -07005965 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07005966 }
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005967 if (notify)
5968 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
Wang Chendad9b332008-06-18 01:48:28 -07005969 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07005970}
5971
Linus Torvalds1da177e2005-04-16 15:20:36 -07005972/**
5973 * dev_set_promiscuity - update promiscuity count on a device
5974 * @dev: device
5975 * @inc: modifier
5976 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07005977 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005978 * remains above zero the interface remains promiscuous. Once it hits zero
5979 * the device reverts back to normal filtering operation. A negative inc
5980 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07005981 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005982 */
Wang Chendad9b332008-06-18 01:48:28 -07005983int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005984{
Eric Dumazetb536db92011-11-30 21:42:26 +00005985 unsigned int old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07005986 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005987
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005988 err = __dev_set_promiscuity(dev, inc, true);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07005989 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07005990 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07005991 if (dev->flags != old_flags)
5992 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07005993 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005994}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005995EXPORT_SYMBOL(dev_set_promiscuity);
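
/* Example sketch: a hypothetical tap/monitor feature flipping promiscuous
 * mode on attach and detach. The increments must balance, RTNL is required,
 * and the attach side can fail with -EOVERFLOW.
 */
static int example_monitor_attach(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_set_promiscuity(dev, 1);
}

static void example_monitor_detach(struct net_device *dev)
{
	ASSERT_RTNL();
	dev_set_promiscuity(dev, -1);
}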
Linus Torvalds1da177e2005-04-16 15:20:36 -07005996
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005997static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005998{
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02005999 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006000
Patrick McHardy24023452007-07-14 18:51:31 -07006001 ASSERT_RTNL();
6002
Linus Torvalds1da177e2005-04-16 15:20:36 -07006003 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07006004 dev->allmulti += inc;
6005 if (dev->allmulti == 0) {
6006 /*
6007 * Avoid overflow.
6008 * If inc causes overflow, untouch allmulti and return error.
6009 */
6010 if (inc < 0)
6011 dev->flags &= ~IFF_ALLMULTI;
6012 else {
6013 dev->allmulti -= inc;
Joe Perches7b6cd1c2012-02-01 10:54:43 +00006014 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6015 dev->name);
Wang Chendad9b332008-06-18 01:48:28 -07006016 return -EOVERFLOW;
6017 }
6018 }
Patrick McHardy24023452007-07-14 18:51:31 -07006019 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07006020 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07006021 dev_set_rx_mode(dev);
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02006022 if (notify)
6023 __dev_notify_flags(dev, old_flags,
6024 dev->gflags ^ old_gflags);
Patrick McHardy24023452007-07-14 18:51:31 -07006025 }
Wang Chendad9b332008-06-18 01:48:28 -07006026 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07006027}
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02006028
6029/**
6030 * dev_set_allmulti - update allmulti count on a device
6031 * @dev: device
6032 * @inc: modifier
6033 *
6034 * Add or remove reception of all multicast frames to a device. While the
6035 * count in the device remains above zero the interface remains listening
	6036	 * to all multicast frames. Once it hits zero the device reverts back to normal
6037 * filtering operation. A negative @inc value is used to drop the counter
6038 * when releasing a resource needing all multicasts.
6039 * Return 0 if successful or a negative errno code on error.
6040 */
6041
6042int dev_set_allmulti(struct net_device *dev, int inc)
6043{
6044 return __dev_set_allmulti(dev, inc, true);
6045}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006046EXPORT_SYMBOL(dev_set_allmulti);
Patrick McHardy4417da62007-06-27 01:28:10 -07006047
6048/*
6049 * Upload unicast and multicast address lists to device and
6050 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08006051 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07006052 * are present.
6053 */
6054void __dev_set_rx_mode(struct net_device *dev)
6055{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08006056 const struct net_device_ops *ops = dev->netdev_ops;
6057
Patrick McHardy4417da62007-06-27 01:28:10 -07006058 /* dev_open will call this function so the list will stay sane. */
6059 if (!(dev->flags&IFF_UP))
6060 return;
6061
6062 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09006063 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07006064
Jiri Pirko01789342011-08-16 06:29:00 +00006065 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
Patrick McHardy4417da62007-06-27 01:28:10 -07006066 /* Unicast addresses changes may only happen under the rtnl,
6067 * therefore calling __dev_set_promiscuity here is safe.
6068 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08006069 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02006070 __dev_set_promiscuity(dev, 1, false);
Joe Perches2d348d12011-07-25 16:17:35 -07006071 dev->uc_promisc = true;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08006072 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02006073 __dev_set_promiscuity(dev, -1, false);
Joe Perches2d348d12011-07-25 16:17:35 -07006074 dev->uc_promisc = false;
Patrick McHardy4417da62007-06-27 01:28:10 -07006075 }
Patrick McHardy4417da62007-06-27 01:28:10 -07006076 }
Jiri Pirko01789342011-08-16 06:29:00 +00006077
6078 if (ops->ndo_set_rx_mode)
6079 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07006080}
6081
6082void dev_set_rx_mode(struct net_device *dev)
6083{
David S. Millerb9e40852008-07-15 00:15:08 -07006084 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07006085 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07006086 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006087}
6088
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006089/**
6090 * dev_get_flags - get flags reported to userspace
6091 * @dev: device
6092 *
6093 * Get the combination of flag bits exported through APIs to userspace.
6094 */
Eric Dumazet95c96172012-04-15 05:58:06 +00006095unsigned int dev_get_flags(const struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006096{
Eric Dumazet95c96172012-04-15 05:58:06 +00006097 unsigned int flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006098
6099 flags = (dev->flags & ~(IFF_PROMISC |
6100 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08006101 IFF_RUNNING |
6102 IFF_LOWER_UP |
6103 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07006104 (dev->gflags & (IFF_PROMISC |
6105 IFF_ALLMULTI));
6106
Stefan Rompfb00055a2006-03-20 17:09:11 -08006107 if (netif_running(dev)) {
6108 if (netif_oper_up(dev))
6109 flags |= IFF_RUNNING;
6110 if (netif_carrier_ok(dev))
6111 flags |= IFF_LOWER_UP;
6112 if (netif_dormant(dev))
6113 flags |= IFF_DORMANT;
6114 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006115
6116 return flags;
6117}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006118EXPORT_SYMBOL(dev_get_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006119
Patrick McHardybd380812010-02-26 06:34:53 +00006120int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006121{
Eric Dumazetb536db92011-11-30 21:42:26 +00006122 unsigned int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00006123 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006124
Patrick McHardy24023452007-07-14 18:51:31 -07006125 ASSERT_RTNL();
6126
Linus Torvalds1da177e2005-04-16 15:20:36 -07006127 /*
6128 * Set the flags on our device.
6129 */
6130
6131 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6132 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6133 IFF_AUTOMEDIA)) |
6134 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6135 IFF_ALLMULTI));
6136
6137 /*
6138 * Load in the correct multicast list now the flags have changed.
6139 */
6140
Patrick McHardyb6c40d62008-10-07 15:26:48 -07006141 if ((old_flags ^ flags) & IFF_MULTICAST)
6142 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07006143
Patrick McHardy4417da62007-06-27 01:28:10 -07006144 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006145
6146 /*
	6147	 * Have we downed the interface? We handle IFF_UP ourselves
6148 * according to user attempts to set it, rather than blindly
6149 * setting it.
6150 */
6151
6152 ret = 0;
Peter Pan(潘卫平)d215d102014-06-16 21:57:22 +08006153 if ((old_flags ^ flags) & IFF_UP)
Patrick McHardybd380812010-02-26 06:34:53 +00006154 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006155
Linus Torvalds1da177e2005-04-16 15:20:36 -07006156 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006157 int inc = (flags & IFF_PROMISC) ? 1 : -1;
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02006158 unsigned int old_flags = dev->flags;
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006159
Linus Torvalds1da177e2005-04-16 15:20:36 -07006160 dev->gflags ^= IFF_PROMISC;
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02006161
6162 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6163 if (dev->flags != old_flags)
6164 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006165 }
6166
6167 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	6168	 is important. Some (broken) drivers set IFF_PROMISC when
	6169	 IFF_ALLMULTI is requested, without asking us and without reporting it.
6170 */
6171 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006172 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6173
Linus Torvalds1da177e2005-04-16 15:20:36 -07006174 dev->gflags ^= IFF_ALLMULTI;
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02006175 __dev_set_allmulti(dev, inc, false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006176 }
6177
Patrick McHardybd380812010-02-26 06:34:53 +00006178 return ret;
6179}
6180
Nicolas Dichtela528c212013-09-25 12:02:44 +02006181void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6182 unsigned int gchanges)
Patrick McHardybd380812010-02-26 06:34:53 +00006183{
6184 unsigned int changes = dev->flags ^ old_flags;
6185
Nicolas Dichtela528c212013-09-25 12:02:44 +02006186 if (gchanges)
Alexei Starovoitov7f294052013-10-23 16:02:42 -07006187 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
Nicolas Dichtela528c212013-09-25 12:02:44 +02006188
Patrick McHardybd380812010-02-26 06:34:53 +00006189 if (changes & IFF_UP) {
6190 if (dev->flags & IFF_UP)
6191 call_netdevice_notifiers(NETDEV_UP, dev);
6192 else
6193 call_netdevice_notifiers(NETDEV_DOWN, dev);
6194 }
6195
6196 if (dev->flags & IFF_UP &&
Jiri Pirkobe9efd32013-05-28 01:30:22 +00006197 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6198 struct netdev_notifier_change_info change_info;
6199
6200 change_info.flags_changed = changes;
6201 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6202 &change_info.info);
6203 }
Patrick McHardybd380812010-02-26 06:34:53 +00006204}
6205
6206/**
6207 * dev_change_flags - change device settings
6208 * @dev: device
6209 * @flags: device state flags
6210 *
6211 * Change settings on device based state flags. The flags are
6212 * in the userspace exported format.
6213 */
Eric Dumazetb536db92011-11-30 21:42:26 +00006214int dev_change_flags(struct net_device *dev, unsigned int flags)
Patrick McHardybd380812010-02-26 06:34:53 +00006215{
Eric Dumazetb536db92011-11-30 21:42:26 +00006216 int ret;
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02006217 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
Patrick McHardybd380812010-02-26 06:34:53 +00006218
6219 ret = __dev_change_flags(dev, flags);
6220 if (ret < 0)
6221 return ret;
6222
Nicolas Dichtel991fb3f2013-09-25 12:02:45 +02006223 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
Nicolas Dichtela528c212013-09-25 12:02:44 +02006224 __dev_notify_flags(dev, old_flags, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006225 return ret;
6226}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006227EXPORT_SYMBOL(dev_change_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006228
Veaceslav Falico2315dc92014-01-10 16:56:25 +01006229static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6230{
6231 const struct net_device_ops *ops = dev->netdev_ops;
6232
6233 if (ops->ndo_change_mtu)
6234 return ops->ndo_change_mtu(dev, new_mtu);
6235
6236 dev->mtu = new_mtu;
6237 return 0;
6238}
6239
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006240/**
6241 * dev_set_mtu - Change maximum transfer unit
6242 * @dev: device
6243 * @new_mtu: new transfer unit
6244 *
6245 * Change the maximum transfer size of the network device.
6246 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07006247int dev_set_mtu(struct net_device *dev, int new_mtu)
6248{
Veaceslav Falico2315dc92014-01-10 16:56:25 +01006249 int err, orig_mtu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006250
6251 if (new_mtu == dev->mtu)
6252 return 0;
6253
6254 /* MTU must be positive. */
6255 if (new_mtu < 0)
6256 return -EINVAL;
6257
6258 if (!netif_device_present(dev))
6259 return -ENODEV;
6260
Veaceslav Falico1d486bf2014-01-16 00:02:18 +01006261 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6262 err = notifier_to_errno(err);
6263 if (err)
6264 return err;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08006265
Veaceslav Falico2315dc92014-01-10 16:56:25 +01006266 orig_mtu = dev->mtu;
6267 err = __dev_set_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006268
Veaceslav Falico2315dc92014-01-10 16:56:25 +01006269 if (!err) {
6270 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6271 err = notifier_to_errno(err);
6272 if (err) {
6273 /* setting mtu back and notifying everyone again,
6274 * so that they have a chance to revert changes.
6275 */
6276 __dev_set_mtu(dev, orig_mtu);
6277 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6278 }
6279 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006280 return err;
6281}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006282EXPORT_SYMBOL(dev_set_mtu);
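
/* Example sketch: an administrative path changing a device's MTU from
 * process context, taking RTNL around the call. The device pointer and the
 * chosen MTU come from a hypothetical caller.
 */
static int example_propagate_mtu(struct net_device *dev, int new_mtu)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, new_mtu);
	rtnl_unlock();
	return err;
}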
Linus Torvalds1da177e2005-04-16 15:20:36 -07006283
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006284/**
Vlad Dogarucbda10f2011-01-13 23:38:30 +00006285 * dev_set_group - Change group this device belongs to
6286 * @dev: device
6287 * @new_group: group this device should belong to
6288 */
6289void dev_set_group(struct net_device *dev, int new_group)
6290{
6291 dev->group = new_group;
6292}
6293EXPORT_SYMBOL(dev_set_group);
6294
6295/**
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006296 * dev_set_mac_address - Change Media Access Control Address
6297 * @dev: device
6298 * @sa: new address
6299 *
6300 * Change the hardware (MAC) address of the device
6301 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07006302int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6303{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08006304 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006305 int err;
6306
Stephen Hemmingerd3147742008-11-19 21:32:24 -08006307 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006308 return -EOPNOTSUPP;
6309 if (sa->sa_family != dev->type)
6310 return -EINVAL;
6311 if (!netif_device_present(dev))
6312 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08006313 err = ops->ndo_set_mac_address(dev, sa);
Jiri Pirkof6521512013-01-01 03:30:14 +00006314 if (err)
6315 return err;
Jiri Pirkofbdeca22013-01-01 03:30:16 +00006316 dev->addr_assign_type = NET_ADDR_SET;
Jiri Pirkof6521512013-01-01 03:30:14 +00006317 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04006318 add_device_randomness(dev->dev_addr, dev->addr_len);
Jiri Pirkof6521512013-01-01 03:30:14 +00006319 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006320}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006321EXPORT_SYMBOL(dev_set_mac_address);
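
/* Example sketch: setting an Ethernet-style MAC from a caller that already
 * holds RTNL. The address source is hypothetical; sa_family must match
 * dev->type and the buffer must cover dev->addr_len bytes.
 */
static int example_set_mac(struct net_device *dev, const u8 *new_mac)
{
	struct sockaddr sa;

	ASSERT_RTNL();
	sa.sa_family = dev->type;
	memcpy(sa.sa_data, new_mac, dev->addr_len);
	return dev_set_mac_address(dev, &sa);
}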
Linus Torvalds1da177e2005-04-16 15:20:36 -07006322
Jiri Pirko4bf84c32012-12-27 23:49:37 +00006323/**
6324 * dev_change_carrier - Change device carrier
6325 * @dev: device
Randy Dunlap691b3b72013-03-04 12:32:43 +00006326 * @new_carrier: new value
Jiri Pirko4bf84c32012-12-27 23:49:37 +00006327 *
6328 * Change device carrier
6329 */
6330int dev_change_carrier(struct net_device *dev, bool new_carrier)
6331{
6332 const struct net_device_ops *ops = dev->netdev_ops;
6333
6334 if (!ops->ndo_change_carrier)
6335 return -EOPNOTSUPP;
6336 if (!netif_device_present(dev))
6337 return -ENODEV;
6338 return ops->ndo_change_carrier(dev, new_carrier);
6339}
6340EXPORT_SYMBOL(dev_change_carrier);
6341
Linus Torvalds1da177e2005-04-16 15:20:36 -07006342/**
Jiri Pirko66b52b02013-07-29 18:16:49 +02006343 * dev_get_phys_port_id - Get device physical port ID
6344 * @dev: device
6345 * @ppid: port ID
6346 *
6347 * Get device physical port ID
6348 */
6349int dev_get_phys_port_id(struct net_device *dev,
Jiri Pirko02637fc2014-11-28 14:34:16 +01006350 struct netdev_phys_item_id *ppid)
Jiri Pirko66b52b02013-07-29 18:16:49 +02006351{
6352 const struct net_device_ops *ops = dev->netdev_ops;
6353
6354 if (!ops->ndo_get_phys_port_id)
6355 return -EOPNOTSUPP;
6356 return ops->ndo_get_phys_port_id(dev, ppid);
6357}
6358EXPORT_SYMBOL(dev_get_phys_port_id);
6359
6360/**
David Aherndb24a902015-03-17 20:23:15 -06006361 * dev_get_phys_port_name - Get device physical port name
6362 * @dev: device
6363 * @name: port name
6364 *
6365 * Get device physical port name
6366 */
6367int dev_get_phys_port_name(struct net_device *dev,
6368 char *name, size_t len)
6369{
6370 const struct net_device_ops *ops = dev->netdev_ops;
6371
6372 if (!ops->ndo_get_phys_port_name)
6373 return -EOPNOTSUPP;
6374 return ops->ndo_get_phys_port_name(dev, name, len);
6375}
6376EXPORT_SYMBOL(dev_get_phys_port_name);
6377
6378/**
Anuradha Karuppiahd746d702015-07-14 13:43:19 -07006379 * dev_change_proto_down - update protocol port state information
6380 * @dev: device
6381 * @proto_down: new value
6382 *
6383 * This info can be used by switch drivers to set the phys state of the
6384 * port.
6385 */
6386int dev_change_proto_down(struct net_device *dev, bool proto_down)
6387{
6388 const struct net_device_ops *ops = dev->netdev_ops;
6389
6390 if (!ops->ndo_change_proto_down)
6391 return -EOPNOTSUPP;
6392 if (!netif_device_present(dev))
6393 return -ENODEV;
6394 return ops->ndo_change_proto_down(dev, proto_down);
6395}
6396EXPORT_SYMBOL(dev_change_proto_down);
6397
6398/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07006399 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07006400 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07006401 *
6402 * Returns a suitable unique value for a new device interface
6403 * number. The caller must hold the rtnl semaphore or the
6404 * dev_base_lock to be sure it remains unique.
6405 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07006406static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006407{
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00006408 int ifindex = net->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006409 for (;;) {
6410 if (++ifindex <= 0)
6411 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006412 if (!__dev_get_by_index(net, ifindex))
Pavel Emelyanovaa79e662012-08-08 21:53:19 +00006413 return net->ifindex = ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006414 }
6415}
6416
Linus Torvalds1da177e2005-04-16 15:20:36 -07006417/* Delayed registration/unregisteration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08006418static LIST_HEAD(net_todo_list);
Cong Wang200b9162014-05-12 15:11:20 -07006419DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006420
Stephen Hemminger6f05f622007-03-08 20:46:03 -08006421static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006422{
Linus Torvalds1da177e2005-04-16 15:20:36 -07006423 list_add_tail(&dev->todo_list, &net_todo_list);
Eric W. Biederman50624c92013-09-23 21:19:49 -07006424 dev_net(dev)->dev_unreg_count++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006425}
6426
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006427static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006428{
Krishna Kumare93737b2009-12-08 22:26:02 +00006429 struct net_device *dev, *tmp;
Eric W. Biederman5cde2822013-10-05 19:26:05 -07006430 LIST_HEAD(close_head);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006431
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006432 BUG_ON(dev_boot_phase);
6433 ASSERT_RTNL();
6434
Krishna Kumare93737b2009-12-08 22:26:02 +00006435 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006436		/* Some devices get here without ever having been
Krishna Kumare93737b2009-12-08 22:26:02 +00006437		 * registered, as part of their init error unwind. Remove
 6438		 * those devices and proceed with the remaining.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006439 */
6440 if (dev->reg_state == NETREG_UNINITIALIZED) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00006441 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6442 dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006443
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006444 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00006445 list_del(&dev->unreg_list);
6446 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006447 }
Eric Dumazet449f4542011-05-19 12:24:16 +00006448 dev->dismantle = true;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006449 BUG_ON(dev->reg_state != NETREG_REGISTERED);
Octavian Purdila44345722010-12-13 12:44:07 +00006450 }
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006451
Octavian Purdila44345722010-12-13 12:44:07 +00006452 /* If device is running, close it first. */
Eric W. Biederman5cde2822013-10-05 19:26:05 -07006453 list_for_each_entry(dev, head, unreg_list)
6454 list_add_tail(&dev->close_list, &close_head);
David S. Miller99c4a262015-03-18 22:52:33 -04006455 dev_close_many(&close_head, true);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006456
Octavian Purdila44345722010-12-13 12:44:07 +00006457 list_for_each_entry(dev, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006458 /* And unlink it from device chain. */
6459 unlist_netdevice(dev);
6460
6461 dev->reg_state = NETREG_UNREGISTERING;
Julian Anastasove9e4dd32015-07-09 09:59:09 +03006462 on_each_cpu(flush_backlog, dev, 1);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006463 }
6464
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006465 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006466
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006467 list_for_each_entry(dev, head, unreg_list) {
Mahesh Bandewar395eea62014-12-03 13:46:24 -08006468 struct sk_buff *skb = NULL;
6469
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006470 /* Shutdown queueing discipline. */
6471 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006472
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006473
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006474		/* Notify protocols that we are about to destroy
 6475		   this device. They should clean up all their state.
6476 */
6477 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6478
Mahesh Bandewar395eea62014-12-03 13:46:24 -08006479 if (!dev->rtnl_link_ops ||
6480 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6481 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6482 GFP_KERNEL);
6483
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006484 /*
6485 * Flush the unicast and multicast chains
6486 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00006487 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00006488 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006489
6490 if (dev->netdev_ops->ndo_uninit)
6491 dev->netdev_ops->ndo_uninit(dev);
6492
Mahesh Bandewar395eea62014-12-03 13:46:24 -08006493 if (skb)
6494 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
Roopa Prabhu56bfa7e2014-05-01 11:40:30 -07006495
Jiri Pirko9ff162a2013-01-03 22:48:49 +00006496 /* Notifier chain MUST detach us all upper devices. */
6497 WARN_ON(netdev_has_any_upper_dev(dev));
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006498
6499 /* Remove entries from kobject tree */
6500 netdev_unregister_kobject(dev);
Alexander Duyck024e9672013-01-10 08:57:46 +00006501#ifdef CONFIG_XPS
6502 /* Remove XPS queueing entries */
6503 netif_reset_xps_queues_gt(dev, 0);
6504#endif
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006505 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006506
Eric W. Biederman850a5452011-10-13 22:25:23 +00006507 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006508
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00006509 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006510 dev_put(dev);
6511}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006512
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006513static void rollback_registered(struct net_device *dev)
6514{
6515 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006516
Eric Dumazet9b5e3832009-10-27 07:04:19 +00006517 list_add(&dev->unreg_list, &single);
6518 rollback_registered_many(&single);
Eric Dumazetceaaec92011-02-17 22:59:19 +00006519 list_del(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006520}
6521
Jarod Wilsonfd867d52015-11-02 21:55:59 -05006522static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6523 struct net_device *upper, netdev_features_t features)
6524{
6525 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6526 netdev_features_t feature;
Jarod Wilson5ba3f7d2015-11-03 10:15:59 -05006527 int feature_bit;
Jarod Wilsonfd867d52015-11-02 21:55:59 -05006528
Jarod Wilson5ba3f7d2015-11-03 10:15:59 -05006529 for_each_netdev_feature(&upper_disables, feature_bit) {
6530 feature = __NETIF_F_BIT(feature_bit);
Jarod Wilsonfd867d52015-11-02 21:55:59 -05006531 if (!(upper->wanted_features & feature)
6532 && (features & feature)) {
6533 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6534 &feature, upper->name);
6535 features &= ~feature;
6536 }
6537 }
6538
6539 return features;
6540}
6541
6542static void netdev_sync_lower_features(struct net_device *upper,
6543 struct net_device *lower, netdev_features_t features)
6544{
6545 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6546 netdev_features_t feature;
Jarod Wilson5ba3f7d2015-11-03 10:15:59 -05006547 int feature_bit;
Jarod Wilsonfd867d52015-11-02 21:55:59 -05006548
Jarod Wilson5ba3f7d2015-11-03 10:15:59 -05006549 for_each_netdev_feature(&upper_disables, feature_bit) {
6550 feature = __NETIF_F_BIT(feature_bit);
Jarod Wilsonfd867d52015-11-02 21:55:59 -05006551 if (!(features & feature) && (lower->features & feature)) {
6552 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6553 &feature, lower->name);
6554 lower->wanted_features &= ~feature;
6555 netdev_update_features(lower);
6556
6557 if (unlikely(lower->features & feature))
6558 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6559 &feature, lower->name);
6560 }
6561 }
6562}
6563
Michał Mirosławc8f44af2011-11-15 15:29:55 +00006564static netdev_features_t netdev_fix_features(struct net_device *dev,
6565 netdev_features_t features)
Herbert Xub63365a2008-10-23 01:11:29 -07006566{
Michał Mirosław57422dc2011-01-22 12:14:12 +00006567 /* Fix illegal checksum combinations */
6568 if ((features & NETIF_F_HW_CSUM) &&
6569 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04006570 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
Michał Mirosław57422dc2011-01-22 12:14:12 +00006571 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6572 }
6573
Herbert Xub63365a2008-10-23 01:11:29 -07006574 /* TSO requires that SG is present as well. */
Ben Hutchingsea2d3682011-04-12 14:38:37 +00006575 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04006576 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
Ben Hutchingsea2d3682011-04-12 14:38:37 +00006577 features &= ~NETIF_F_ALL_TSO;
Herbert Xub63365a2008-10-23 01:11:29 -07006578 }
6579
Pravin B Shelarec5f0612013-03-07 09:28:01 +00006580 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6581 !(features & NETIF_F_IP_CSUM)) {
6582 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6583 features &= ~NETIF_F_TSO;
6584 features &= ~NETIF_F_TSO_ECN;
6585 }
6586
6587 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6588 !(features & NETIF_F_IPV6_CSUM)) {
6589 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6590 features &= ~NETIF_F_TSO6;
6591 }
6592
Ben Hutchings31d8b9e2011-04-12 14:47:15 +00006593 /* TSO ECN requires that TSO is present as well. */
6594 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6595 features &= ~NETIF_F_TSO_ECN;
6596
Michał Mirosław212b5732011-02-15 16:59:16 +00006597 /* Software GSO depends on SG. */
6598 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04006599 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
Michał Mirosław212b5732011-02-15 16:59:16 +00006600 features &= ~NETIF_F_GSO;
6601 }
6602
Michał Mirosławacd11302011-01-24 15:45:15 -08006603 /* UFO needs SG and checksumming */
Herbert Xub63365a2008-10-23 01:11:29 -07006604 if (features & NETIF_F_UFO) {
Michał Mirosław79032642010-11-30 06:38:00 +00006605 /* maybe split UFO into V4 and V6? */
Tom Herbertc8cd0982015-12-14 11:19:44 -08006606 if (!(features & NETIF_F_HW_CSUM) &&
6607 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6608 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04006609 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08006610 "Dropping NETIF_F_UFO since no checksum offload features.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07006611 features &= ~NETIF_F_UFO;
6612 }
6613
6614 if (!(features & NETIF_F_SG)) {
Michał Mirosław6f404e42011-05-16 15:14:21 -04006615 netdev_dbg(dev,
Michał Mirosławacd11302011-01-24 15:45:15 -08006616 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
Herbert Xub63365a2008-10-23 01:11:29 -07006617 features &= ~NETIF_F_UFO;
6618 }
6619 }
6620
Jiri Pirkod0290212014-04-02 23:09:31 +02006621#ifdef CONFIG_NET_RX_BUSY_POLL
6622 if (dev->netdev_ops->ndo_busy_poll)
6623 features |= NETIF_F_BUSY_POLL;
6624 else
6625#endif
6626 features &= ~NETIF_F_BUSY_POLL;
6627
Herbert Xub63365a2008-10-23 01:11:29 -07006628 return features;
6629}
Herbert Xub63365a2008-10-23 01:11:29 -07006630
Michał Mirosław6cb6a272011-04-02 22:48:47 -07006631int __netdev_update_features(struct net_device *dev)
Michał Mirosław5455c692011-02-15 16:59:17 +00006632{
Jarod Wilsonfd867d52015-11-02 21:55:59 -05006633 struct net_device *upper, *lower;
Michał Mirosławc8f44af2011-11-15 15:29:55 +00006634 netdev_features_t features;
Jarod Wilsonfd867d52015-11-02 21:55:59 -05006635 struct list_head *iter;
Jarod Wilsone7868a82015-11-03 23:09:32 -05006636 int err = -1;
Michał Mirosław5455c692011-02-15 16:59:17 +00006637
Michał Mirosław87267482011-04-12 09:56:38 +00006638 ASSERT_RTNL();
6639
Michał Mirosław5455c692011-02-15 16:59:17 +00006640 features = netdev_get_wanted_features(dev);
6641
6642 if (dev->netdev_ops->ndo_fix_features)
6643 features = dev->netdev_ops->ndo_fix_features(dev, features);
6644
6645 /* driver might be less strict about feature dependencies */
6646 features = netdev_fix_features(dev, features);
6647
Jarod Wilsonfd867d52015-11-02 21:55:59 -05006648	/* some features can't be enabled if they're off on an upper device */
6649 netdev_for_each_upper_dev_rcu(dev, upper, iter)
6650 features = netdev_sync_upper_features(dev, upper, features);
6651
Michał Mirosław5455c692011-02-15 16:59:17 +00006652 if (dev->features == features)
Jarod Wilsone7868a82015-11-03 23:09:32 -05006653 goto sync_lower;
Michał Mirosław5455c692011-02-15 16:59:17 +00006654
Michał Mirosławc8f44af2011-11-15 15:29:55 +00006655 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6656 &dev->features, &features);
Michał Mirosław5455c692011-02-15 16:59:17 +00006657
6658 if (dev->netdev_ops->ndo_set_features)
6659 err = dev->netdev_ops->ndo_set_features(dev, features);
Nikolay Aleksandrov5f8dc332015-11-13 14:54:01 +01006660 else
6661 err = 0;
Michał Mirosław5455c692011-02-15 16:59:17 +00006662
Michał Mirosław6cb6a272011-04-02 22:48:47 -07006663 if (unlikely(err < 0)) {
Michał Mirosław5455c692011-02-15 16:59:17 +00006664 netdev_err(dev,
Michał Mirosławc8f44af2011-11-15 15:29:55 +00006665 "set_features() failed (%d); wanted %pNF, left %pNF\n",
6666 err, &features, &dev->features);
Nikolay Aleksandrov17b85d22015-11-17 15:49:06 +01006667 /* return non-0 since some features might have changed and
6668 * it's better to fire a spurious notification than miss it
6669 */
6670 return -1;
Michał Mirosław6cb6a272011-04-02 22:48:47 -07006671 }
6672
Jarod Wilsone7868a82015-11-03 23:09:32 -05006673sync_lower:
Jarod Wilsonfd867d52015-11-02 21:55:59 -05006674 /* some features must be disabled on lower devices when disabled
6675 * on an upper device (think: bonding master or bridge)
6676 */
6677 netdev_for_each_lower_dev(dev, lower, iter)
6678 netdev_sync_lower_features(dev, lower, features);
6679
Michał Mirosław6cb6a272011-04-02 22:48:47 -07006680 if (!err)
6681 dev->features = features;
6682
Jarod Wilsone7868a82015-11-03 23:09:32 -05006683 return err < 0 ? 0 : 1;
Michał Mirosław6cb6a272011-04-02 22:48:47 -07006684}
6685
Michał Mirosławafe12cc2011-05-07 03:22:17 +00006686/**
6687 * netdev_update_features - recalculate device features
6688 * @dev: the device to check
6689 *
6690 * Recalculate dev->features set and send notifications if it
6691 * has changed. Should be called after driver or hardware dependent
6692 * conditions might have changed that influence the features.
6693 */
Michał Mirosław6cb6a272011-04-02 22:48:47 -07006694void netdev_update_features(struct net_device *dev)
6695{
6696 if (__netdev_update_features(dev))
6697 netdev_features_change(dev);
Michał Mirosław5455c692011-02-15 16:59:17 +00006698}
6699EXPORT_SYMBOL(netdev_update_features);
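
As a sketch of the intended calling pattern (assumptions: a driver-private test mode, and NETIF_F_RXCSUM being the affected feature), a driver that must withdraw an offload at runtime adjusts its feature masks and then lets the core recompute and notify:

/* Hypothetical driver-side sketch: must run with the RTNL lock held,
 * since __netdev_update_features() asserts it.
 */
static void example_set_loopback_test(struct net_device *dev, bool on)
{
	if (on)
		dev->hw_features &= ~NETIF_F_RXCSUM;
	else
		dev->hw_features |= NETIF_F_RXCSUM;

	netdev_update_features(dev);	/* recompute dev->features, notify if changed */
}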
6700
Linus Torvalds1da177e2005-04-16 15:20:36 -07006701/**
Michał Mirosławafe12cc2011-05-07 03:22:17 +00006702 * netdev_change_features - recalculate device features
6703 * @dev: the device to check
6704 *
 6705 *	Recalculate the dev->features set and send notifications even
 6706 *	if it has not changed. Should be called instead of
 6707 *	netdev_update_features() when dev->vlan_features might also
 6708 *	have changed, so that the changes can be propagated to stacked
 6709 *	VLAN devices.
6710 */
6711void netdev_change_features(struct net_device *dev)
6712{
6713 __netdev_update_features(dev);
6714 netdev_features_change(dev);
6715}
6716EXPORT_SYMBOL(netdev_change_features);
6717
6718/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08006719 * netif_stacked_transfer_operstate - transfer operstate
6720 * @rootdev: the root or lower level device to transfer state from
6721 * @dev: the device to transfer operstate to
6722 *
6723 * Transfer operational state from root to device. This is normally
6724 * called when a stacking relationship exists between the root
 6725 *	device and the device (a leaf device).
6726 */
6727void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6728 struct net_device *dev)
6729{
6730 if (rootdev->operstate == IF_OPER_DORMANT)
6731 netif_dormant_on(dev);
6732 else
6733 netif_dormant_off(dev);
6734
6735 if (netif_carrier_ok(rootdev)) {
6736 if (!netif_carrier_ok(dev))
6737 netif_carrier_on(dev);
6738 } else {
6739 if (netif_carrier_ok(dev))
6740 netif_carrier_off(dev);
6741 }
6742}
6743EXPORT_SYMBOL(netif_stacked_transfer_operstate);
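
A hedged sketch of the usual caller shape: a stacking driver's netdevice notifier mirrors the lower device's state onto its virtual upper device whenever the lower device changes; example_upper_from_lower() is an invented lookup helper:

/* Hypothetical stacking-driver sketch (compare 8021q/macvlan): keep the
 * virtual device's operstate in sync with the real device underneath.
 */
static int example_device_event(struct notifier_block *unused,
				unsigned long event, void *ptr)
{
	struct net_device *lower = netdev_notifier_info_to_dev(ptr);
	struct net_device *upper = example_upper_from_lower(lower);

	if (upper && event == NETDEV_CHANGE)
		netif_stacked_transfer_operstate(lower, upper);

	return NOTIFY_DONE;
}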
6744
Michael Daltona953be52014-01-16 22:23:28 -08006745#ifdef CONFIG_SYSFS
Eric Dumazet1b4bf462010-09-23 17:26:35 +00006746static int netif_alloc_rx_queues(struct net_device *dev)
6747{
Eric Dumazet1b4bf462010-09-23 17:26:35 +00006748 unsigned int i, count = dev->num_rx_queues;
Tom Herbertbd25fa72010-10-18 18:00:16 +00006749 struct netdev_rx_queue *rx;
Pankaj Gupta10595902015-01-12 11:41:28 +05306750 size_t sz = count * sizeof(*rx);
Eric Dumazet1b4bf462010-09-23 17:26:35 +00006751
Tom Herbertbd25fa72010-10-18 18:00:16 +00006752 BUG_ON(count < 1);
Eric Dumazet1b4bf462010-09-23 17:26:35 +00006753
Pankaj Gupta10595902015-01-12 11:41:28 +05306754 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6755 if (!rx) {
6756 rx = vzalloc(sz);
6757 if (!rx)
6758 return -ENOMEM;
6759 }
Tom Herbertbd25fa72010-10-18 18:00:16 +00006760 dev->_rx = rx;
6761
Tom Herbertbd25fa72010-10-18 18:00:16 +00006762 for (i = 0; i < count; i++)
Tom Herbertfe822242010-11-09 10:47:38 +00006763 rx[i].dev = dev;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00006764 return 0;
6765}
Tom Herbertbf264142010-11-26 08:36:09 +00006766#endif
Eric Dumazet1b4bf462010-09-23 17:26:35 +00006767
Changli Gaoaa942102010-12-04 02:31:41 +00006768static void netdev_init_one_queue(struct net_device *dev,
6769 struct netdev_queue *queue, void *_unused)
6770{
6771 /* Initialize queue lock */
6772 spin_lock_init(&queue->_xmit_lock);
6773 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6774 queue->xmit_lock_owner = -1;
Changli Gaob236da62010-12-14 03:09:15 +00006775 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
Changli Gaoaa942102010-12-04 02:31:41 +00006776 queue->dev = dev;
Tom Herbert114cf582011-11-28 16:33:09 +00006777#ifdef CONFIG_BQL
6778 dql_init(&queue->dql, HZ);
6779#endif
Changli Gaoaa942102010-12-04 02:31:41 +00006780}
6781
Eric Dumazet60877a32013-06-20 01:15:51 -07006782static void netif_free_tx_queues(struct net_device *dev)
6783{
WANG Cong4cb28972014-06-02 15:55:22 -07006784 kvfree(dev->_tx);
Eric Dumazet60877a32013-06-20 01:15:51 -07006785}
6786
Tom Herberte6484932010-10-18 18:04:39 +00006787static int netif_alloc_netdev_queues(struct net_device *dev)
6788{
6789 unsigned int count = dev->num_tx_queues;
6790 struct netdev_queue *tx;
Eric Dumazet60877a32013-06-20 01:15:51 -07006791 size_t sz = count * sizeof(*tx);
Tom Herberte6484932010-10-18 18:04:39 +00006792
Eric Dumazetd3397272015-07-06 17:13:26 +02006793 if (count < 1 || count > 0xffff)
6794 return -EINVAL;
Tom Herberte6484932010-10-18 18:04:39 +00006795
Eric Dumazet60877a32013-06-20 01:15:51 -07006796 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6797 if (!tx) {
6798 tx = vzalloc(sz);
6799 if (!tx)
6800 return -ENOMEM;
6801 }
Tom Herberte6484932010-10-18 18:04:39 +00006802 dev->_tx = tx;
Tom Herbert1d24eb42010-11-21 13:17:27 +00006803
Tom Herberte6484932010-10-18 18:04:39 +00006804 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6805 spin_lock_init(&dev->tx_global_lock);
Changli Gaoaa942102010-12-04 02:31:41 +00006806
6807 return 0;
Tom Herberte6484932010-10-18 18:04:39 +00006808}
6809
Denys Vlasenkoa2029242015-05-11 21:17:53 +02006810void netif_tx_stop_all_queues(struct net_device *dev)
6811{
6812 unsigned int i;
6813
6814 for (i = 0; i < dev->num_tx_queues; i++) {
6815 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6816 netif_tx_stop_queue(txq);
6817 }
6818}
6819EXPORT_SYMBOL(netif_tx_stop_all_queues);
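
For illustration, the typical use is to quiesce transmit around a hardware reset or before tearing a device down; example_hw_reinit() is an invented helper:

/* Hypothetical driver reset-path sketch: stop all TX queues, reprogram
 * the hardware, then let the stack transmit again.
 */
static void example_reset_task(struct net_device *dev)
{
	netif_tx_stop_all_queues(dev);
	example_hw_reinit(netdev_priv(dev));
	netif_tx_wake_all_queues(dev);
}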
6820
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08006821/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07006822 * register_netdevice - register a network device
6823 * @dev: device to register
6824 *
6825 * Take a completed network device structure and add it to the kernel
6826 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6827 * chain. 0 is returned on success. A negative errno code is returned
6828 * on a failure to set up the device, or if the name is a duplicate.
6829 *
6830 * Callers must hold the rtnl semaphore. You may want
6831 * register_netdev() instead of this.
6832 *
6833 * BUGS:
6834 * The locking appears insufficient to guarantee two parallel registers
6835 * will not get the same name.
6836 */
6837
6838int register_netdevice(struct net_device *dev)
6839{
Linus Torvalds1da177e2005-04-16 15:20:36 -07006840 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08006841 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006842
6843 BUG_ON(dev_boot_phase);
6844 ASSERT_RTNL();
6845
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07006846 might_sleep();
6847
Linus Torvalds1da177e2005-04-16 15:20:36 -07006848 /* When net_device's are persistent, this will be fatal. */
6849 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08006850 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006851
David S. Millerf1f28aa2008-07-15 00:08:33 -07006852 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07006853 netdev_set_addr_lockdep_class(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006854
Gao feng828de4f2012-09-13 20:58:27 +00006855 ret = dev_get_valid_name(net, dev, dev->name);
Peter Pan(潘卫平)0696c3a2011-05-12 15:46:56 +00006856 if (ret < 0)
6857 goto out;
6858
Linus Torvalds1da177e2005-04-16 15:20:36 -07006859 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08006860 if (dev->netdev_ops->ndo_init) {
6861 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006862 if (ret) {
6863 if (ret > 0)
6864 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08006865 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006866 }
6867 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09006868
Patrick McHardyf6469682013-04-19 02:04:27 +00006869 if (((dev->hw_features | dev->features) &
6870 NETIF_F_HW_VLAN_CTAG_FILTER) &&
Michał Mirosławd2ed2732013-01-29 15:14:16 +00006871 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6872 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6873 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6874 ret = -EINVAL;
6875 goto err_uninit;
6876 }
6877
Pavel Emelyanov9c7dafb2012-08-08 21:52:46 +00006878 ret = -EBUSY;
6879 if (!dev->ifindex)
6880 dev->ifindex = dev_new_index(net);
6881 else if (__dev_get_by_index(net, dev->ifindex))
6882 goto err_uninit;
6883
Michał Mirosław5455c692011-02-15 16:59:17 +00006884 /* Transfer changeable features to wanted_features and enable
6885 * software offloads (GSO and GRO).
6886 */
6887 dev->hw_features |= NETIF_F_SOFT_FEATURES;
Michał Mirosław14d12322011-02-22 16:52:28 +00006888 dev->features |= NETIF_F_SOFT_FEATURES;
6889 dev->wanted_features = dev->features & dev->hw_features;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006890
Michał Mirosław34324dc2011-11-15 15:29:55 +00006891 if (!(dev->flags & IFF_LOOPBACK)) {
6892 dev->hw_features |= NETIF_F_NOCACHE_COPY;
Tom Herbertc6e1a0d2011-04-04 22:30:30 -07006893 }
6894
Michał Mirosław1180e7d2011-07-14 14:41:11 -07006895 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
Brandon Philips16c3ea72010-09-15 09:24:24 +00006896 */
Michał Mirosław1180e7d2011-07-14 14:41:11 -07006897 dev->vlan_features |= NETIF_F_HIGHDMA;
Brandon Philips16c3ea72010-09-15 09:24:24 +00006898
Pravin B Shelaree579672013-03-07 09:28:08 +00006899 /* Make NETIF_F_SG inheritable to tunnel devices.
6900 */
6901 dev->hw_enc_features |= NETIF_F_SG;
6902
Simon Horman0d89d202013-05-23 21:02:52 +00006903 /* Make NETIF_F_SG inheritable to MPLS.
6904 */
6905 dev->mpls_features |= NETIF_F_SG;
6906
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00006907 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6908 ret = notifier_to_errno(ret);
6909 if (ret)
6910 goto err_uninit;
6911
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006912 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07006913 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07006914 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07006915 dev->reg_state = NETREG_REGISTERED;
6916
Michał Mirosław6cb6a272011-04-02 22:48:47 -07006917 __netdev_update_features(dev);
Michał Mirosław8e9b59b2011-02-22 16:52:28 +00006918
Linus Torvalds1da177e2005-04-16 15:20:36 -07006919 /*
6920 * Default initial state at registry is that the
6921 * device is present.
6922 */
6923
6924 set_bit(__LINK_STATE_PRESENT, &dev->state);
6925
Ben Hutchings8f4cccb2012-08-20 22:16:51 +01006926 linkwatch_init_dev(dev);
6927
Linus Torvalds1da177e2005-04-16 15:20:36 -07006928 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006929 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006930 list_netdevice(dev);
Theodore Ts'o7bf23572012-07-04 21:23:25 -04006931 add_device_randomness(dev->dev_addr, dev->addr_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006932
Jiri Pirko948b3372013-01-08 01:38:25 +00006933 /* If the device has permanent device address, driver should
6934 * set dev_addr and also addr_assign_type should be set to
6935 * NET_ADDR_PERM (default value).
6936 */
6937 if (dev->addr_assign_type == NET_ADDR_PERM)
6938 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6939
Linus Torvalds1da177e2005-04-16 15:20:36 -07006940 /* Notify protocols, that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07006941 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07006942 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07006943 if (ret) {
6944 rollback_registered(dev);
6945 dev->reg_state = NETREG_UNREGISTERED;
6946 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00006947 /*
6948 * Prevent userspace races by waiting until the network
6949 * device is fully setup before sending notifications.
6950 */
Patrick McHardya2835762010-02-26 06:34:51 +00006951 if (!dev->rtnl_link_ops ||
6952 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
Alexei Starovoitov7f294052013-10-23 16:02:42 -07006953 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006954
6955out:
6956 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07006957
6958err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08006959 if (dev->netdev_ops->ndo_uninit)
6960 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07006961 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006962}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006963EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006964
6965/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08006966 * init_dummy_netdev - init a dummy network device for NAPI
6967 * @dev: device to init
6968 *
 6969 *	This takes a network device structure and initializes the minimum
 6970 *	number of fields so it can be used to schedule NAPI polls without
6971 * registering a full blown interface. This is to be used by drivers
6972 * that need to tie several hardware interfaces to a single NAPI
6973 * poll scheduler due to HW limitations.
6974 */
6975int init_dummy_netdev(struct net_device *dev)
6976{
6977 /* Clear everything. Note we don't initialize spinlocks
 6978	 * as they aren't supposed to be taken by any of the
6979 * NAPI code and this dummy netdev is supposed to be
6980 * only ever used for NAPI polls
6981 */
6982 memset(dev, 0, sizeof(struct net_device));
6983
6984 /* make sure we BUG if trying to hit standard
6985 * register/unregister code path
6986 */
6987 dev->reg_state = NETREG_DUMMY;
6988
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08006989 /* NAPI wants this */
6990 INIT_LIST_HEAD(&dev->napi_list);
6991
6992 /* a dummy interface is started by default */
6993 set_bit(__LINK_STATE_PRESENT, &dev->state);
6994 set_bit(__LINK_STATE_START, &dev->state);
6995
Eric Dumazet29b44332010-10-11 10:22:12 +00006996	/* Note : We don't allocate pcpu_refcnt for dummy devices,
 6997	 * because users of this 'device' don't need to change
6998 * its refcount.
6999 */
7000
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08007001 return 0;
7002}
7003EXPORT_SYMBOL_GPL(init_dummy_netdev);
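
A hedged sketch of the pattern this helper exists for: a multi-port adapter embeds a never-registered net_device purely so that one NAPI context can service several hardware ports; example_poll() and the weight of 64 are illustrative choices:

/* Hypothetical adapter sketch: the dummy netdev is never registered and
 * only anchors the NAPI instance shared by the hardware ports.
 */
struct example_adapter {
	struct net_device napi_dev;	/* dummy, see init_dummy_netdev() */
	struct napi_struct napi;
};

static int example_adapter_setup_napi(struct example_adapter *adap)
{
	init_dummy_netdev(&adap->napi_dev);
	netif_napi_add(&adap->napi_dev, &adap->napi, example_poll, 64);
	napi_enable(&adap->napi);
	return 0;
}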
7004
7005
7006/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07007007 * register_netdev - register a network device
7008 * @dev: device to register
7009 *
7010 * Take a completed network device structure and add it to the kernel
7011 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7012 * chain. 0 is returned on success. A negative errno code is returned
7013 * on a failure to set up the device, or if the name is a duplicate.
7014 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07007015 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07007016 * and expands the device name if you passed a format string to
7017 * alloc_netdev.
7018 */
7019int register_netdev(struct net_device *dev)
7020{
7021 int err;
7022
7023 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07007024 err = register_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007025 rtnl_unlock();
7026 return err;
7027}
7028EXPORT_SYMBOL(register_netdev);
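
A minimal probe-path sketch under stated assumptions (struct example_priv and example_setup() are invented; error handling is reduced to the essentials): allocate the device, register it, and free it again if registration fails:

/* Hypothetical probe sketch: register_netdev() takes the RTNL lock
 * itself, so it must be called without it held.
 */
static int example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev(sizeof(struct example_priv), "ex%d",
			   NET_NAME_UNKNOWN, example_setup);
	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);
	if (err) {
		free_netdev(dev);
		return err;
	}
	return 0;
}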
7029
Eric Dumazet29b44332010-10-11 10:22:12 +00007030int netdev_refcnt_read(const struct net_device *dev)
7031{
7032 int i, refcnt = 0;
7033
7034 for_each_possible_cpu(i)
7035 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7036 return refcnt;
7037}
7038EXPORT_SYMBOL(netdev_refcnt_read);
7039
Ben Hutchings2c530402012-07-10 10:55:09 +00007040/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07007041 * netdev_wait_allrefs - wait until all references are gone.
Randy Dunlap3de7a372012-08-18 14:36:44 +00007042 * @dev: target net_device
Linus Torvalds1da177e2005-04-16 15:20:36 -07007043 *
7044 * This is called when unregistering network devices.
7045 *
7046 * Any protocol or device that holds a reference should register
7047 * for netdevice notification, and cleanup and put back the
7048 * reference if they receive an UNREGISTER event.
7049 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09007050 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07007051 */
7052static void netdev_wait_allrefs(struct net_device *dev)
7053{
7054 unsigned long rebroadcast_time, warning_time;
Eric Dumazet29b44332010-10-11 10:22:12 +00007055 int refcnt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007056
Eric Dumazete014deb2009-11-17 05:59:21 +00007057 linkwatch_forget_dev(dev);
7058
Linus Torvalds1da177e2005-04-16 15:20:36 -07007059 rebroadcast_time = warning_time = jiffies;
Eric Dumazet29b44332010-10-11 10:22:12 +00007060 refcnt = netdev_refcnt_read(dev);
7061
7062 while (refcnt != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07007063 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08007064 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07007065
7066 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07007067 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007068
Eric Dumazet748e2d92012-08-22 21:50:59 +00007069 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00007070 rcu_barrier();
Eric Dumazet748e2d92012-08-22 21:50:59 +00007071 rtnl_lock();
7072
Eric Dumazet0115e8e2012-08-22 17:19:46 +00007073 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007074 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7075 &dev->state)) {
7076 /* We must not have linkwatch events
7077 * pending on unregister. If this
7078 * happens, we simply run the queue
7079 * unscheduled, resulting in a noop
7080 * for this device.
7081 */
7082 linkwatch_run_queue();
7083 }
7084
Stephen Hemminger6756ae42006-03-20 22:23:58 -08007085 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07007086
7087 rebroadcast_time = jiffies;
7088 }
7089
7090 msleep(250);
7091
Eric Dumazet29b44332010-10-11 10:22:12 +00007092 refcnt = netdev_refcnt_read(dev);
7093
Linus Torvalds1da177e2005-04-16 15:20:36 -07007094 if (time_after(jiffies, warning_time + 10 * HZ)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00007095 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7096 dev->name, refcnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007097 warning_time = jiffies;
7098 }
7099 }
7100}
7101
7102/* The sequence is:
7103 *
7104 * rtnl_lock();
7105 * ...
7106 * register_netdevice(x1);
7107 * register_netdevice(x2);
7108 * ...
7109 * unregister_netdevice(y1);
7110 * unregister_netdevice(y2);
7111 * ...
7112 * rtnl_unlock();
7113 * free_netdev(y1);
7114 * free_netdev(y2);
7115 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07007116 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07007117 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07007118 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07007119 * without deadlocking with linkwatch via keventd.
7120 * 2) Since we run with the RTNL semaphore not held, we can sleep
7121 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07007122 *
7123 * We must not return until all unregister events added during
7124 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07007125 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07007126void netdev_run_todo(void)
7127{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07007128 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007129
Linus Torvalds1da177e2005-04-16 15:20:36 -07007130 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07007131 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07007132
7133 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07007134
Eric Dumazet0115e8e2012-08-22 17:19:46 +00007135
7136 /* Wait for rcu callbacks to finish before next phase */
Eric W. Biederman850a5452011-10-13 22:25:23 +00007137 if (!list_empty(&list))
7138 rcu_barrier();
7139
Linus Torvalds1da177e2005-04-16 15:20:36 -07007140 while (!list_empty(&list)) {
7141 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00007142 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007143 list_del(&dev->todo_list);
7144
Eric Dumazet748e2d92012-08-22 21:50:59 +00007145 rtnl_lock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00007146 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Eric Dumazet748e2d92012-08-22 21:50:59 +00007147 __rtnl_unlock();
Eric Dumazet0115e8e2012-08-22 17:19:46 +00007148
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07007149 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00007150 pr_err("network todo '%s' but state %d\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07007151 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07007152 dump_stack();
7153 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007154 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07007155
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07007156 dev->reg_state = NETREG_UNREGISTERED;
7157
7158 netdev_wait_allrefs(dev);
7159
7160 /* paranoia */
Eric Dumazet29b44332010-10-11 10:22:12 +00007161 BUG_ON(netdev_refcnt_read(dev));
Salam Noureddine7866a622015-01-27 11:35:48 -08007162 BUG_ON(!list_empty(&dev->ptype_all));
7163 BUG_ON(!list_empty(&dev->ptype_specific));
Eric Dumazet33d480c2011-08-11 19:30:52 +00007164 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7165 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07007166 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07007167
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07007168 if (dev->destructor)
7169 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07007170
Eric W. Biederman50624c92013-09-23 21:19:49 -07007171 /* Report a network device has been unregistered */
7172 rtnl_lock();
7173 dev_net(dev)->dev_unreg_count--;
7174 __rtnl_unlock();
7175 wake_up(&netdev_unregistering_wq);
7176
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07007177 /* Free network device */
7178 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007179 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07007180}
7181
Ben Hutchings3cfde792010-07-09 09:11:52 +00007182/* Convert net_device_stats to rtnl_link_stats64. They have the same
7183 * fields in the same order, with only the type differing.
7184 */
Eric Dumazet77a1abf2012-03-05 04:50:09 +00007185void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7186 const struct net_device_stats *netdev_stats)
Ben Hutchings3cfde792010-07-09 09:11:52 +00007187{
7188#if BITS_PER_LONG == 64
Eric Dumazet77a1abf2012-03-05 04:50:09 +00007189 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
7190 memcpy(stats64, netdev_stats, sizeof(*stats64));
Ben Hutchings3cfde792010-07-09 09:11:52 +00007191#else
7192 size_t i, n = sizeof(*stats64) / sizeof(u64);
7193 const unsigned long *src = (const unsigned long *)netdev_stats;
7194 u64 *dst = (u64 *)stats64;
7195
7196 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
7197 sizeof(*stats64) / sizeof(u64));
7198 for (i = 0; i < n; i++)
7199 dst[i] = src[i];
7200#endif
7201}
Eric Dumazet77a1abf2012-03-05 04:50:09 +00007202EXPORT_SYMBOL(netdev_stats_to_stats64);
Ben Hutchings3cfde792010-07-09 09:11:52 +00007203
Eric Dumazetd83345a2009-11-16 03:36:51 +00007204/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08007205 * dev_get_stats - get network device statistics
7206 * @dev: device to get statistics from
Eric Dumazet28172732010-07-07 14:58:56 -07007207 * @storage: place to store stats
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08007208 *
Ben Hutchingsd7753512010-07-09 09:12:41 +00007209 * Get network statistics from device. Return @storage.
7210 * The device driver may provide its own method by setting
7211 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7212 * otherwise the internal statistics structure is used.
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08007213 */
Ben Hutchingsd7753512010-07-09 09:12:41 +00007214struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7215 struct rtnl_link_stats64 *storage)
Eric Dumazet7004bf22009-05-18 00:34:33 +00007216{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08007217 const struct net_device_ops *ops = dev->netdev_ops;
7218
Eric Dumazet28172732010-07-07 14:58:56 -07007219 if (ops->ndo_get_stats64) {
7220 memset(storage, 0, sizeof(*storage));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00007221 ops->ndo_get_stats64(dev, storage);
7222 } else if (ops->ndo_get_stats) {
Ben Hutchings3cfde792010-07-09 09:11:52 +00007223 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00007224 } else {
7225 netdev_stats_to_stats64(storage, &dev->stats);
Eric Dumazet28172732010-07-07 14:58:56 -07007226 }
Eric Dumazetcaf586e2010-09-30 21:06:55 +00007227 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
Eric Dumazet015f0682014-03-27 08:45:56 -07007228 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
Eric Dumazet28172732010-07-07 14:58:56 -07007229 return storage;
Rusty Russellc45d2862007-03-28 14:29:08 -07007230}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08007231EXPORT_SYMBOL(dev_get_stats);
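
On the driver side, a hedged sketch of the preferred 64-bit callback (the counters in struct example_priv are invented; in this kernel the hook still returns the storage pointer it was given):

/* Hypothetical driver sketch: dev_get_stats() zeroes @storage before
 * calling in, so the driver only fills in the counters it maintains.
 */
static struct rtnl_link_stats64 *
example_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
{
	struct example_priv *priv = netdev_priv(dev);

	storage->rx_packets = priv->rx_packets;
	storage->rx_bytes   = priv->rx_bytes;
	storage->tx_packets = priv->tx_packets;
	storage->tx_bytes   = priv->tx_bytes;
	return storage;
}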
Rusty Russellc45d2862007-03-28 14:29:08 -07007232
Eric Dumazet24824a02010-10-02 06:11:55 +00007233struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
David S. Millerdc2b4842008-07-08 17:18:23 -07007234{
Eric Dumazet24824a02010-10-02 06:11:55 +00007235 struct netdev_queue *queue = dev_ingress_queue(dev);
David S. Millerdc2b4842008-07-08 17:18:23 -07007236
Eric Dumazet24824a02010-10-02 06:11:55 +00007237#ifdef CONFIG_NET_CLS_ACT
7238 if (queue)
7239 return queue;
7240 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7241 if (!queue)
7242 return NULL;
7243 netdev_init_one_queue(dev, queue, NULL);
Eric Dumazet2ce1ee12015-02-04 13:37:44 -08007244 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
Eric Dumazet24824a02010-10-02 06:11:55 +00007245 queue->qdisc_sleeping = &noop_qdisc;
7246 rcu_assign_pointer(dev->ingress_queue, queue);
7247#endif
7248 return queue;
David S. Millerbb949fb2008-07-08 16:55:56 -07007249}
7250
Eric Dumazet2c60db02012-09-16 09:17:26 +00007251static const struct ethtool_ops default_ethtool_ops;
7252
Stanislaw Gruszkad07d7502013-01-10 23:19:10 +00007253void netdev_set_default_ethtool_ops(struct net_device *dev,
7254 const struct ethtool_ops *ops)
7255{
7256 if (dev->ethtool_ops == &default_ethtool_ops)
7257 dev->ethtool_ops = ops;
7258}
7259EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7260
Eric Dumazet74d332c2013-10-30 13:10:44 -07007261void netdev_freemem(struct net_device *dev)
7262{
7263 char *addr = (char *)dev - dev->padded;
7264
WANG Cong4cb28972014-06-02 15:55:22 -07007265 kvfree(addr);
Eric Dumazet74d332c2013-10-30 13:10:44 -07007266}
7267
Linus Torvalds1da177e2005-04-16 15:20:36 -07007268/**
Tom Herbert36909ea2011-01-09 19:36:31 +00007269 * alloc_netdev_mqs - allocate network device
Tom Gundersenc835a672014-07-14 16:37:24 +02007270 * @sizeof_priv: size of private data to allocate space for
7271 * @name: device name format string
7272 * @name_assign_type: origin of device name
7273 * @setup: callback to initialize device
7274 * @txqs: the number of TX subqueues to allocate
7275 * @rxqs: the number of RX subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07007276 *
7277 * Allocates a struct net_device with private data area for driver use
Li Zhong90e51ad2013-11-22 15:04:46 +08007278 * and performs basic initialization. Also allocates subqueue structs
Tom Herbert36909ea2011-01-09 19:36:31 +00007279 * for each queue on the device.
Linus Torvalds1da177e2005-04-16 15:20:36 -07007280 */
Tom Herbert36909ea2011-01-09 19:36:31 +00007281struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
Tom Gundersenc835a672014-07-14 16:37:24 +02007282 unsigned char name_assign_type,
Tom Herbert36909ea2011-01-09 19:36:31 +00007283 void (*setup)(struct net_device *),
7284 unsigned int txqs, unsigned int rxqs)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007285{
Linus Torvalds1da177e2005-04-16 15:20:36 -07007286 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07007287 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00007288 struct net_device *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007289
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07007290 BUG_ON(strlen(name) >= sizeof(dev->name));
7291
Tom Herbert36909ea2011-01-09 19:36:31 +00007292 if (txqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00007293 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
Tom Herbert55513fb2010-10-18 17:55:58 +00007294 return NULL;
7295 }
7296
Michael Daltona953be52014-01-16 22:23:28 -08007297#ifdef CONFIG_SYSFS
Tom Herbert36909ea2011-01-09 19:36:31 +00007298 if (rxqs < 1) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00007299 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
Tom Herbert36909ea2011-01-09 19:36:31 +00007300 return NULL;
7301 }
7302#endif
7303
David S. Millerfd2ea0a2008-07-17 01:56:23 -07007304 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07007305 if (sizeof_priv) {
7306 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00007307 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07007308 alloc_size += sizeof_priv;
7309 }
7310 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00007311 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007312
Eric Dumazet74d332c2013-10-30 13:10:44 -07007313 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7314 if (!p)
7315 p = vzalloc(alloc_size);
Joe Perches62b59422013-02-04 16:48:16 +00007316 if (!p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007317 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007318
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00007319 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007320 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00007321
Eric Dumazet29b44332010-10-11 10:22:12 +00007322 dev->pcpu_refcnt = alloc_percpu(int);
7323 if (!dev->pcpu_refcnt)
Eric Dumazet74d332c2013-10-30 13:10:44 -07007324 goto free_dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00007325
Linus Torvalds1da177e2005-04-16 15:20:36 -07007326 if (dev_addr_init(dev))
Eric Dumazet29b44332010-10-11 10:22:12 +00007327 goto free_pcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007328
Jiri Pirko22bedad32010-04-01 21:22:57 +00007329 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00007330 dev_uc_init(dev);
Jiri Pirkoccffad252009-05-22 23:22:17 +00007331
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09007332 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007333
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07007334 dev->gso_max_size = GSO_MAX_SIZE;
Ben Hutchings30b678d2012-07-30 15:57:00 +00007335 dev->gso_max_segs = GSO_MAX_SEGS;
Eric Dumazetfcbeb972014-10-05 10:11:27 -07007336 dev->gso_min_segs = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007337
Herbert Xud565b0a2008-12-15 23:38:52 -08007338 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00007339 INIT_LIST_HEAD(&dev->unreg_list);
Eric W. Biederman5cde2822013-10-05 19:26:05 -07007340 INIT_LIST_HEAD(&dev->close_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00007341 INIT_LIST_HEAD(&dev->link_watch_list);
Veaceslav Falico2f268f12013-09-25 09:20:07 +02007342 INIT_LIST_HEAD(&dev->adj_list.upper);
7343 INIT_LIST_HEAD(&dev->adj_list.lower);
7344 INIT_LIST_HEAD(&dev->all_adj_list.upper);
7345 INIT_LIST_HEAD(&dev->all_adj_list.lower);
Salam Noureddine7866a622015-01-27 11:35:48 -08007346 INIT_LIST_HEAD(&dev->ptype_all);
7347 INIT_LIST_HEAD(&dev->ptype_specific);
Eric Dumazet02875872014-10-05 18:38:35 -07007348 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007349 setup(dev);
David S. Miller8d3bdbd2011-02-08 15:02:50 -08007350
Phil Sutter906470c2015-08-18 10:30:48 +02007351 if (!dev->tx_queue_len)
Phil Sutterf84bb1e2015-08-27 21:21:36 +02007352 dev->priv_flags |= IFF_NO_QUEUE;
Phil Sutter906470c2015-08-18 10:30:48 +02007353
David S. Miller8d3bdbd2011-02-08 15:02:50 -08007354 dev->num_tx_queues = txqs;
7355 dev->real_num_tx_queues = txqs;
7356 if (netif_alloc_netdev_queues(dev))
7357 goto free_all;
7358
Michael Daltona953be52014-01-16 22:23:28 -08007359#ifdef CONFIG_SYSFS
David S. Miller8d3bdbd2011-02-08 15:02:50 -08007360 dev->num_rx_queues = rxqs;
7361 dev->real_num_rx_queues = rxqs;
7362 if (netif_alloc_rx_queues(dev))
7363 goto free_all;
7364#endif
7365
Linus Torvalds1da177e2005-04-16 15:20:36 -07007366 strcpy(dev->name, name);
Tom Gundersenc835a672014-07-14 16:37:24 +02007367 dev->name_assign_type = name_assign_type;
Vlad Dogarucbda10f2011-01-13 23:38:30 +00007368 dev->group = INIT_NETDEV_GROUP;
Eric Dumazet2c60db02012-09-16 09:17:26 +00007369 if (!dev->ethtool_ops)
7370 dev->ethtool_ops = &default_ethtool_ops;
Pablo Neirae687ad62015-05-13 18:19:38 +02007371
7372 nf_hook_ingress_init(dev);
7373
Linus Torvalds1da177e2005-04-16 15:20:36 -07007374 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00007375
David S. Miller8d3bdbd2011-02-08 15:02:50 -08007376free_all:
7377 free_netdev(dev);
7378 return NULL;
7379
Eric Dumazet29b44332010-10-11 10:22:12 +00007380free_pcpu:
7381 free_percpu(dev->pcpu_refcnt);
Eric Dumazet74d332c2013-10-30 13:10:44 -07007382free_dev:
7383 netdev_freemem(dev);
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00007384 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007385}
Tom Herbert36909ea2011-01-09 19:36:31 +00007386EXPORT_SYMBOL(alloc_netdev_mqs);
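
As a small usage sketch (the queue counts are arbitrary, struct example_priv is invented), a driver for hardware with asymmetric queue counts can size TX and RX independently through the Ethernet wrapper built on this function:

/* Hypothetical NIC probe fragment: 8 TX queues, 4 RX queues, with
 * ether_setup() applied by the alloc_etherdev_mqs() wrapper.
 */
static struct net_device *example_alloc_nic(void)
{
	struct net_device *dev;

	dev = alloc_etherdev_mqs(sizeof(struct example_priv), 8, 4);
	if (!dev)
		return NULL;

	dev->netdev_ops = &example_netdev_ops;
	return dev;
}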
Linus Torvalds1da177e2005-04-16 15:20:36 -07007387
7388/**
7389 * free_netdev - free network device
7390 * @dev: device
7391 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09007392 * This function does the last stage of destroying an allocated device
7393 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07007394 * If this is the last reference then it will be freed.
Eric Dumazet93d05d42015-11-18 06:31:03 -08007395 * Must be called in process context.
Linus Torvalds1da177e2005-04-16 15:20:36 -07007396 */
7397void free_netdev(struct net_device *dev)
7398{
Herbert Xud565b0a2008-12-15 23:38:52 -08007399 struct napi_struct *p, *n;
7400
Eric Dumazet93d05d42015-11-18 06:31:03 -08007401 might_sleep();
Eric Dumazet60877a32013-06-20 01:15:51 -07007402 netif_free_tx_queues(dev);
Michael Daltona953be52014-01-16 22:23:28 -08007403#ifdef CONFIG_SYSFS
Pankaj Gupta10595902015-01-12 11:41:28 +05307404 kvfree(dev->_rx);
Tom Herbertfe822242010-11-09 10:47:38 +00007405#endif
David S. Millere8a04642008-07-17 00:34:19 -07007406
Eric Dumazet33d480c2011-08-11 19:30:52 +00007407 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
Eric Dumazet24824a02010-10-02 06:11:55 +00007408
Jiri Pirkof001fde2009-05-05 02:48:28 +00007409 /* Flush device addresses */
7410 dev_addr_flush(dev);
7411
Herbert Xud565b0a2008-12-15 23:38:52 -08007412 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7413 netif_napi_del(p);
7414
Eric Dumazet29b44332010-10-11 10:22:12 +00007415 free_percpu(dev->pcpu_refcnt);
7416 dev->pcpu_refcnt = NULL;
7417
Stephen Hemminger3041a062006-05-26 13:25:24 -07007418 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07007419 if (dev->reg_state == NETREG_UNINITIALIZED) {
Eric Dumazet74d332c2013-10-30 13:10:44 -07007420 netdev_freemem(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007421 return;
7422 }
7423
7424 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7425 dev->reg_state = NETREG_RELEASED;
7426
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07007427 /* will free via device release */
7428 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007429}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07007430EXPORT_SYMBOL(free_netdev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09007431
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07007432/**
7433 * synchronize_net - Synchronize with packet receive processing
7434 *
7435 * Wait for packets currently being received to be done.
7436 * Does not block later packets from starting.
7437 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09007438void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007439{
7440 might_sleep();
Eric Dumazetbe3fc412011-05-23 23:07:32 +00007441 if (rtnl_is_locked())
7442 synchronize_rcu_expedited();
7443 else
7444 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07007445}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07007446EXPORT_SYMBOL(synchronize_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007447
7448/**
Eric Dumazet44a08732009-10-27 07:03:04 +00007449 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07007450 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00007451 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08007452 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07007453 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08007454 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00007455 *	If @head is not NULL, the device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07007456 *
7457 * Callers must hold the rtnl semaphore. You may want
7458 * unregister_netdev() instead of this.
7459 */
7460
Eric Dumazet44a08732009-10-27 07:03:04 +00007461void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007462{
Herbert Xua6620712007-12-12 19:21:56 -08007463 ASSERT_RTNL();
7464
Eric Dumazet44a08732009-10-27 07:03:04 +00007465 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00007466 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00007467 } else {
7468 rollback_registered(dev);
7469 /* Finish processing unregister after unlock */
7470 net_set_todo(dev);
7471 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07007472}
Eric Dumazet44a08732009-10-27 07:03:04 +00007473EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007474
7475/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00007476 * unregister_netdevice_many - unregister many devices
7477 * @head: list of devices
Eric Dumazet87757a92014-06-06 06:44:03 -07007478 *
7479 * Note: As most callers use a stack allocated list_head,
 7480 *	we force a list_del() to make sure the stack won't be corrupted later.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00007481 */
7482void unregister_netdevice_many(struct list_head *head)
7483{
7484 struct net_device *dev;
7485
7486 if (!list_empty(head)) {
7487 rollback_registered_many(head);
7488 list_for_each_entry(dev, head, unreg_list)
7489 net_set_todo(dev);
Eric Dumazet87757a92014-06-06 06:44:03 -07007490 list_del(head);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00007491 }
7492}
Eric Dumazet63c80992009-10-27 07:06:49 +00007493EXPORT_SYMBOL(unregister_netdevice_many);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00007494
7495/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07007496 * unregister_netdev - remove device from the kernel
7497 * @dev: device
7498 *
7499 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08007500 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07007501 *
7502 * This is just a wrapper for unregister_netdevice that takes
7503 * the rtnl semaphore. In general you want to use this and not
7504 * unregister_netdevice.
7505 */
7506void unregister_netdev(struct net_device *dev)
7507{
7508 rtnl_lock();
7509 unregister_netdevice(dev);
7510 rtnl_unlock();
7511}
Linus Torvalds1da177e2005-04-16 15:20:36 -07007512EXPORT_SYMBOL(unregister_netdev);
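
And the matching remove-path sketch: unregister under the RTNL lock taken by this wrapper, then release the last reference with free_netdev() once unregistration has completed:

/* Hypothetical remove sketch: the usual counterpart of a probe routine
 * like example_probe() above - unregister first, then free the device.
 */
static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);
	free_netdev(dev);
}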
7513
Eric W. Biedermance286d32007-09-12 13:53:49 +02007514/**
 7515 *	dev_change_net_namespace - move device to a different network namespace
7516 * @dev: device
7517 * @net: network namespace
7518 * @pat: If not NULL name pattern to try if the current device name
7519 * is already taken in the destination network namespace.
7520 *
7521 * This function shuts down a device interface and moves it
7522 * to a new network namespace. On success 0 is returned, on
 7523 *	a failure a negative errno code is returned.
7524 *
7525 * Callers must hold the rtnl semaphore.
7526 */
7527
7528int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7529{
Eric W. Biedermance286d32007-09-12 13:53:49 +02007530 int err;
7531
7532 ASSERT_RTNL();
7533
7534 /* Don't allow namespace local devices to be moved. */
7535 err = -EINVAL;
7536 if (dev->features & NETIF_F_NETNS_LOCAL)
7537 goto out;
7538
 7539	/* Ensure the device has been registered */
Eric W. Biedermance286d32007-09-12 13:53:49 +02007540 if (dev->reg_state != NETREG_REGISTERED)
7541 goto out;
7542
 7543	/* Get out if there is nothing to do */
7544 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09007545 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02007546 goto out;
7547
7548 /* Pick the destination device name, and ensure
7549 * we can use it in the destination network namespace.
7550 */
7551 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00007552 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02007553 /* We get here if we can't use the current device name */
7554 if (!pat)
7555 goto out;
Gao feng828de4f2012-09-13 20:58:27 +00007556 if (dev_get_valid_name(net, dev, pat) < 0)
Eric W. Biedermance286d32007-09-12 13:53:49 +02007557 goto out;
7558 }
7559
7560 /*
 7561	 * And now a mini version of register_netdevice and unregister_netdevice.
7562 */
7563
7564 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07007565 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02007566
7567 /* And unlink it from device chain */
7568 err = -ENODEV;
7569 unlist_netdevice(dev);
7570
7571 synchronize_net();
7572
7573 /* Shutdown queueing discipline. */
7574 dev_shutdown(dev);
7575
 7576	/* Notify protocols that we are about to destroy
 7577	   this device. They should clean up all their state.
David Lamparter3b27e102010-09-17 03:22:19 +00007578
7579 Note that dev->reg_state stays at NETREG_REGISTERED.
7580 This is wanted because this way 8021q and macvlan know
7581 the device is just moving and can keep their slaves up.
Eric W. Biedermance286d32007-09-12 13:53:49 +02007582 */
7583 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Gao feng6549dd42012-08-23 15:36:55 +00007584 rcu_barrier();
7585 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
Alexei Starovoitov7f294052013-10-23 16:02:42 -07007586 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
Eric W. Biedermance286d32007-09-12 13:53:49 +02007587
7588 /*
7589 * Flush the unicast and multicast chains
7590 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00007591 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00007592 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02007593
Serge Hallyn4e66ae22012-12-03 16:17:12 +00007594 /* Send a netdev-removed uevent to the old namespace */
7595 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
Alexander Y. Fomichev4c754312014-08-25 16:26:45 +04007596 netdev_adjacent_del_links(dev);
Serge Hallyn4e66ae22012-12-03 16:17:12 +00007597
Eric W. Biedermance286d32007-09-12 13:53:49 +02007598 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09007599 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02007600
Eric W. Biedermance286d32007-09-12 13:53:49 +02007601 /* If there is an ifindex conflict assign a new one */
Nicolas Dichtel7a66bbc2015-04-02 17:07:09 +02007602 if (__dev_get_by_index(net, dev->ifindex))
Eric W. Biedermance286d32007-09-12 13:53:49 +02007603 dev->ifindex = dev_new_index(net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02007604
Serge Hallyn4e66ae22012-12-03 16:17:12 +00007605 /* Send a netdev-add uevent to the new namespace */
7606 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
Alexander Y. Fomichev4c754312014-08-25 16:26:45 +04007607 netdev_adjacent_add_links(dev);
Serge Hallyn4e66ae22012-12-03 16:17:12 +00007608
Eric W. Biederman8b41d182007-09-26 22:02:53 -07007609 /* Fixup kobjects */
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07007610 err = device_rename(&dev->dev, dev->name);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07007611 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02007612
7613 /* Add the device back in the hashes */
7614 list_netdevice(dev);
7615
 7616 /* Notify protocols that a new device appeared. */
7617 call_netdevice_notifiers(NETDEV_REGISTER, dev);
7618
Eric W. Biedermand90a9092009-12-12 22:11:15 +00007619 /*
7620 * Prevent userspace races by waiting until the network
7621 * device is fully setup before sending notifications.
7622 */
Alexei Starovoitov7f294052013-10-23 16:02:42 -07007623 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
Eric W. Biedermand90a9092009-12-12 22:11:15 +00007624
Eric W. Biedermance286d32007-09-12 13:53:49 +02007625 synchronize_net();
7626 err = 0;
7627out:
7628 return err;
7629}
Johannes Berg463d0182009-07-14 00:33:35 +02007630EXPORT_SYMBOL_GPL(dev_change_net_namespace);
Eric W. Biedermance286d32007-09-12 13:53:49 +02007631
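/*
 * Minimal sketch of moving a device into another namespace under the rtnl
 * lock, as the kernel-doc above requires.  The "eth%d" fallback pattern and
 * the example_ function name are illustrative assumptions; guarded by #if 0
 * so it is never compiled.
 */
#if 0
static int example_move_dev(struct net_device *dev, struct net *dst_net)
{
	int err;

	rtnl_lock();
	/* The "eth%d" pattern is only consulted if dev->name is already
	 * taken in dst_net.
	 */
	err = dev_change_net_namespace(dev, dst_net, "eth%d");
	rtnl_unlock();

	return err;
}
#endif
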
Linus Torvalds1da177e2005-04-16 15:20:36 -07007632static int dev_cpu_callback(struct notifier_block *nfb,
7633 unsigned long action,
7634 void *ocpu)
7635{
7636 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007637 struct sk_buff *skb;
7638 unsigned int cpu, oldcpu = (unsigned long)ocpu;
7639 struct softnet_data *sd, *oldsd;
7640
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07007641 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007642 return NOTIFY_OK;
7643
7644 local_irq_disable();
7645 cpu = smp_processor_id();
7646 sd = &per_cpu(softnet_data, cpu);
7647 oldsd = &per_cpu(softnet_data, oldcpu);
7648
7649 /* Find end of our completion_queue. */
7650 list_skb = &sd->completion_queue;
7651 while (*list_skb)
7652 list_skb = &(*list_skb)->next;
7653 /* Append completion queue from offline CPU. */
7654 *list_skb = oldsd->completion_queue;
7655 oldsd->completion_queue = NULL;
7656
Linus Torvalds1da177e2005-04-16 15:20:36 -07007657 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00007658 if (oldsd->output_queue) {
7659 *sd->output_queue_tailp = oldsd->output_queue;
7660 sd->output_queue_tailp = oldsd->output_queue_tailp;
7661 oldsd->output_queue = NULL;
7662 oldsd->output_queue_tailp = &oldsd->output_queue;
7663 }
Eric Dumazetac64da02015-01-15 17:04:22 -08007664 /* Append NAPI poll list from offline CPU, with one exception:
 7665 * process_backlog() must be called by the CPU owning the percpu backlog.
7666 * We properly handle process_queue & input_pkt_queue later.
7667 */
7668 while (!list_empty(&oldsd->poll_list)) {
7669 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7670 struct napi_struct,
7671 poll_list);
7672
7673 list_del_init(&napi->poll_list);
7674 if (napi->poll == process_backlog)
7675 napi->state = 0;
7676 else
7677 ____napi_schedule(sd, napi);
Heiko Carstens264524d2011-06-06 20:50:03 +00007678 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07007679
7680 raise_softirq_irqoff(NET_TX_SOFTIRQ);
7681 local_irq_enable();
7682
7683 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00007684 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
Eric Dumazet91e83132015-02-05 14:58:14 -08007685 netif_rx_ni(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00007686 input_queue_head_incr(oldsd);
7687 }
Eric Dumazetac64da02015-01-15 17:04:22 -08007688 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
Eric Dumazet91e83132015-02-05 14:58:14 -08007689 netif_rx_ni(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00007690 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07007691 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07007692
7693 return NOTIFY_OK;
7694}
Linus Torvalds1da177e2005-04-16 15:20:36 -07007695
7696
Herbert Xu7f353bf2007-08-10 15:47:58 -07007697/**
Herbert Xub63365a2008-10-23 01:11:29 -07007698 * netdev_increment_features - increment feature set by one
7699 * @all: current feature set
7700 * @one: new feature set
7701 * @mask: mask feature set
Herbert Xu7f353bf2007-08-10 15:47:58 -07007702 *
7703 * Computes a new feature set after adding a device with feature set
Herbert Xub63365a2008-10-23 01:11:29 -07007704 * @one to the master device with current feature set @all. Will not
7705 * enable anything that is off in @mask. Returns the new feature set.
Herbert Xu7f353bf2007-08-10 15:47:58 -07007706 */
Michał Mirosławc8f44af2011-11-15 15:29:55 +00007707netdev_features_t netdev_increment_features(netdev_features_t all,
7708 netdev_features_t one, netdev_features_t mask)
Herbert Xu7f353bf2007-08-10 15:47:58 -07007709{
Tom Herbertc8cd0982015-12-14 11:19:44 -08007710 if (mask & NETIF_F_HW_CSUM)
Tom Herberta1882222015-12-14 11:19:43 -08007711 mask |= NETIF_F_CSUM_MASK;
Michał Mirosław1742f182011-04-22 06:31:16 +00007712 mask |= NETIF_F_VLAN_CHALLENGED;
7713
Tom Herberta1882222015-12-14 11:19:43 -08007714 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
Michał Mirosław1742f182011-04-22 06:31:16 +00007715 all &= one | ~NETIF_F_ALL_FOR_ALL;
7716
Michał Mirosław1742f182011-04-22 06:31:16 +00007717 /* If one device supports hw checksumming, set for all. */
Tom Herbertc8cd0982015-12-14 11:19:44 -08007718 if (all & NETIF_F_HW_CSUM)
7719 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
Herbert Xu7f353bf2007-08-10 15:47:58 -07007720
7721 return all;
7722}
Herbert Xub63365a2008-10-23 01:11:29 -07007723EXPORT_SYMBOL(netdev_increment_features);
Herbert Xu7f353bf2007-08-10 15:47:58 -07007724
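/*
 * Hypothetical sketch of how a master device (in the spirit of the bridge and
 * bonding feature recomputation) could fold each slave's feature set into its
 * own with netdev_increment_features().  The array-based slave walk and the
 * example_ name are assumptions, not code from this file; guarded by #if 0.
 */
#if 0
static netdev_features_t example_recompute_features(struct net_device *master,
						     struct net_device *slaves[],
						     int n_slaves)
{
	/* The mask limits what the slaves are allowed to turn on. */
	netdev_features_t mask = master->features;
	netdev_features_t features = master->features & ~NETIF_F_ONE_FOR_ALL;
	int i;

	for (i = 0; i < n_slaves; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     mask);
	return features;
}
#endif
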
Baruch Siach430f03c2013-06-02 20:43:55 +00007725static struct hlist_head * __net_init netdev_create_hash(void)
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07007726{
7727 int i;
7728 struct hlist_head *hash;
7729
7730 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7731 if (hash != NULL)
7732 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7733 INIT_HLIST_HEAD(&hash[i]);
7734
7735 return hash;
7736}
7737
Eric W. Biederman881d9662007-09-17 11:56:21 -07007738/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07007739static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07007740{
Rustad, Mark D734b6542012-07-18 09:06:07 +00007741 if (net != &init_net)
7742 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07007743
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07007744 net->dev_name_head = netdev_create_hash();
7745 if (net->dev_name_head == NULL)
7746 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07007747
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07007748 net->dev_index_head = netdev_create_hash();
7749 if (net->dev_index_head == NULL)
7750 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07007751
7752 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07007753
7754err_idx:
7755 kfree(net->dev_name_head);
7756err_name:
7757 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07007758}
7759
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07007760/**
7761 * netdev_drivername - network driver for the device
7762 * @dev: network device
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07007763 *
7764 * Determine network driver for device.
7765 */
David S. Miller3019de12011-06-06 16:41:33 -07007766const char *netdev_drivername(const struct net_device *dev)
Arjan van de Ven6579e572008-07-21 13:31:48 -07007767{
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07007768 const struct device_driver *driver;
7769 const struct device *parent;
David S. Miller3019de12011-06-06 16:41:33 -07007770 const char *empty = "";
Arjan van de Ven6579e572008-07-21 13:31:48 -07007771
7772 parent = dev->dev.parent;
Arjan van de Ven6579e572008-07-21 13:31:48 -07007773 if (!parent)
David S. Miller3019de12011-06-06 16:41:33 -07007774 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07007775
7776 driver = parent->driver;
7777 if (driver && driver->name)
David S. Miller3019de12011-06-06 16:41:33 -07007778 return driver->name;
7779 return empty;
Arjan van de Ven6579e572008-07-21 13:31:48 -07007780}
7781
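/*
 * Hypothetical sketch of a transmit-timeout style warning that reports the
 * responsible driver via netdev_drivername(); the message text and the
 * example_ name are assumptions.  Guarded by #if 0 so it is never compiled.
 */
#if 0
static void example_report_timeout(struct net_device *dev, unsigned int queue)
{
	netdev_warn(dev, "transmit queue %u timed out (driver: %s)\n",
		    queue, netdev_drivername(dev));
}
#endif
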
Joe Perches6ea754e2014-09-22 11:10:50 -07007782static void __netdev_printk(const char *level, const struct net_device *dev,
7783 struct va_format *vaf)
Joe Perches256df2f2010-06-27 01:02:35 +00007784{
Joe Perchesb004ff42012-09-12 20:12:19 -07007785 if (dev && dev->dev.parent) {
Joe Perches6ea754e2014-09-22 11:10:50 -07007786 dev_printk_emit(level[1] - '0',
7787 dev->dev.parent,
7788 "%s %s %s%s: %pV",
7789 dev_driver_string(dev->dev.parent),
7790 dev_name(dev->dev.parent),
7791 netdev_name(dev), netdev_reg_state(dev),
7792 vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07007793 } else if (dev) {
Joe Perches6ea754e2014-09-22 11:10:50 -07007794 printk("%s%s%s: %pV",
7795 level, netdev_name(dev), netdev_reg_state(dev), vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07007796 } else {
Joe Perches6ea754e2014-09-22 11:10:50 -07007797 printk("%s(NULL net_device): %pV", level, vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07007798 }
Joe Perches256df2f2010-06-27 01:02:35 +00007799}
7800
Joe Perches6ea754e2014-09-22 11:10:50 -07007801void netdev_printk(const char *level, const struct net_device *dev,
7802 const char *format, ...)
Joe Perches256df2f2010-06-27 01:02:35 +00007803{
7804 struct va_format vaf;
7805 va_list args;
Joe Perches256df2f2010-06-27 01:02:35 +00007806
7807 va_start(args, format);
7808
7809 vaf.fmt = format;
7810 vaf.va = &args;
7811
Joe Perches6ea754e2014-09-22 11:10:50 -07007812 __netdev_printk(level, dev, &vaf);
Joe Perchesb004ff42012-09-12 20:12:19 -07007813
Joe Perches256df2f2010-06-27 01:02:35 +00007814 va_end(args);
Joe Perches256df2f2010-06-27 01:02:35 +00007815}
7816EXPORT_SYMBOL(netdev_printk);
7817
7818#define define_netdev_printk_level(func, level) \
Joe Perches6ea754e2014-09-22 11:10:50 -07007819void func(const struct net_device *dev, const char *fmt, ...) \
Joe Perches256df2f2010-06-27 01:02:35 +00007820{ \
Joe Perches256df2f2010-06-27 01:02:35 +00007821 struct va_format vaf; \
7822 va_list args; \
7823 \
7824 va_start(args, fmt); \
7825 \
7826 vaf.fmt = fmt; \
7827 vaf.va = &args; \
7828 \
Joe Perches6ea754e2014-09-22 11:10:50 -07007829 __netdev_printk(level, dev, &vaf); \
Joe Perchesb004ff42012-09-12 20:12:19 -07007830 \
Joe Perches256df2f2010-06-27 01:02:35 +00007831 va_end(args); \
Joe Perches256df2f2010-06-27 01:02:35 +00007832} \
7833EXPORT_SYMBOL(func);
7834
7835define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7836define_netdev_printk_level(netdev_alert, KERN_ALERT);
7837define_netdev_printk_level(netdev_crit, KERN_CRIT);
7838define_netdev_printk_level(netdev_err, KERN_ERR);
7839define_netdev_printk_level(netdev_warn, KERN_WARNING);
7840define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7841define_netdev_printk_level(netdev_info, KERN_INFO);
7842
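/*
 * Hypothetical driver snippet showing the generated per-level helpers in use;
 * the condition, messages and the example_ name are assumptions, not code
 * from this file.  Guarded by #if 0 so it is never compiled.
 */
#if 0
static void example_link_change(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}
#endif
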
Pavel Emelyanov46650792007-10-08 20:38:39 -07007843static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07007844{
7845 kfree(net->dev_name_head);
7846 kfree(net->dev_index_head);
7847}
7848
Denis V. Lunev022cbae2007-11-13 03:23:50 -08007849static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07007850 .init = netdev_init,
7851 .exit = netdev_exit,
7852};
7853
Pavel Emelyanov46650792007-10-08 20:38:39 -07007854static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02007855{
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00007856 struct net_device *dev, *aux;
Eric W. Biedermance286d32007-09-12 13:53:49 +02007857 /*
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00007858 * Push all migratable network devices back to the
Eric W. Biedermance286d32007-09-12 13:53:49 +02007859 * initial network namespace
7860 */
7861 rtnl_lock();
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00007862 for_each_netdev_safe(net, dev, aux) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02007863 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07007864 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02007865
7866 /* Ignore unmoveable devices (i.e. loopback) */
7867 if (dev->features & NETIF_F_NETNS_LOCAL)
7868 continue;
7869
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00007870 /* Leave virtual devices for the generic cleanup */
7871 if (dev->rtnl_link_ops)
7872 continue;
Eric W. Biedermand0c082c2008-11-05 15:59:38 -08007873
Lucas De Marchi25985ed2011-03-30 22:57:33 -03007874 /* Push remaining network devices to init_net */
Pavel Emelyanovaca51392008-05-08 01:24:25 -07007875 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7876 err = dev_change_net_namespace(dev, &init_net, fb_name);
Eric W. Biedermance286d32007-09-12 13:53:49 +02007877 if (err) {
Joe Perches7b6cd1c2012-02-01 10:54:43 +00007878 pr_emerg("%s: failed to move %s to init_net: %d\n",
7879 __func__, dev->name, err);
Pavel Emelyanovaca51392008-05-08 01:24:25 -07007880 BUG();
Eric W. Biedermance286d32007-09-12 13:53:49 +02007881 }
7882 }
7883 rtnl_unlock();
7884}
7885
Eric W. Biederman50624c92013-09-23 21:19:49 -07007886static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7887{
7888 /* Return with the rtnl_lock held when there are no network
7889 * devices unregistering in any network namespace in net_list.
7890 */
7891 struct net *net;
7892 bool unregistering;
Peter Zijlstraff960a72014-10-29 17:04:56 +01007893 DEFINE_WAIT_FUNC(wait, woken_wake_function);
Eric W. Biederman50624c92013-09-23 21:19:49 -07007894
Peter Zijlstraff960a72014-10-29 17:04:56 +01007895 add_wait_queue(&netdev_unregistering_wq, &wait);
Eric W. Biederman50624c92013-09-23 21:19:49 -07007896 for (;;) {
Eric W. Biederman50624c92013-09-23 21:19:49 -07007897 unregistering = false;
7898 rtnl_lock();
7899 list_for_each_entry(net, net_list, exit_list) {
7900 if (net->dev_unreg_count > 0) {
7901 unregistering = true;
7902 break;
7903 }
7904 }
7905 if (!unregistering)
7906 break;
7907 __rtnl_unlock();
Peter Zijlstraff960a72014-10-29 17:04:56 +01007908
7909 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
Eric W. Biederman50624c92013-09-23 21:19:49 -07007910 }
Peter Zijlstraff960a72014-10-29 17:04:56 +01007911 remove_wait_queue(&netdev_unregistering_wq, &wait);
Eric W. Biederman50624c92013-09-23 21:19:49 -07007912}
7913
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00007914static void __net_exit default_device_exit_batch(struct list_head *net_list)
7915{
 7916 /* At exit all network devices must be removed from a network
Uwe Kleine-Königb5950762010-11-01 15:38:34 -04007917 * namespace. Do this in the reverse order of registration.
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00007918 * Do this across as many network namespaces as possible to
7919 * improve batching efficiency.
7920 */
7921 struct net_device *dev;
7922 struct net *net;
7923 LIST_HEAD(dev_kill_list);
7924
Eric W. Biederman50624c92013-09-23 21:19:49 -07007925 /* To prevent network device cleanup code from dereferencing
 7926 * loopback devices or network devices that have been freed,
 7927 * wait here for all pending unregistrations to complete,
 7928 * before unregistering the loopback device and allowing the
 7929 * network namespace to be freed.
7930 *
 7931 * The netdev todo list containing all network device
7932 * unregistrations that happen in default_device_exit_batch
7933 * will run in the rtnl_unlock() at the end of
7934 * default_device_exit_batch.
7935 */
7936 rtnl_lock_unregistering(net_list);
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00007937 list_for_each_entry(net, net_list, exit_list) {
7938 for_each_netdev_reverse(net, dev) {
Jiri Pirkob0ab2fa2014-06-26 09:58:25 +02007939 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00007940 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7941 else
7942 unregister_netdevice_queue(dev, &dev_kill_list);
7943 }
7944 }
7945 unregister_netdevice_many(&dev_kill_list);
7946 rtnl_unlock();
7947}
7948
Denis V. Lunev022cbae2007-11-13 03:23:50 -08007949static struct pernet_operations __net_initdata default_device_ops = {
Eric W. Biedermance286d32007-09-12 13:53:49 +02007950 .exit = default_device_exit,
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00007951 .exit_batch = default_device_exit_batch,
Eric W. Biedermance286d32007-09-12 13:53:49 +02007952};
7953
Linus Torvalds1da177e2005-04-16 15:20:36 -07007954/*
7955 * Initialize the DEV module. At boot time this walks the device list and
7956 * unhooks any devices that fail to initialise (normally hardware not
7957 * present) and leaves us with a valid list of present and active devices.
7958 *
7959 */
7960
7961/*
7962 * This is called single threaded during boot, so no need
7963 * to take the rtnl semaphore.
7964 */
7965static int __init net_dev_init(void)
7966{
7967 int i, rc = -ENOMEM;
7968
7969 BUG_ON(!dev_boot_phase);
7970
Linus Torvalds1da177e2005-04-16 15:20:36 -07007971 if (dev_proc_init())
7972 goto out;
7973
Eric W. Biederman8b41d182007-09-26 22:02:53 -07007974 if (netdev_kobject_init())
Linus Torvalds1da177e2005-04-16 15:20:36 -07007975 goto out;
7976
7977 INIT_LIST_HEAD(&ptype_all);
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08007978 for (i = 0; i < PTYPE_HASH_SIZE; i++)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007979 INIT_LIST_HEAD(&ptype_base[i]);
7980
Vlad Yasevich62532da2012-11-15 08:49:10 +00007981 INIT_LIST_HEAD(&offload_base);
7982
Eric W. Biederman881d9662007-09-17 11:56:21 -07007983 if (register_pernet_subsys(&netdev_net_ops))
7984 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007985
7986 /*
7987 * Initialise the packet receive queues.
7988 */
7989
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07007990 for_each_possible_cpu(i) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00007991 struct softnet_data *sd = &per_cpu(softnet_data, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007992
Eric Dumazete36fa2f2010-04-19 21:17:14 +00007993 skb_queue_head_init(&sd->input_pkt_queue);
Changli Gao6e7676c2010-04-27 15:07:33 -07007994 skb_queue_head_init(&sd->process_queue);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00007995 INIT_LIST_HEAD(&sd->poll_list);
Changli Gaoa9cbd582010-04-26 23:06:24 +00007996 sd->output_queue_tailp = &sd->output_queue;
Eric Dumazetdf334542010-03-24 19:13:54 +00007997#ifdef CONFIG_RPS
Eric Dumazete36fa2f2010-04-19 21:17:14 +00007998 sd->csd.func = rps_trigger_softirq;
7999 sd->csd.info = sd;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00008000 sd->cpu = i;
Tom Herbert1e94d722010-03-18 17:45:44 -07008001#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00008002
Eric Dumazete36fa2f2010-04-19 21:17:14 +00008003 sd->backlog.poll = process_backlog;
8004 sd->backlog.weight = weight_p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008005 }
8006
Linus Torvalds1da177e2005-04-16 15:20:36 -07008007 dev_boot_phase = 0;
8008
Eric W. Biederman505d4f72008-11-07 22:54:20 -08008009 /* The loopback device is special: if any other network device
 8010 * is present in a network namespace, the loopback device must
 8011 * be present too. Since we now dynamically allocate and free the
 8012 * loopback device, ensure this invariant is maintained by
 8013 * keeping the loopback device as the first device on the
 8014 * list of network devices, so that the loopback device
 8015 * is the first device that appears and the last network device
 8016 * that disappears.
8017 */
8018 if (register_pernet_device(&loopback_net_ops))
8019 goto out;
8020
8021 if (register_pernet_device(&default_device_ops))
8022 goto out;
8023
Carlos R. Mafra962cf362008-05-15 11:15:37 -03008024 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8025 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008026
8027 hotcpu_notifier(dev_cpu_callback, 0);
Thomas Graff38a9eb2015-07-21 10:43:56 +02008028 dst_subsys_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07008029 rc = 0;
8030out:
8031 return rc;
8032}
8033
8034subsys_initcall(net_dev_init);