/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *		sure which should go first, but I bet it won't make much
 *		difference if we are running VLANs.  The good news is that
 *		this protocol won't be in the list unless compiled in, so
 *		the average user (w/out VLANs) will not be adversely affected.
 *		--BLG
 *
 *		0800	IP
 *		8100    802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

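/*
 * Illustrative sketch (not part of the original file): a pure reader
 * walking the per-namespace device list under the rules described above.
 * Readers may rely on rcu_read_lock() alone; writers must hold the rtnl
 * semaphore and take dev_base_lock for writing.  The helper name below
 * is hypothetical.
 */
static void __maybe_unused example_count_running_devices(struct net *net,
							  unsigned int *count)
{
	struct net_device *dev;

	*count = 0;
	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->flags & IFF_UP)	/* only interfaces that are up */
			(*count)++;
	rcu_read_unlock();
}
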
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

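/*
 * Illustrative sketch (not part of the original file): registering and
 * removing a protocol handler with dev_add_pack()/dev_remove_pack().
 * The ETH_P_IP handler and function names here are hypothetical; a real
 * handler must not be freed before dev_remove_pack() has returned.
 */
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* A real handler would consume or requeue the skb here. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_pt __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = example_rcv,
};

static void __maybe_unused example_register_handler(void)
{
	dev_add_pack(&example_pt);	/* start receiving ETH_P_IP frames */
	dev_remove_pack(&example_pt);	/* sleeps until no CPU still sees it */
}
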
/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

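/*
 * Usage sketch (not part of the original file): the parser above maps a
 * kernel command line option of the form
 *
 *	netdev=irq,base_addr,mem_start,mem_end,name
 *
 * e.g. "netdev=5,0x340,0,0,eth0", into a struct ifmap entry that
 * netdev_boot_setup_check() later copies into the matching device.
 * Trailing values may be omitted; the example values here are purely
 * illustrative.
 */
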
/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

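/*
 * Illustrative sketch (not part of the original file): looking up a
 * device by name from process context.  dev_get_by_name() takes a
 * reference that must be dropped with dev_put(); the _rcu variant above
 * instead requires rcu_read_lock().  The function name is hypothetical.
 */
static int __maybe_unused example_get_ifindex(struct net *net, const char *name)
{
	struct net_device *dev;
	int ifindex;

	dev = dev_get_by_name(net, name);
	if (!dev)
		return -ENODEV;
	ifindex = dev->ifindex;
	dev_put(dev);			/* release the reference we took */
	return ifindex;
}
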
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold RCU
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

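/*
 * Illustrative sketch (not part of the original file): the RCU-only
 * lookup above returns a device without taking a reference, so a caller
 * that needs the device beyond the read-side critical section must take
 * its own reference before unlocking.  The function name is hypothetical.
 */
static struct net_device *__maybe_unused
example_hold_by_hwaddr(struct net *net, const char *ha)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, ha);
	if (dev)
		dev_hold(dev);		/* pin it before leaving RCU */
	rcu_read_unlock();
	return dev;
}
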
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
					unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

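/*
 * Illustrative sketch (not part of the original file): a driver that
 * wants an automatically numbered name passes a format string with a
 * single %d, e.g. "eth%d", and dev_alloc_name() picks the first free
 * unit.  Must be called under the rtnl lock; the names below are only
 * examples.
 */
static int __maybe_unused example_name_device(struct net_device *dev)
{
	int unit;

	unit = dev_alloc_name(dev, "eth%d");	/* fills dev->name on success */
	if (unit < 0)
		return unit;		/* -EINVAL, -ENFILE or -ENOMEM */
	return 0;
}
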
static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
{
	struct net *net;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return dev_alloc_name(dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device,
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *	dev_load 	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Enable NET_DMA
		 */
		net_dmaengine_get();

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/*
	 *	Is it already up?
	 */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Open device
	 */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/*
	 *	... and announce new interface.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

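/*
 * Illustrative sketch (not part of the original file): bringing an
 * interface up from a kernel caller.  dev_open() must run under the
 * rtnl lock, mirrors what "ip link set <dev> up" triggers from user
 * space, and is a nop on an already-up device.  The function name is
 * hypothetical.
 */
static int __maybe_unused example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);
	rtnl_unlock();
	return err;
}
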
Octavian Purdila44345722010-12-13 12:44:07 +00001225static int __dev_close_many(struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001226{
Octavian Purdila44345722010-12-13 12:44:07 +00001227 struct net_device *dev;
Patrick McHardybd380812010-02-26 06:34:53 +00001228
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001229 ASSERT_RTNL();
David S. Miller9d5010d2007-09-12 14:33:25 +02001230 might_sleep();
1231
Octavian Purdila44345722010-12-13 12:44:07 +00001232 list_for_each_entry(dev, head, unreg_list) {
1233 /*
1234 * Tell people we are going down, so that they can
1235 * prepare to death, when device is still operating.
1236 */
1237 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001238
Octavian Purdila44345722010-12-13 12:44:07 +00001239 clear_bit(__LINK_STATE_START, &dev->state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001240
Octavian Purdila44345722010-12-13 12:44:07 +00001241 /* Synchronize to scheduled poll. We cannot touch poll list, it
1242 * can be even on different cpu. So just clear netif_running().
1243 *
1244 * dev->stop() will invoke napi_disable() on all of it's
1245 * napi_struct instances on this device.
1246 */
1247 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001249
Octavian Purdila44345722010-12-13 12:44:07 +00001250 dev_deactivate_many(head);
1251
1252 list_for_each_entry(dev, head, unreg_list) {
1253 const struct net_device_ops *ops = dev->netdev_ops;
1254
1255 /*
1256 * Call the device specific close. This cannot fail.
1257 * Only if device is UP
1258 *
1259 * We allow it to be called even after a DETACH hot-plug
1260 * event.
1261 */
1262 if (ops->ndo_stop)
1263 ops->ndo_stop(dev);
1264
1265 /*
1266 * Device is now down.
1267 */
1268
1269 dev->flags &= ~IFF_UP;
1270
1271 /*
1272 * Shutdown NET_DMA
1273 */
1274 net_dmaengine_put();
1275 }
1276
1277 return 0;
1278}
1279
1280static int __dev_close(struct net_device *dev)
1281{
1282 LIST_HEAD(single);
1283
1284 list_add(&dev->unreg_list, &single);
1285 return __dev_close_many(&single);
1286}
1287
Eric Dumazet3fbd8752011-01-19 21:23:22 +00001288static int dev_close_many(struct list_head *head)
Octavian Purdila44345722010-12-13 12:44:07 +00001289{
1290 struct net_device *dev, *tmp;
1291 LIST_HEAD(tmp_list);
1292
1293 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1294 if (!(dev->flags & IFF_UP))
1295 list_move(&dev->unreg_list, &tmp_list);
1296
1297 __dev_close_many(head);
Matti Linnanvuorid8b2a4d2008-02-12 23:10:11 -08001298
Linus Torvalds1da177e2005-04-16 15:20:36 -07001299 /*
Octavian Purdila44345722010-12-13 12:44:07 +00001300 * Tell people we are down
Linus Torvalds1da177e2005-04-16 15:20:36 -07001301 */
Octavian Purdila44345722010-12-13 12:44:07 +00001302 list_for_each_entry(dev, head, unreg_list) {
1303 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1304 call_netdevice_notifiers(NETDEV_DOWN, dev);
1305 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001306
Octavian Purdila44345722010-12-13 12:44:07 +00001307 /* rollback_registered_many needs the complete original list */
1308 list_splice(&tmp_list, head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001309 return 0;
1310}
Patrick McHardybd380812010-02-26 06:34:53 +00001311
1312/**
1313 * dev_close - shutdown an interface.
1314 * @dev: device to shutdown
1315 *
1316 * This function moves an active device into down state. A
1317 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1318 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1319 * chain.
1320 */
1321int dev_close(struct net_device *dev)
1322{
Octavian Purdila44345722010-12-13 12:44:07 +00001323 LIST_HEAD(single);
Patrick McHardybd380812010-02-26 06:34:53 +00001324
Octavian Purdila44345722010-12-13 12:44:07 +00001325 list_add(&dev->unreg_list, &single);
1326 dev_close_many(&single);
Patrick McHardybd380812010-02-26 06:34:53 +00001327
1328 return 0;
1329}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001330EXPORT_SYMBOL(dev_close);
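/*
 * Usage sketch (illustration, not part of the original file): dev_close()
 * must run under RTNL, so a management-path caller typically looks like:
 *
 *	rtnl_lock();
 *	dev_close(dev);
 *	rtnl_unlock();
 *
 * Callers that already hold RTNL simply call dev_close() directly.
 */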
Linus Torvalds1da177e2005-04-16 15:20:36 -07001331
1332
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001333/**
1334 * dev_disable_lro - disable Large Receive Offload on a device
1335 * @dev: device
1336 *
1337 * Disable Large Receive Offload (LRO) on a net device. Must be
1338 * called under RTNL. This is needed if received packets may be
1339 * forwarded to another interface.
1340 */
1341void dev_disable_lro(struct net_device *dev)
1342{
1343 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1344 dev->ethtool_ops->set_flags) {
1345 u32 flags = dev->ethtool_ops->get_flags(dev);
1346 if (flags & ETH_FLAG_LRO) {
1347 flags &= ~ETH_FLAG_LRO;
1348 dev->ethtool_ops->set_flags(dev, flags);
1349 }
1350 }
1351 WARN_ON(dev->features & NETIF_F_LRO);
1352}
1353EXPORT_SYMBOL(dev_disable_lro);
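/*
 * Example (illustrative sketch): forwarding setups are expected to call
 * this under RTNL before frames received on the device may be forwarded,
 * e.g. when enslaving a device (slave_dev is a hypothetical name):
 *
 *	ASSERT_RTNL();
 *	dev_disable_lro(slave_dev);
 */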
1354
1355
Eric W. Biederman881d9662007-09-17 11:56:21 -07001356static int dev_boot_phase = 1;
1357
Linus Torvalds1da177e2005-04-16 15:20:36 -07001358/*
1359 * Device change register/unregister. These are not inline or static
1360 * as we export them to the world.
1361 */
1362
1363/**
1364 * register_netdevice_notifier - register a network notifier block
1365 * @nb: notifier
1366 *
1367 * Register a notifier to be called when network device events occur.
1368 * The notifier passed is linked into the kernel structures and must
1369 * not be reused until it has been unregistered. A negative errno code
1370 * is returned on a failure.
1371 *
1372 * When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001373 * to the new notifier to allow the device to have a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001374 * view of the network device list.
1375 */
1376
1377int register_netdevice_notifier(struct notifier_block *nb)
1378{
1379 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001380 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001381 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001382 int err;
1383
1384 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001385 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001386 if (err)
1387 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001388 if (dev_boot_phase)
1389 goto unlock;
1390 for_each_net(net) {
1391 for_each_netdev(net, dev) {
1392 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1393 err = notifier_to_errno(err);
1394 if (err)
1395 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001396
Eric W. Biederman881d9662007-09-17 11:56:21 -07001397 if (!(dev->flags & IFF_UP))
1398 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001399
Eric W. Biederman881d9662007-09-17 11:56:21 -07001400 nb->notifier_call(nb, NETDEV_UP, dev);
1401 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001402 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001403
1404unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001405 rtnl_unlock();
1406 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001407
1408rollback:
1409 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001410 for_each_net(net) {
1411 for_each_netdev(net, dev) {
1412 if (dev == last)
1413 break;
Herbert Xufcc5a032007-07-30 17:03:38 -07001414
Eric W. Biederman881d9662007-09-17 11:56:21 -07001415 if (dev->flags & IFF_UP) {
1416 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1417 nb->notifier_call(nb, NETDEV_DOWN, dev);
1418 }
1419 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00001420 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001421 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001422 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001423
1424 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001425 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001426}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001427EXPORT_SYMBOL(register_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001428
1429/**
1430 * unregister_netdevice_notifier - unregister a network notifier block
1431 * @nb: notifier
1432 *
1433 * Unregister a notifier previously registered by
1434 * register_netdevice_notifier(). The notifier is unlinked from the
1435 * kernel structures and may then be reused. A negative errno code
1436 * is returned on a failure.
1437 */
1438
1439int unregister_netdevice_notifier(struct notifier_block *nb)
1440{
Herbert Xu9f514952006-03-25 01:24:25 -08001441 int err;
1442
1443 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001444 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xu9f514952006-03-25 01:24:25 -08001445 rtnl_unlock();
1446 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001447}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001448EXPORT_SYMBOL(unregister_netdevice_notifier);
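/*
 * Example (illustrative sketch; the foo_* names are hypothetical): a
 * minimal notifier consumer. In this kernel version the notifier data
 * pointer is the struct net_device itself.
 *
 *	static int foo_netdev_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *		case NETDEV_DOWN:
 *			foo_handle_state_change(dev);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block foo_netdev_notifier = {
 *		.notifier_call = foo_netdev_event,
 *	};
 *
 *	err = register_netdevice_notifier(&foo_netdev_notifier);
 *	...
 *	unregister_netdevice_notifier(&foo_netdev_notifier);
 */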
Linus Torvalds1da177e2005-04-16 15:20:36 -07001449
1450/**
1451 * call_netdevice_notifiers - call all network notifier blocks
1452 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001453 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454 *
1455 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001456 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001457 */
1458
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001459int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001460{
Jiri Pirkoab930472010-04-20 01:45:37 -07001461 ASSERT_RTNL();
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001462 return raw_notifier_call_chain(&netdev_chain, val, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001463}
1464
1465/* When > 0 there are consumers of rx skb time stamps */
1466static atomic_t netstamp_needed = ATOMIC_INIT(0);
1467
1468void net_enable_timestamp(void)
1469{
1470 atomic_inc(&netstamp_needed);
1471}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001472EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001473
1474void net_disable_timestamp(void)
1475{
1476 atomic_dec(&netstamp_needed);
1477}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001478EXPORT_SYMBOL(net_disable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479
Eric Dumazet3b098e22010-05-15 23:57:10 -07001480static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001481{
1482 if (atomic_read(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001483 __net_timestamp(skb);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001484 else
1485 skb->tstamp.tv64 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001486}
1487
Eric Dumazet3b098e22010-05-15 23:57:10 -07001488static inline void net_timestamp_check(struct sk_buff *skb)
1489{
1490 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1491 __net_timestamp(skb);
1492}
1493
Arnd Bergmann44540962009-11-26 06:07:08 +00001494/**
1495 * dev_forward_skb - loopback an skb to another netif
1496 *
1497 * @dev: destination network device
1498 * @skb: buffer to forward
1499 *
1500 * return values:
1501 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001502 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001503 *
1504 * dev_forward_skb can be used for injecting an skb from the
1505 * start_xmit function of one device into the receive queue
1506 * of another device.
1507 *
1508 * The receiving device may be in another namespace, so
1509 * we have to clear all information in the skb that could
1510 * impact namespace isolation.
1511 */
1512int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1513{
1514 skb_orphan(skb);
Ben Greearc736eef2010-07-22 09:54:47 +00001515 nf_reset(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001516
Eric Dumazetcaf586e2010-09-30 21:06:55 +00001517 if (unlikely(!(dev->flags & IFF_UP) ||
David S. Miller2198a102010-10-21 08:43:05 -07001518 (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
Eric Dumazetcaf586e2010-09-30 21:06:55 +00001519 atomic_long_inc(&dev->rx_dropped);
Eric Dumazet6ec82562010-05-06 00:53:53 -07001520 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001521 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001522 }
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001523 skb_set_dev(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001524 skb->tstamp.tv64 = 0;
1525 skb->pkt_type = PACKET_HOST;
1526 skb->protocol = eth_type_trans(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001527 return netif_rx(skb);
1528}
1529EXPORT_SYMBOL_GPL(dev_forward_skb);
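/*
 * Example (illustrative sketch; foo_* names are hypothetical): a
 * veth-style driver's ndo_start_xmit hands the frame straight to its
 * peer device:
 *
 *	static netdev_tx_t foo_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = foo_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
 *			... update tx stats ...
 *		return NETDEV_TX_OK;
 *	}
 */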
1530
Changli Gao71d9dec2010-12-15 19:57:25 +00001531static inline int deliver_skb(struct sk_buff *skb,
1532 struct packet_type *pt_prev,
1533 struct net_device *orig_dev)
1534{
1535 atomic_inc(&skb->users);
1536 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1537}
1538
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539/*
1540 * Support routine. Sends outgoing frames to any network
1541 * taps currently in use.
1542 */
1543
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001544static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545{
1546 struct packet_type *ptype;
Changli Gao71d9dec2010-12-15 19:57:25 +00001547 struct sk_buff *skb2 = NULL;
1548 struct packet_type *pt_prev = NULL;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001549
Linus Torvalds1da177e2005-04-16 15:20:36 -07001550 rcu_read_lock();
1551 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1552 /* Never send packets back to the socket
1553 * they originated from - MvS (miquels@drinkel.ow.org)
1554 */
1555 if ((ptype->dev == dev || !ptype->dev) &&
1556 (ptype->af_packet_priv == NULL ||
1557 (struct sock *)ptype->af_packet_priv != skb->sk)) {
Changli Gao71d9dec2010-12-15 19:57:25 +00001558 if (pt_prev) {
1559 deliver_skb(skb2, pt_prev, skb->dev);
1560 pt_prev = ptype;
1561 continue;
1562 }
1563
1564 skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001565 if (!skb2)
1566 break;
1567
Eric Dumazet70978182010-12-20 21:22:51 +00001568 net_timestamp_set(skb2);
1569
Linus Torvalds1da177e2005-04-16 15:20:36 -07001570 /* skb->nh should be correctly
1571 set by sender, so that the second statement is
1572 just protection against buggy protocols.
1573 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001574 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001575
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001576 if (skb_network_header(skb2) < skb2->data ||
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001577 skb2->network_header > skb2->tail) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001578 if (net_ratelimit())
1579 printk(KERN_CRIT "protocol %04x is "
1580 "buggy, dev %s\n",
Sebastian Andrzej Siewior70777d02010-06-30 10:39:19 -07001581 ntohs(skb2->protocol),
1582 dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001583 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001584 }
1585
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001586 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001587 skb2->pkt_type = PACKET_OUTGOING;
Changli Gao71d9dec2010-12-15 19:57:25 +00001588 pt_prev = ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589 }
1590 }
Changli Gao71d9dec2010-12-15 19:57:25 +00001591 if (pt_prev)
1592 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001593 rcu_read_unlock();
1594}
1595
John Fastabend4f57c082011-01-17 08:06:04 +00001596/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1597 * @dev: Network device
1598 * @txq: number of queues available
1599 *
1600 * If real_num_tx_queues is changed the tc mappings may no longer be
 1601 * valid. To resolve this, verify that each tc mapping remains valid,
 1602 * and if not, reset the mapping. With no priorities mapping to an
 1603 * offset/count pair it will no longer be used. In the worst case, if
 1604 * TC0 is invalid, nothing can be done, so priority mappings are
 1605 * disabled entirely. It is expected that drivers will fix this
 1606 * mapping if they can before calling netif_set_real_num_tx_queues.
1607 */
1608void netif_setup_tc(struct net_device *dev, unsigned int txq)
1609{
1610 int i;
1611 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1612
1613 /* If TC0 is invalidated disable TC mapping */
1614 if (tc->offset + tc->count > txq) {
1615 pr_warning("Number of in use tx queues changed "
1616 "invalidating tc mappings. Priority "
1617 "traffic classification disabled!\n");
1618 dev->num_tc = 0;
1619 return;
1620 }
1621
1622 /* Invalidated prio to tc mappings set to TC0 */
1623 for (i = 1; i < TC_BITMASK + 1; i++) {
1624 int q = netdev_get_prio_tc_map(dev, i);
1625
1626 tc = &dev->tc_to_txq[q];
1627 if (tc->offset + tc->count > txq) {
1628 pr_warning("Number of in use tx queues "
1629 "changed. Priority %i to tc "
1630 "mapping %i is no longer valid "
1631 "setting map to 0\n",
1632 i, q);
1633 netdev_set_prio_tc_map(dev, i, 0);
1634 }
1635 }
1636}
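/*
 * Worked example (illustration): suppose a device advertises two traffic
 * classes with tc_to_txq[0] = {offset 0, count 4} and
 * tc_to_txq[1] = {offset 4, count 4}. If real_num_tx_queues drops to 6,
 * TC1's offset + count (8) exceeds txq (6), so every priority currently
 * mapped to TC1 is reset to TC0 above. If txq dropped below 4, TC0 itself
 * would be invalid and num_tc would be cleared, disabling tc mappings
 * altogether.
 */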
1637
John Fastabendf0796d52010-07-01 13:21:57 +00001638/*
1639 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1640 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1641 */
Tom Herberte6484932010-10-18 18:04:39 +00001642int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
John Fastabendf0796d52010-07-01 13:21:57 +00001643{
Tom Herbert1d24eb42010-11-21 13:17:27 +00001644 int rc;
1645
Tom Herberte6484932010-10-18 18:04:39 +00001646 if (txq < 1 || txq > dev->num_tx_queues)
1647 return -EINVAL;
John Fastabendf0796d52010-07-01 13:21:57 +00001648
Tom Herberte6484932010-10-18 18:04:39 +00001649 if (dev->reg_state == NETREG_REGISTERED) {
1650 ASSERT_RTNL();
1651
Tom Herbert1d24eb42010-11-21 13:17:27 +00001652 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1653 txq);
Tom Herbertbf264142010-11-26 08:36:09 +00001654 if (rc)
1655 return rc;
1656
John Fastabend4f57c082011-01-17 08:06:04 +00001657 if (dev->num_tc)
1658 netif_setup_tc(dev, txq);
1659
Tom Herberte6484932010-10-18 18:04:39 +00001660 if (txq < dev->real_num_tx_queues)
1661 qdisc_reset_all_tx_gt(dev, txq);
John Fastabendf0796d52010-07-01 13:21:57 +00001662 }
Tom Herberte6484932010-10-18 18:04:39 +00001663
1664 dev->real_num_tx_queues = txq;
1665 return 0;
John Fastabendf0796d52010-07-01 13:21:57 +00001666}
1667EXPORT_SYMBOL(netif_set_real_num_tx_queues);
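/*
 * Example (illustrative sketch): a driver shrinking its active queue set
 * after a reconfiguration, with RTNL held as required once the device is
 * registered (new_txq is a hypothetical value in [1, dev->num_tx_queues]):
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, new_txq);
 *	rtnl_unlock();
 */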
Denis Vlasenko56079432006-03-29 15:57:29 -08001668
Ben Hutchings62fe0b42010-09-27 08:24:33 +00001669#ifdef CONFIG_RPS
1670/**
1671 * netif_set_real_num_rx_queues - set actual number of RX queues used
1672 * @dev: Network device
1673 * @rxq: Actual number of RX queues
1674 *
1675 * This must be called either with the rtnl_lock held or before
1676 * registration of the net device. Returns 0 on success, or a
Ben Hutchings4e7f7952010-10-08 10:33:39 -07001677 * negative error code. If called before registration, it always
1678 * succeeds.
Ben Hutchings62fe0b42010-09-27 08:24:33 +00001679 */
1680int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1681{
1682 int rc;
1683
Tom Herbertbd25fa72010-10-18 18:00:16 +00001684 if (rxq < 1 || rxq > dev->num_rx_queues)
1685 return -EINVAL;
1686
Ben Hutchings62fe0b42010-09-27 08:24:33 +00001687 if (dev->reg_state == NETREG_REGISTERED) {
1688 ASSERT_RTNL();
1689
Ben Hutchings62fe0b42010-09-27 08:24:33 +00001690 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1691 rxq);
1692 if (rc)
1693 return rc;
Ben Hutchings62fe0b42010-09-27 08:24:33 +00001694 }
1695
1696 dev->real_num_rx_queues = rxq;
1697 return 0;
1698}
1699EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1700#endif
1701
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001702static inline void __netif_reschedule(struct Qdisc *q)
1703{
1704 struct softnet_data *sd;
1705 unsigned long flags;
1706
1707 local_irq_save(flags);
1708 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00001709 q->next_sched = NULL;
1710 *sd->output_queue_tailp = q;
1711 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001712 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1713 local_irq_restore(flags);
1714}
1715
David S. Miller37437bb2008-07-16 02:15:04 -07001716void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08001717{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001718 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1719 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08001720}
1721EXPORT_SYMBOL(__netif_schedule);
1722
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001723void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08001724{
David S. Miller3578b0c2010-08-03 00:24:04 -07001725 if (atomic_dec_and_test(&skb->users)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001726 struct softnet_data *sd;
1727 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08001728
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001729 local_irq_save(flags);
1730 sd = &__get_cpu_var(softnet_data);
1731 skb->next = sd->completion_queue;
1732 sd->completion_queue = skb;
1733 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1734 local_irq_restore(flags);
1735 }
Denis Vlasenko56079432006-03-29 15:57:29 -08001736}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001737EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08001738
1739void dev_kfree_skb_any(struct sk_buff *skb)
1740{
1741 if (in_irq() || irqs_disabled())
1742 dev_kfree_skb_irq(skb);
1743 else
1744 dev_kfree_skb(skb);
1745}
1746EXPORT_SYMBOL(dev_kfree_skb_any);
1747
1748
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001749/**
1750 * netif_device_detach - mark device as removed
1751 * @dev: network device
1752 *
1753 * Mark the device as removed from the system and therefore no longer available.
1754 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001755void netif_device_detach(struct net_device *dev)
1756{
1757 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1758 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001759 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001760 }
1761}
1762EXPORT_SYMBOL(netif_device_detach);
1763
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001764/**
1765 * netif_device_attach - mark device as attached
1766 * @dev: network device
1767 *
1768 * Mark the device as attached to the system and restart it if needed.
1769 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001770void netif_device_attach(struct net_device *dev)
1771{
1772 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1773 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001774 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001775 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001776 }
1777}
1778EXPORT_SYMBOL(netif_device_attach);
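/*
 * Example (illustrative sketch; foo_* names are hypothetical): the usual
 * suspend/resume pairing for these helpers in a driver:
 *
 *	static int foo_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *ndev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(ndev);
 *		... stop hardware ...
 *		return 0;
 *	}
 *
 *	static int foo_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *ndev = pci_get_drvdata(pdev);
 *
 *		... restart hardware ...
 *		netif_device_attach(ndev);
 *		return 0;
 *	}
 */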
1779
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001780/**
1781 * skb_set_dev - assign a new device to a buffer
1782 * @skb: buffer for the new device
1783 * @dev: network device
1784 *
1785 * If an skb is owned by a device already, we have to reset
1786 * all data private to the namespace a device belongs to
1787 * before assigning it a new device.
1788 */
1789#ifdef CONFIG_NET_NS
1790void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1791{
1792 skb_dst_drop(skb);
1793 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1794 secpath_reset(skb);
1795 nf_reset(skb);
1796 skb_init_secmark(skb);
1797 skb->mark = 0;
1798 skb->priority = 0;
1799 skb->nf_trace = 0;
1800 skb->ipvs_property = 0;
1801#ifdef CONFIG_NET_SCHED
1802 skb->tc_index = 0;
1803#endif
1804 }
1805 skb->dev = dev;
1806}
1807EXPORT_SYMBOL(skb_set_dev);
1808#endif /* CONFIG_NET_NS */
1809
Linus Torvalds1da177e2005-04-16 15:20:36 -07001810/*
1811 * Invalidate hardware checksum when packet is to be mangled, and
1812 * complete checksum manually on outgoing path.
1813 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07001814int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815{
Al Virod3bc23e2006-11-14 21:24:49 -08001816 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07001817 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001818
Patrick McHardy84fa7932006-08-29 16:44:56 -07001819 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07001820 goto out_set_summed;
1821
1822 if (unlikely(skb_shinfo(skb)->gso_size)) {
Herbert Xua430a432006-07-08 13:34:56 -07001823 /* Let GSO fix up the checksum. */
1824 goto out_set_summed;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001825 }
1826
Michał Mirosław55508d62010-12-14 15:24:08 +00001827 offset = skb_checksum_start_offset(skb);
Herbert Xua0308472007-10-15 01:47:15 -07001828 BUG_ON(offset >= skb_headlen(skb));
1829 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1830
1831 offset += skb->csum_offset;
1832 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1833
1834 if (skb_cloned(skb) &&
1835 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001836 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1837 if (ret)
1838 goto out;
1839 }
1840
Herbert Xua0308472007-10-15 01:47:15 -07001841 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07001842out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001843 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001844out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001845 return ret;
1846}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001847EXPORT_SYMBOL(skb_checksum_help);
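/*
 * Example (illustrative sketch; foo_hw_can_csum() is a hypothetical
 * capability test): a driver whose hardware cannot checksum a particular
 * frame falls back to software before transmission:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !foo_hw_can_csum(skb) &&
 *	    skb_checksum_help(skb))
 *		goto drop;
 */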
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001849/**
1850 * skb_gso_segment - Perform segmentation on skb.
1851 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07001852 * @features: features for the output path (see dev->features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001853 *
1854 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07001855 *
1856 * It may return NULL if the skb requires no segmentation. This is
1857 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001858 */
Herbert Xu576a30e2006-06-27 13:22:38 -07001859struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001860{
1861 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1862 struct packet_type *ptype;
Al Viro252e3342006-11-14 20:48:11 -08001863 __be16 type = skb->protocol;
Jesse Grossc8d5bcd2010-10-29 12:14:54 +00001864 int vlan_depth = ETH_HLEN;
Herbert Xua430a432006-07-08 13:34:56 -07001865 int err;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001866
Jesse Grossc8d5bcd2010-10-29 12:14:54 +00001867 while (type == htons(ETH_P_8021Q)) {
1868 struct vlan_hdr *vh;
Jesse Gross7b9c6092010-10-20 13:56:04 +00001869
Jesse Grossc8d5bcd2010-10-29 12:14:54 +00001870 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
Jesse Gross7b9c6092010-10-20 13:56:04 +00001871 return ERR_PTR(-EINVAL);
1872
Jesse Grossc8d5bcd2010-10-29 12:14:54 +00001873 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1874 type = vh->h_vlan_encapsulated_proto;
1875 vlan_depth += VLAN_HLEN;
Jesse Gross7b9c6092010-10-20 13:56:04 +00001876 }
1877
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001878 skb_reset_mac_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001879 skb->mac_len = skb->network_header - skb->mac_header;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001880 __skb_pull(skb, skb->mac_len);
1881
Herbert Xu67fd1a72009-01-19 16:26:44 -08001882 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1883 struct net_device *dev = skb->dev;
1884 struct ethtool_drvinfo info = {};
1885
1886 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1887 dev->ethtool_ops->get_drvinfo(dev, &info);
1888
Joe Perchesb194a362010-10-30 11:08:52 +00001889 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
Herbert Xu67fd1a72009-01-19 16:26:44 -08001890 info.driver, dev ? dev->features : 0L,
1891 skb->sk ? skb->sk->sk_route_caps : 0L,
1892 skb->len, skb->data_len, skb->ip_summed);
1893
Herbert Xua430a432006-07-08 13:34:56 -07001894 if (skb_header_cloned(skb) &&
1895 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1896 return ERR_PTR(err);
1897 }
1898
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001899 rcu_read_lock();
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08001900 list_for_each_entry_rcu(ptype,
1901 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001902 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
Patrick McHardy84fa7932006-08-29 16:44:56 -07001903 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
Herbert Xua430a432006-07-08 13:34:56 -07001904 err = ptype->gso_send_check(skb);
1905 segs = ERR_PTR(err);
1906 if (err || skb_gso_ok(skb, features))
1907 break;
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001908 __skb_push(skb, (skb->data -
1909 skb_network_header(skb)));
Herbert Xua430a432006-07-08 13:34:56 -07001910 }
Herbert Xu576a30e2006-06-27 13:22:38 -07001911 segs = ptype->gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001912 break;
1913 }
1914 }
1915 rcu_read_unlock();
1916
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001917 __skb_push(skb, skb->data - skb_mac_header(skb));
Herbert Xu576a30e2006-06-27 13:22:38 -07001918
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001919 return segs;
1920}
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001921EXPORT_SYMBOL(skb_gso_segment);
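/*
 * Example (illustrative sketch): callers segment and then walk the
 * returned list, much like dev_gso_segment() further below:
 *
 *	segs = skb_gso_segment(skb, features);
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *	if (!segs)
 *		return 0;	header verification only, send skb as-is
 *	for (; segs; segs = segs->next)
 *		... transmit each segment ...
 */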
1922
Herbert Xufb286bb2005-11-10 13:01:24 -08001923/* Take action when hardware reception checksum errors are detected. */
1924#ifdef CONFIG_BUG
1925void netdev_rx_csum_fault(struct net_device *dev)
1926{
1927 if (net_ratelimit()) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001928 printk(KERN_ERR "%s: hw csum failure.\n",
Stephen Hemminger246a4212005-12-08 15:21:39 -08001929 dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08001930 dump_stack();
1931 }
1932}
1933EXPORT_SYMBOL(netdev_rx_csum_fault);
1934#endif
1935
Linus Torvalds1da177e2005-04-16 15:20:36 -07001936/* Actually, we should eliminate this check as soon as we know that:
 1937 * 1. An IOMMU is present and can map all the memory.
1938 * 2. No high memory really exists on this machine.
1939 */
1940
Eric Dumazet9092c652010-04-02 13:34:49 -07001941static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001942{
Herbert Xu3d3a8532006-06-27 13:33:10 -07001943#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07001944 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001945 if (!(dev->features & NETIF_F_HIGHDMA)) {
1946 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1947 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1948 return 1;
1949 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001950
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001951 if (PCI_DMA_BUS_IS_PHYS) {
1952 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001953
Eric Dumazet9092c652010-04-02 13:34:49 -07001954 if (!pdev)
1955 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001956 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1957 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1958 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1959 return 1;
1960 }
1961 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07001962#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001963 return 0;
1964}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001965
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001966struct dev_gso_cb {
1967 void (*destructor)(struct sk_buff *skb);
1968};
1969
1970#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1971
1972static void dev_gso_skb_destructor(struct sk_buff *skb)
1973{
1974 struct dev_gso_cb *cb;
1975
1976 do {
1977 struct sk_buff *nskb = skb->next;
1978
1979 skb->next = nskb->next;
1980 nskb->next = NULL;
1981 kfree_skb(nskb);
1982 } while (skb->next);
1983
1984 cb = DEV_GSO_CB(skb);
1985 if (cb->destructor)
1986 cb->destructor(skb);
1987}
1988
1989/**
1990 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1991 * @skb: buffer to segment
Jesse Gross91ecb632011-01-09 06:23:33 +00001992 * @features: device features as applicable to this skb
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001993 *
1994 * This function segments the given skb and stores the list of segments
1995 * in skb->next.
1996 */
Jesse Gross91ecb632011-01-09 06:23:33 +00001997static int dev_gso_segment(struct sk_buff *skb, int features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001998{
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001999 struct sk_buff *segs;
2000
Herbert Xu576a30e2006-06-27 13:22:38 -07002001 segs = skb_gso_segment(skb, features);
2002
2003 /* Verifying header integrity only. */
2004 if (!segs)
2005 return 0;
2006
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07002007 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002008 return PTR_ERR(segs);
2009
2010 skb->next = segs;
2011 DEV_GSO_CB(skb)->destructor = skb->destructor;
2012 skb->destructor = dev_gso_skb_destructor;
2013
2014 return 0;
2015}
2016
Eric Dumazetfc6055a2010-04-16 12:18:22 +00002017/*
2018 * Try to orphan skb early, right before transmission by the device.
Oliver Hartkopp2244d072010-08-17 08:59:14 +00002019 * We cannot orphan the skb if a tx timestamp is requested or the sk reference
 2020 * is needed at the driver level for other reasons, e.g. see net/can/raw.c
Eric Dumazetfc6055a2010-04-16 12:18:22 +00002021 */
2022static inline void skb_orphan_try(struct sk_buff *skb)
2023{
Eric Dumazet87fd3082010-07-13 05:24:20 +00002024 struct sock *sk = skb->sk;
2025
Oliver Hartkopp2244d072010-08-17 08:59:14 +00002026 if (sk && !skb_shinfo(skb)->tx_flags) {
Eric Dumazet87fd3082010-07-13 05:24:20 +00002027 /* skb_tx_hash() wont be able to get sk.
2028 * We copy sk_hash into skb->rxhash
2029 */
2030 if (!skb->rxhash)
2031 skb->rxhash = sk->sk_hash;
Eric Dumazetfc6055a2010-04-16 12:18:22 +00002032 skb_orphan(skb);
Eric Dumazet87fd3082010-07-13 05:24:20 +00002033 }
Eric Dumazetfc6055a2010-04-16 12:18:22 +00002034}
2035
Jesse Gross03634662011-01-09 06:23:35 +00002036static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2037{
2038 return ((features & NETIF_F_GEN_CSUM) ||
2039 ((features & NETIF_F_V4_CSUM) &&
2040 protocol == htons(ETH_P_IP)) ||
2041 ((features & NETIF_F_V6_CSUM) &&
2042 protocol == htons(ETH_P_IPV6)) ||
2043 ((features & NETIF_F_FCOE_CRC) &&
2044 protocol == htons(ETH_P_FCOE)));
2045}
2046
Jesse Grossf01a5232011-01-09 06:23:31 +00002047static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2048{
2049 if (!can_checksum_protocol(features, protocol)) {
2050 features &= ~NETIF_F_ALL_CSUM;
2051 features &= ~NETIF_F_SG;
2052 } else if (illegal_highdma(skb->dev, skb)) {
2053 features &= ~NETIF_F_SG;
2054 }
2055
2056 return features;
2057}
2058
2059int netif_skb_features(struct sk_buff *skb)
Jesse Gross58e998c2010-10-29 12:14:55 +00002060{
2061 __be16 protocol = skb->protocol;
Jesse Grossf01a5232011-01-09 06:23:31 +00002062 int features = skb->dev->features;
Jesse Gross58e998c2010-10-29 12:14:55 +00002063
2064 if (protocol == htons(ETH_P_8021Q)) {
2065 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2066 protocol = veh->h_vlan_encapsulated_proto;
Jesse Grossf01a5232011-01-09 06:23:31 +00002067 } else if (!vlan_tx_tag_present(skb)) {
2068 return harmonize_features(skb, protocol, features);
2069 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002070
Jesse Gross6ee400a2011-01-17 20:46:00 +00002071 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
Jesse Grossf01a5232011-01-09 06:23:31 +00002072
2073 if (protocol != htons(ETH_P_8021Q)) {
2074 return harmonize_features(skb, protocol, features);
2075 } else {
2076 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
Jesse Gross6ee400a2011-01-17 20:46:00 +00002077 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
Jesse Grossf01a5232011-01-09 06:23:31 +00002078 return harmonize_features(skb, protocol, features);
2079 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002080}
Jesse Grossf01a5232011-01-09 06:23:31 +00002081EXPORT_SYMBOL(netif_skb_features);
Jesse Gross58e998c2010-10-29 12:14:55 +00002082
John Fastabend6afff0c2010-06-16 14:18:12 +00002083/*
2084 * Returns true if either:
2085 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2086 * 2. skb is fragmented and the device does not support SG, or if
2087 * at least one of the fragments is in highmem and the device does
 2088 * not support DMA from it.
2089 */
2090static inline int skb_needs_linearize(struct sk_buff *skb,
Jesse Gross02932ce2011-01-09 06:23:34 +00002091 int features)
John Fastabend6afff0c2010-06-16 14:18:12 +00002092{
Jesse Gross02932ce2011-01-09 06:23:34 +00002093 return skb_is_nonlinear(skb) &&
2094 ((skb_has_frag_list(skb) &&
2095 !(features & NETIF_F_FRAGLIST)) ||
Jesse Grosse1e78db2010-10-29 12:14:53 +00002096 (skb_shinfo(skb)->nr_frags &&
Jesse Gross02932ce2011-01-09 06:23:34 +00002097 !(features & NETIF_F_SG)));
John Fastabend6afff0c2010-06-16 14:18:12 +00002098}
2099
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002100int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2101 struct netdev_queue *txq)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002102{
Stephen Hemminger00829822008-11-20 20:14:53 -08002103 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00002104 int rc = NETDEV_TX_OK;
Stephen Hemminger00829822008-11-20 20:14:53 -08002105
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002106 if (likely(!skb->next)) {
Jesse Grossfc741212011-01-09 06:23:32 +00002107 int features;
2108
Eric Dumazet93f154b2009-05-18 22:19:19 -07002109 /*
2110 * If the device doesn't need skb->dst, release it right now while
 2111 * it's hot in this cpu's cache.
2112 */
Eric Dumazetadf30902009-06-02 05:19:30 +00002113 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2114 skb_dst_drop(skb);
2115
Eric Dumazet15c2d75f2010-12-07 00:30:37 +00002116 if (!list_empty(&ptype_all))
2117 dev_queue_xmit_nit(skb, dev);
2118
Eric Dumazetfc6055a2010-04-16 12:18:22 +00002119 skb_orphan_try(skb);
David S. Miller9ccb8972010-04-22 01:02:07 -07002120
Jesse Grossfc741212011-01-09 06:23:32 +00002121 features = netif_skb_features(skb);
2122
Jesse Gross7b9c6092010-10-20 13:56:04 +00002123 if (vlan_tx_tag_present(skb) &&
Jesse Grossfc741212011-01-09 06:23:32 +00002124 !(features & NETIF_F_HW_VLAN_TX)) {
Jesse Gross7b9c6092010-10-20 13:56:04 +00002125 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2126 if (unlikely(!skb))
2127 goto out;
2128
2129 skb->vlan_tci = 0;
2130 }
2131
Jesse Grossfc741212011-01-09 06:23:32 +00002132 if (netif_needs_gso(skb, features)) {
Jesse Gross91ecb632011-01-09 06:23:33 +00002133 if (unlikely(dev_gso_segment(skb, features)))
David S. Miller9ccb8972010-04-22 01:02:07 -07002134 goto out_kfree_skb;
2135 if (skb->next)
2136 goto gso;
John Fastabend6afff0c2010-06-16 14:18:12 +00002137 } else {
Jesse Gross02932ce2011-01-09 06:23:34 +00002138 if (skb_needs_linearize(skb, features) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002139 __skb_linearize(skb))
2140 goto out_kfree_skb;
2141
2142 /* If packet is not checksummed and device does not
2143 * support checksumming for this protocol, complete
2144 * checksumming here.
2145 */
2146 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Michał Mirosław55508d62010-12-14 15:24:08 +00002147 skb_set_transport_header(skb,
2148 skb_checksum_start_offset(skb));
Jesse Gross03634662011-01-09 06:23:35 +00002149 if (!(features & NETIF_F_ALL_CSUM) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002150 skb_checksum_help(skb))
2151 goto out_kfree_skb;
2152 }
David S. Miller9ccb8972010-04-22 01:02:07 -07002153 }
2154
Patrick Ohlyac45f602009-02-12 05:03:37 +00002155 rc = ops->ndo_start_xmit(skb, dev);
Koki Sanagicf66ba52010-08-23 18:45:02 +09002156 trace_net_dev_xmit(skb, rc);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002157 if (rc == NETDEV_TX_OK)
Eric Dumazet08baf562009-05-25 22:58:01 -07002158 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00002159 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002160 }
2161
Herbert Xu576a30e2006-06-27 13:22:38 -07002162gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002163 do {
2164 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002165
2166 skb->next = nskb->next;
2167 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00002168
2169 /*
2170 * If the device doesn't need nskb->dst, release it right now while
 2171 * it's hot in this cpu's cache.
2172 */
2173 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2174 skb_dst_drop(nskb);
2175
Stephen Hemminger00829822008-11-20 20:14:53 -08002176 rc = ops->ndo_start_xmit(nskb, dev);
Koki Sanagicf66ba52010-08-23 18:45:02 +09002177 trace_net_dev_xmit(nskb, rc);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002178 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002179 if (rc & ~NETDEV_TX_MASK)
2180 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07002181 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002182 skb->next = nskb;
2183 return rc;
2184 }
Eric Dumazet08baf562009-05-25 22:58:01 -07002185 txq_trans_update(txq);
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002186 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07002187 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002188 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002189
Patrick McHardy572a9d72009-11-10 06:14:14 +00002190out_kfree_gso_skb:
2191 if (likely(skb->next == NULL))
2192 skb->destructor = DEV_GSO_CB(skb)->destructor;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002193out_kfree_skb:
2194 kfree_skb(skb);
Jesse Gross7b9c6092010-10-20 13:56:04 +00002195out:
Patrick McHardy572a9d72009-11-10 06:14:14 +00002196 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002197}
2198
Tom Herbert0a9627f2010-03-16 08:03:29 +00002199static u32 hashrnd __read_mostly;
David S. Millerb6b2fed2008-07-21 09:48:06 -07002200
Vladislav Zolotarova3d22a62010-12-13 06:27:10 +00002201/*
2202 * Returns a Tx hash based on the given packet descriptor and a number of
 2203 * Tx queues to be used as a distribution range.
2204 */
2205u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2206 unsigned int num_tx_queues)
David S. Miller8f0f2222008-07-15 03:47:03 -07002207{
David S. Miller70192982009-01-27 16:34:47 -08002208 u32 hash;
John Fastabend4f57c082011-01-17 08:06:04 +00002209 u16 qoffset = 0;
2210 u16 qcount = num_tx_queues;
David S. Millerb6b2fed2008-07-21 09:48:06 -07002211
David S. Miller513de112009-05-03 14:43:10 -07002212 if (skb_rx_queue_recorded(skb)) {
2213 hash = skb_get_rx_queue(skb);
Vladislav Zolotarova3d22a62010-12-13 06:27:10 +00002214 while (unlikely(hash >= num_tx_queues))
2215 hash -= num_tx_queues;
David S. Miller513de112009-05-03 14:43:10 -07002216 return hash;
2217 }
Eric Dumazetec581f62009-05-01 09:05:06 -07002218
John Fastabend4f57c082011-01-17 08:06:04 +00002219 if (dev->num_tc) {
2220 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2221 qoffset = dev->tc_to_txq[tc].offset;
2222 qcount = dev->tc_to_txq[tc].count;
2223 }
2224
Eric Dumazetec581f62009-05-01 09:05:06 -07002225 if (skb->sk && skb->sk->sk_hash)
David S. Miller70192982009-01-27 16:34:47 -08002226 hash = skb->sk->sk_hash;
Eric Dumazetec581f62009-05-01 09:05:06 -07002227 else
Eric Dumazet87fd3082010-07-13 05:24:20 +00002228 hash = (__force u16) skb->protocol ^ skb->rxhash;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002229 hash = jhash_1word(hash, hashrnd);
David S. Millerd5a9e242009-01-27 16:22:11 -08002230
John Fastabend4f57c082011-01-17 08:06:04 +00002231 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
David S. Miller8f0f2222008-07-15 03:47:03 -07002232}
Vladislav Zolotarova3d22a62010-12-13 06:27:10 +00002233EXPORT_SYMBOL(__skb_tx_hash);
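/*
 * Worked example (illustration): the final scale step maps a 32-bit hash
 * onto qcount queues without a modulo. With hash = 0x80000000 and
 * qcount = 8, ((u64)hash * 8) >> 32 == 4, so the skb is steered to
 * queue qoffset + 4.
 */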
David S. Miller8f0f2222008-07-15 03:47:03 -07002234
Eric Dumazeted046422009-11-13 21:54:04 +00002235static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2236{
2237 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2238 if (net_ratelimit()) {
Eric Dumazet7a161ea2010-04-08 21:26:13 +00002239 pr_warning("%s selects TX queue %d, but "
2240 "real number of TX queues is %d\n",
2241 dev->name, queue_index, dev->real_num_tx_queues);
Eric Dumazeted046422009-11-13 21:54:04 +00002242 }
2243 return 0;
2244 }
2245 return queue_index;
2246}
2247
Tom Herbert1d24eb42010-11-21 13:17:27 +00002248static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2249{
Tom Herbertbf264142010-11-26 08:36:09 +00002250#ifdef CONFIG_XPS
Tom Herbert1d24eb42010-11-21 13:17:27 +00002251 struct xps_dev_maps *dev_maps;
2252 struct xps_map *map;
2253 int queue_index = -1;
2254
2255 rcu_read_lock();
2256 dev_maps = rcu_dereference(dev->xps_maps);
2257 if (dev_maps) {
2258 map = rcu_dereference(
2259 dev_maps->cpu_map[raw_smp_processor_id()]);
2260 if (map) {
2261 if (map->len == 1)
2262 queue_index = map->queues[0];
2263 else {
2264 u32 hash;
2265 if (skb->sk && skb->sk->sk_hash)
2266 hash = skb->sk->sk_hash;
2267 else
2268 hash = (__force u16) skb->protocol ^
2269 skb->rxhash;
2270 hash = jhash_1word(hash, hashrnd);
2271 queue_index = map->queues[
2272 ((u64)hash * map->len) >> 32];
2273 }
2274 if (unlikely(queue_index >= dev->real_num_tx_queues))
2275 queue_index = -1;
2276 }
2277 }
2278 rcu_read_unlock();
2279
2280 return queue_index;
2281#else
2282 return -1;
2283#endif
2284}
2285
David S. Millere8a04642008-07-17 00:34:19 -07002286static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2287 struct sk_buff *skb)
2288{
Tom Herbertb0f77d02010-07-14 20:50:29 -07002289 int queue_index;
Helmut Schaadeabc772010-09-03 02:39:56 +00002290 const struct net_device_ops *ops = dev->netdev_ops;
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002291
Tom Herbert3853b582010-11-21 13:17:29 +00002292 if (dev->real_num_tx_queues == 1)
2293 queue_index = 0;
2294 else if (ops->ndo_select_queue) {
Helmut Schaadeabc772010-09-03 02:39:56 +00002295 queue_index = ops->ndo_select_queue(dev, skb);
2296 queue_index = dev_cap_txqueue(dev, queue_index);
2297 } else {
2298 struct sock *sk = skb->sk;
2299 queue_index = sk_tx_queue_get(sk);
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002300
Tom Herbert3853b582010-11-21 13:17:29 +00002301 if (queue_index < 0 || skb->ooo_okay ||
2302 queue_index >= dev->real_num_tx_queues) {
2303 int old_index = queue_index;
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002304
Tom Herbert1d24eb42010-11-21 13:17:27 +00002305 queue_index = get_xps_queue(dev, skb);
2306 if (queue_index < 0)
2307 queue_index = skb_tx_hash(dev, skb);
Tom Herbert3853b582010-11-21 13:17:29 +00002308
2309 if (queue_index != old_index && sk) {
2310 struct dst_entry *dst =
2311 rcu_dereference_check(sk->sk_dst_cache, 1);
Eric Dumazet8728c542010-04-11 21:18:17 +00002312
2313 if (dst && skb_dst(skb) == dst)
2314 sk_tx_queue_set(sk, queue_index);
2315 }
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002316 }
2317 }
David S. Millereae792b2008-07-15 03:03:33 -07002318
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002319 skb_set_queue_mapping(skb, queue_index);
2320 return netdev_get_tx_queue(dev, queue_index);
David S. Millere8a04642008-07-17 00:34:19 -07002321}
2322
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002323static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2324 struct net_device *dev,
2325 struct netdev_queue *txq)
2326{
2327 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002328 bool contended;
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002329 int rc;
2330
Eric Dumazeta2da5702011-01-20 03:48:19 +00002331 qdisc_skb_cb(skb)->pkt_len = skb->len;
2332 qdisc_calculate_pkt_len(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002333 /*
2334 * Heuristic to force contended enqueues to serialize on a
2335 * separate lock before trying to get qdisc main lock.
2336 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2337 * and dequeue packets faster.
2338 */
Eric Dumazeta2da5702011-01-20 03:48:19 +00002339 contended = qdisc_is_running(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002340 if (unlikely(contended))
2341 spin_lock(&q->busylock);
2342
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002343 spin_lock(root_lock);
2344 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2345 kfree_skb(skb);
2346 rc = NET_XMIT_DROP;
2347 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07002348 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002349 /*
2350 * This is a work-conserving queue; there are no old skbs
2351 * waiting to be sent out; and the qdisc is not running -
2352 * xmit the skb directly.
2353 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002354 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2355 skb_dst_force(skb);
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002356
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002357 qdisc_bstats_update(q, skb);
2358
Eric Dumazet79640a42010-06-02 05:09:29 -07002359 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2360 if (unlikely(contended)) {
2361 spin_unlock(&q->busylock);
2362 contended = false;
2363 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002364 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002365 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07002366 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002367
2368 rc = NET_XMIT_SUCCESS;
2369 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002370 skb_dst_force(skb);
Eric Dumazeta2da5702011-01-20 03:48:19 +00002371 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
Eric Dumazet79640a42010-06-02 05:09:29 -07002372 if (qdisc_run_begin(q)) {
2373 if (unlikely(contended)) {
2374 spin_unlock(&q->busylock);
2375 contended = false;
2376 }
2377 __qdisc_run(q);
2378 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002379 }
2380 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07002381 if (unlikely(contended))
2382 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002383 return rc;
2384}
2385
Eric Dumazet745e20f2010-09-29 13:23:09 -07002386static DEFINE_PER_CPU(int, xmit_recursion);
David S. Miller11a766c2010-10-25 12:51:55 -07002387#define RECURSION_LIMIT 10
Eric Dumazet745e20f2010-09-29 13:23:09 -07002388
Dave Jonesd29f7492008-07-22 14:09:06 -07002389/**
2390 * dev_queue_xmit - transmit a buffer
2391 * @skb: buffer to transmit
2392 *
2393 * Queue a buffer for transmission to a network device. The caller must
2394 * have set the device and priority and built the buffer before calling
2395 * this function. The function can be called from an interrupt.
2396 *
2397 * A negative errno code is returned on a failure. A success does not
2398 * guarantee the frame will be transmitted as it may be dropped due
2399 * to congestion or traffic shaping.
2400 *
2401 * -----------------------------------------------------------------------------------
2402 * I notice this method can also return errors from the queue disciplines,
2403 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2404 * be positive.
2405 *
2406 * Regardless of the return value, the skb is consumed, so it is currently
2407 * difficult to retry a send to this method. (You can bump the ref count
2408 * before sending to hold a reference for retry if you are careful.)
2409 *
2410 * When calling this method, interrupts MUST be enabled. This is because
2411 * the BH enable code must have IRQs enabled so that it will not deadlock.
2412 * --BLG
2413 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002414int dev_queue_xmit(struct sk_buff *skb)
2415{
2416 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002417 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002418 struct Qdisc *q;
2419 int rc = -ENOMEM;
2420
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002421 /* Disable soft irqs for various locks below. Also
2422 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002423 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002424 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002425
David S. Millereae792b2008-07-15 03:03:33 -07002426 txq = dev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002427 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002428
Linus Torvalds1da177e2005-04-16 15:20:36 -07002429#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002430 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002431#endif
Koki Sanagicf66ba52010-08-23 18:45:02 +09002432 trace_net_dev_queue(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002433 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002434 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002435 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002436 }
2437
2438 /* The device has no queue. Common case for software devices:
2439 loopback, all sorts of tunnels...
2440
Herbert Xu932ff272006-06-09 12:20:56 -07002441 Really, it is unlikely that netif_tx_lock protection is necessary
2442 here. (f.e. loopback and IP tunnels are clean ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002443 counters.)
2444 However, it is possible that they rely on protection
2445 made by us here.
2446
2447 Check this and take the lock. It is not prone to deadlocks.
 2448 Alternatively, the noqueue qdisc path is even simpler 8)
2449 */
2450 if (dev->flags & IFF_UP) {
2451 int cpu = smp_processor_id(); /* ok because BHs are off */
2452
David S. Millerc773e842008-07-08 23:13:53 -07002453 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002454
Eric Dumazet745e20f2010-09-29 13:23:09 -07002455 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2456 goto recursion_alert;
2457
David S. Millerc773e842008-07-08 23:13:53 -07002458 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002459
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002460 if (!netif_tx_queue_stopped(txq)) {
Eric Dumazet745e20f2010-09-29 13:23:09 -07002461 __this_cpu_inc(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002462 rc = dev_hard_start_xmit(skb, dev, txq);
Eric Dumazet745e20f2010-09-29 13:23:09 -07002463 __this_cpu_dec(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002464 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002465 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002466 goto out;
2467 }
2468 }
David S. Millerc773e842008-07-08 23:13:53 -07002469 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002470 if (net_ratelimit())
2471 printk(KERN_CRIT "Virtual device %s asks to "
2472 "queue packet!\n", dev->name);
2473 } else {
2474 /* Recursion is detected! It is possible,
Eric Dumazet745e20f2010-09-29 13:23:09 -07002475 * unfortunately
2476 */
2477recursion_alert:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002478 if (net_ratelimit())
2479 printk(KERN_CRIT "Dead loop on virtual device "
2480 "%s, fix it urgently!\n", dev->name);
2481 }
2482 }
2483
2484 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002485 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002486
Linus Torvalds1da177e2005-04-16 15:20:36 -07002487 kfree_skb(skb);
2488 return rc;
2489out:
Herbert Xud4828d82006-06-22 02:28:18 -07002490 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491 return rc;
2492}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002493EXPORT_SYMBOL(dev_queue_xmit);
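/*
 * Example (illustrative sketch): a caller that builds its own frame sets
 * the device and priority before handing the skb over; the skb is
 * consumed regardless of the return value:
 *
 *	skb->dev = dev;
 *	skb->priority = TC_PRIO_CONTROL;
 *	rc = dev_queue_xmit(skb);
 */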
Linus Torvalds1da177e2005-04-16 15:20:36 -07002494
2495
2496/*=======================================================================
2497 Receiver routines
2498 =======================================================================*/
2499
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002500int netdev_max_backlog __read_mostly = 1000;
Eric Dumazet3b098e22010-05-15 23:57:10 -07002501int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002502int netdev_budget __read_mostly = 300;
2503int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002504
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002505/* Called with irq disabled */
2506static inline void ____napi_schedule(struct softnet_data *sd,
2507 struct napi_struct *napi)
2508{
2509 list_add_tail(&napi->poll_list, &sd->poll_list);
2510 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2511}
2512
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002513/*
2514 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2515 * and src/dst port numbers. Returns a non-zero hash number on success
2516 * and 0 on failure.
2517 */
2518__u32 __skb_get_rxhash(struct sk_buff *skb)
2519{
Changli Gao12fcdef2010-08-17 19:04:32 +00002520 int nhoff, hash = 0, poff;
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002521 struct ipv6hdr *ip6;
2522 struct iphdr *ip;
2523 u8 ip_proto;
2524 u32 addr1, addr2, ihl;
2525 union {
2526 u32 v32;
2527 u16 v16[2];
2528 } ports;
2529
2530 nhoff = skb_network_offset(skb);
2531
2532 switch (skb->protocol) {
2533 case __constant_htons(ETH_P_IP):
2534 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2535 goto done;
2536
Changli Gao10034892010-08-21 06:13:28 +00002537 ip = (struct iphdr *) (skb->data + nhoff);
Changli Gaodbe57752010-08-17 19:01:38 +00002538 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2539 ip_proto = 0;
2540 else
2541 ip_proto = ip->protocol;
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002542 addr1 = (__force u32) ip->saddr;
2543 addr2 = (__force u32) ip->daddr;
2544 ihl = ip->ihl;
2545 break;
2546 case __constant_htons(ETH_P_IPV6):
2547 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2548 goto done;
2549
Changli Gao10034892010-08-21 06:13:28 +00002550 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002551 ip_proto = ip6->nexthdr;
2552 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2553 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2554 ihl = (40 >> 2);
2555 break;
2556 default:
2557 goto done;
2558 }
2559
Changli Gao12fcdef2010-08-17 19:04:32 +00002560 ports.v32 = 0;
2561 poff = proto_ports_offset(ip_proto);
2562 if (poff >= 0) {
2563 nhoff += ihl * 4 + poff;
2564 if (pskb_may_pull(skb, nhoff + 4)) {
2565 ports.v32 = * (__force u32 *) (skb->data + nhoff);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002566 if (ports.v16[1] < ports.v16[0])
2567 swap(ports.v16[0], ports.v16[1]);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002568 }
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002569 }
2570
2571 /* get a consistent hash (same value on both flow directions) */
2572 if (addr2 < addr1)
2573 swap(addr1, addr2);
2574
2575 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2576 if (!hash)
2577 hash = 1;
2578
2579done:
2580 return hash;
2581}
2582EXPORT_SYMBOL(__skb_get_rxhash);
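/*
 * Example (illustrative sketch): most callers go through the
 * skb_get_rxhash() wrapper, which caches the result in skb->rxhash so the
 * headers are only parsed once per skb:
 *
 *	u32 hash = skb_get_rxhash(skb);
 *	if (!hash)
 *		... fall back to another distribution key ...
 */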
2583
Eric Dumazetdf334542010-03-24 19:13:54 +00002584#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002585
2586/* One global table that all flow-based protocols share. */
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002587struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002588EXPORT_SYMBOL(rps_sock_flow_table);
2589
Tom Herbert0a9627f2010-03-16 08:03:29 +00002590/*
2591 * get_rps_cpu is called from netif_receive_skb and returns the target
2592 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002593 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002594 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002595static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2596 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002597{
Tom Herbert0a9627f2010-03-16 08:03:29 +00002598 struct netdev_rx_queue *rxqueue;
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002599 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002600 struct rps_dev_flow_table *flow_table;
2601 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002602 int cpu = -1;
Tom Herbertfec5e652010-04-16 16:01:27 -07002603 u16 tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002604
Tom Herbert0a9627f2010-03-16 08:03:29 +00002605 if (skb_rx_queue_recorded(skb)) {
2606 u16 index = skb_get_rx_queue(skb);
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002607 if (unlikely(index >= dev->real_num_rx_queues)) {
2608 WARN_ONCE(dev->real_num_rx_queues > 1,
2609 "%s received packet on queue %u, but number "
2610 "of RX queues is %u\n",
2611 dev->name, index, dev->real_num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002612 goto done;
2613 }
2614 rxqueue = dev->_rx + index;
2615 } else
2616 rxqueue = dev->_rx;
2617
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002618 map = rcu_dereference(rxqueue->rps_map);
2619 if (map) {
2620 if (map->len == 1) {
Changli Gao6febfca2010-09-03 23:12:37 +00002621 tcpu = map->cpus[0];
2622 if (cpu_online(tcpu))
2623 cpu = tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002624 goto done;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002625 }
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002626 } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002627 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002628 }
2629
Changli Gao2d47b452010-08-17 19:00:56 +00002630 skb_reset_network_header(skb);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002631 if (!skb_get_rxhash(skb))
Tom Herbert0a9627f2010-03-16 08:03:29 +00002632 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002633
Tom Herbertfec5e652010-04-16 16:01:27 -07002634 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2635 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2636 if (flow_table && sock_flow_table) {
2637 u16 next_cpu;
2638 struct rps_dev_flow *rflow;
2639
2640 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2641 tcpu = rflow->cpu;
2642
2643 next_cpu = sock_flow_table->ents[skb->rxhash &
2644 sock_flow_table->mask];
2645
2646 /*
2647 * If the desired CPU (where last recvmsg was done) is
2648 * different from current CPU (one in the rx-queue flow
2649 * table entry), switch if one of the following holds:
2650 * - Current CPU is unset (equal to RPS_NO_CPU).
2651 * - Current CPU is offline.
2652 * - The current CPU's queue tail has advanced beyond the
2653 * last packet that was enqueued using this table entry.
2654 * This guarantees that all previous packets for the flow
2655	 * have been dequeued, thus preserving in-order delivery.
2656 */
2657 if (unlikely(tcpu != next_cpu) &&
2658 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2659 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2660 rflow->last_qtail)) >= 0)) {
2661 tcpu = rflow->cpu = next_cpu;
2662 if (tcpu != RPS_NO_CPU)
2663 rflow->last_qtail = per_cpu(softnet_data,
2664 tcpu).input_queue_head;
2665 }
2666 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2667 *rflowp = rflow;
2668 cpu = tcpu;
2669 goto done;
2670 }
2671 }
2672
Tom Herbert0a9627f2010-03-16 08:03:29 +00002673 if (map) {
Tom Herbertfec5e652010-04-16 16:01:27 -07002674 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00002675
2676 if (cpu_online(tcpu)) {
2677 cpu = tcpu;
2678 goto done;
2679 }
2680 }
2681
2682done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00002683 return cpu;
2684}
2685
Tom Herbert0a9627f2010-03-16 08:03:29 +00002686/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002687static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002688{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002689 struct softnet_data *sd = data;
2690
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002691 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00002692 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002693}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002694
Tom Herbertfec5e652010-04-16 16:01:27 -07002695#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00002696
2697/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002698 * Check if this softnet_data structure belongs to another cpu.
2699 * If yes, queue it to our IPI list and return 1;
2700 * if no, return 0.
2701 */
2702static int rps_ipi_queued(struct softnet_data *sd)
2703{
2704#ifdef CONFIG_RPS
2705 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2706
2707 if (sd != mysd) {
2708 sd->rps_ipi_next = mysd->rps_ipi_list;
2709 mysd->rps_ipi_list = sd;
2710
2711 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2712 return 1;
2713 }
2714#endif /* CONFIG_RPS */
2715 return 0;
2716}
2717
2718/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00002719 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2720 * queue (may be a remote CPU queue).
2721 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002722static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2723 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002724{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002725 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002726 unsigned long flags;
2727
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002728 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002729
2730 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002731
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002732 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002733 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2734 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002735enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002736 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002737 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002738 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00002739 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002740 return NET_RX_SUCCESS;
2741 }
2742
Eric Dumazetebda37c22010-05-06 23:51:21 +00002743		/* Schedule NAPI for the backlog device.
2744		 * We can use a non-atomic operation since we own the queue lock.
2745 */
2746 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002747 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002748 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002749 }
2750 goto enqueue;
2751 }
2752
Changli Gaodee42872010-05-02 05:42:16 +00002753 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002754 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002755
Tom Herbert0a9627f2010-03-16 08:03:29 +00002756 local_irq_restore(flags);
2757
Eric Dumazetcaf586e2010-09-30 21:06:55 +00002758 atomic_long_inc(&skb->dev->rx_dropped);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002759 kfree_skb(skb);
2760 return NET_RX_DROP;
2761}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002762
Linus Torvalds1da177e2005-04-16 15:20:36 -07002763/**
2764 * netif_rx - post buffer to the network code
2765 * @skb: buffer to post
2766 *
2767 * This function receives a packet from a device driver and queues it for
2768 * the upper (protocol) levels to process. It always succeeds. The buffer
2769 * may be dropped during processing for congestion control or by the
2770 * protocol layers.
2771 *
2772 * return values:
2773 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002774 * NET_RX_DROP (packet was dropped)
2775 *
2776 */
2777
2778int netif_rx(struct sk_buff *skb)
2779{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002780 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002781
2782 /* if netpoll wants it, pretend we never saw it */
2783 if (netpoll_rx(skb))
2784 return NET_RX_DROP;
2785
Eric Dumazet3b098e22010-05-15 23:57:10 -07002786 if (netdev_tstamp_prequeue)
2787 net_timestamp_check(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002788
Koki Sanagicf66ba52010-08-23 18:45:02 +09002789 trace_netif_rx(skb);
Eric Dumazetdf334542010-03-24 19:13:54 +00002790#ifdef CONFIG_RPS
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002791 {
Tom Herbertfec5e652010-04-16 16:01:27 -07002792 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002793 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002794
Changli Gaocece1942010-08-07 20:35:43 -07002795 preempt_disable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002796 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07002797
2798 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002799 if (cpu < 0)
2800 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07002801
2802 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2803
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002804 rcu_read_unlock();
Changli Gaocece1942010-08-07 20:35:43 -07002805 preempt_enable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002806 }
2807#else
Tom Herbertfec5e652010-04-16 16:01:27 -07002808 {
2809 unsigned int qtail;
2810 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2811 put_cpu();
2812 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002813#endif
2814 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002815}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002816EXPORT_SYMBOL(netif_rx);
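/*
 * For illustration only: a simple non-NAPI driver receive interrupt
 * hands each completed buffer to this function, roughly (assuming an
 * Ethernet device, so eth_type_trans() sets the protocol):
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * netif_rx() is meant for interrupt context; process-context callers
 * that may leave softirqs pending should use netif_rx_ni() below.
 */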
Linus Torvalds1da177e2005-04-16 15:20:36 -07002817
2818int netif_rx_ni(struct sk_buff *skb)
2819{
2820 int err;
2821
2822 preempt_disable();
2823 err = netif_rx(skb);
2824 if (local_softirq_pending())
2825 do_softirq();
2826 preempt_enable();
2827
2828 return err;
2829}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002830EXPORT_SYMBOL(netif_rx_ni);
2831
Linus Torvalds1da177e2005-04-16 15:20:36 -07002832static void net_tx_action(struct softirq_action *h)
2833{
2834 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2835
2836 if (sd->completion_queue) {
2837 struct sk_buff *clist;
2838
2839 local_irq_disable();
2840 clist = sd->completion_queue;
2841 sd->completion_queue = NULL;
2842 local_irq_enable();
2843
2844 while (clist) {
2845 struct sk_buff *skb = clist;
2846 clist = clist->next;
2847
Ilpo Järvinen547b7922008-07-25 21:43:18 -07002848 WARN_ON(atomic_read(&skb->users));
Koki Sanagi07dc22e2010-08-23 18:46:12 +09002849 trace_kfree_skb(skb, net_tx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002850 __kfree_skb(skb);
2851 }
2852 }
2853
2854 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07002855 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002856
2857 local_irq_disable();
2858 head = sd->output_queue;
2859 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00002860 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002861 local_irq_enable();
2862
2863 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07002864 struct Qdisc *q = head;
2865 spinlock_t *root_lock;
2866
Linus Torvalds1da177e2005-04-16 15:20:36 -07002867 head = head->next_sched;
2868
David S. Miller5fb66222008-08-02 20:02:43 -07002869 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07002870 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002871 smp_mb__before_clear_bit();
2872 clear_bit(__QDISC_STATE_SCHED,
2873 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07002874 qdisc_run(q);
2875 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002876 } else {
David S. Miller195648b2008-08-19 04:00:36 -07002877 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002878 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07002879 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002880 } else {
2881 smp_mb__before_clear_bit();
2882 clear_bit(__QDISC_STATE_SCHED,
2883 &q->state);
2884 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002885 }
2886 }
2887 }
2888}
2889
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002890#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2891 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
Michał Mirosławda678292009-06-05 05:35:28 +00002892/* This hook is defined here for ATM LANE */
2893int (*br_fdb_test_addr_hook)(struct net_device *dev,
2894 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07002895EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00002896#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002897
Linus Torvalds1da177e2005-04-16 15:20:36 -07002898#ifdef CONFIG_NET_CLS_ACT
2899/* TODO: Maybe we should just force sch_ingress to be compiled in
2900 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
2901 * instructions (a compare and 2 extra stores) right now if we don't
2902 * have it on but do have CONFIG_NET_CLS_ACT.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002903 * NOTE: This doesn't stop any functionality; if you don't have
Linus Torvalds1da177e2005-04-16 15:20:36 -07002904 * the ingress scheduler, you just can't add policies on ingress.
2905 *
2906 */
Eric Dumazet24824a02010-10-02 06:11:55 +00002907static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002908{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002909 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07002910 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07002911 int result = TC_ACT_OK;
2912 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002913
Stephen Hemmingerde384832010-08-01 00:33:23 -07002914 if (unlikely(MAX_RED_LOOP < ttl++)) {
2915 if (net_ratelimit())
2916			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2917 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07002918 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002919 }
2920
Herbert Xuf697c3e2007-10-14 00:38:47 -07002921 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2922 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2923
David S. Miller83874002008-07-17 00:53:03 -07002924 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07002925 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07002926 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07002927 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2928 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07002929 spin_unlock(qdisc_lock(q));
2930 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07002931
Linus Torvalds1da177e2005-04-16 15:20:36 -07002932 return result;
2933}
Herbert Xuf697c3e2007-10-14 00:38:47 -07002934
2935static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2936 struct packet_type **pt_prev,
2937 int *ret, struct net_device *orig_dev)
2938{
Eric Dumazet24824a02010-10-02 06:11:55 +00002939 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2940
2941 if (!rxq || rxq->qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07002942 goto out;
2943
2944 if (*pt_prev) {
2945 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2946 *pt_prev = NULL;
Herbert Xuf697c3e2007-10-14 00:38:47 -07002947 }
2948
Eric Dumazet24824a02010-10-02 06:11:55 +00002949 switch (ing_filter(skb, rxq)) {
Herbert Xuf697c3e2007-10-14 00:38:47 -07002950 case TC_ACT_SHOT:
2951 case TC_ACT_STOLEN:
2952 kfree_skb(skb);
2953 return NULL;
2954 }
2955
2956out:
2957 skb->tc_verd = 0;
2958 return skb;
2959}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002960#endif
2961
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002962/**
2963 * netdev_rx_handler_register - register receive handler
2964 * @dev: device to register a handler for
2965 * @rx_handler: receive handler to register
Jiri Pirko93e2c322010-06-10 03:34:59 +00002966 * @rx_handler_data: data pointer that is used by rx handler
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002967 *
2968 * Register a receive handler for a device. This handler will then be
2969 * called from __netif_receive_skb. A negative errno code is returned
2970 * on a failure.
2971 *
2972 * The caller must hold the rtnl_mutex.
2973 */
2974int netdev_rx_handler_register(struct net_device *dev,
Jiri Pirko93e2c322010-06-10 03:34:59 +00002975 rx_handler_func_t *rx_handler,
2976 void *rx_handler_data)
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002977{
2978 ASSERT_RTNL();
2979
2980 if (dev->rx_handler)
2981 return -EBUSY;
2982
Jiri Pirko93e2c322010-06-10 03:34:59 +00002983 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002984 rcu_assign_pointer(dev->rx_handler, rx_handler);
2985
2986 return 0;
2987}
2988EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2989
2990/**
2991 * netdev_rx_handler_unregister - unregister receive handler
2992 * @dev: device to unregister a handler from
2993 *
2994 * Unregister a receive hander from a device.
2995 *
2996 * The caller must hold the rtnl_mutex.
2997 */
2998void netdev_rx_handler_unregister(struct net_device *dev)
2999{
3000
3001 ASSERT_RTNL();
3002 rcu_assign_pointer(dev->rx_handler, NULL);
Jiri Pirko93e2c322010-06-10 03:34:59 +00003003 rcu_assign_pointer(dev->rx_handler_data, NULL);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003004}
3005EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
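/*
 * Illustrative use of the two helpers above, assuming a module with
 * its own rx_handler_func_t my_handle_frame() and a per-port private
 * structure my_port (both hypothetical names):
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(dev, my_handle_frame, my_port);
 *	rtnl_unlock();
 *
 * While it runs under RCU, the handler can retrieve my_port again via
 * rcu_dereference(skb->dev->rx_handler_data).
 */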
3006
Eric Dumazetacbbc072010-04-11 06:56:11 +00003007static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
3008 struct net_device *master)
3009{
3010 if (skb->pkt_type == PACKET_HOST) {
3011 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
3012
3013 memcpy(dest, master->dev_addr, ETH_ALEN);
3014 }
3015}
3016
3017/* On bonding slaves other than the currently active slave, suppress
3018 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
3019 * ARP on active-backup slaves with arp_validate enabled.
3020 */
3021int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
3022{
3023 struct net_device *dev = skb->dev;
3024
3025 if (master->priv_flags & IFF_MASTER_ARPMON)
3026 dev->last_rx = jiffies;
3027
Jiri Pirkof350a0a82010-06-15 06:50:45 +00003028 if ((master->priv_flags & IFF_MASTER_ALB) &&
3029 (master->priv_flags & IFF_BRIDGE_PORT)) {
Eric Dumazetacbbc072010-04-11 06:56:11 +00003030 /* Do address unmangle. The local destination address
3031 * will be always the one master has. Provides the right
3032 * functionality in a bridge.
3033 */
3034 skb_bond_set_mac_by_master(skb, master);
3035 }
3036
3037 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
3038 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
3039 skb->protocol == __cpu_to_be16(ETH_P_ARP))
3040 return 0;
3041
3042 if (master->priv_flags & IFF_MASTER_ALB) {
3043 if (skb->pkt_type != PACKET_BROADCAST &&
3044 skb->pkt_type != PACKET_MULTICAST)
3045 return 0;
3046 }
3047 if (master->priv_flags & IFF_MASTER_8023AD &&
3048 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
3049 return 0;
3050
3051 return 1;
3052 }
3053 return 0;
3054}
3055EXPORT_SYMBOL(__skb_bond_should_drop);
3056
Eric Dumazet10f744d2010-03-28 23:07:20 -07003057static int __netif_receive_skb(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003058{
3059 struct packet_type *ptype, *pt_prev;
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003060 rx_handler_func_t *rx_handler;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003061 struct net_device *orig_dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07003062 struct net_device *master;
Joe Eykholt0d7a3682008-07-02 18:22:01 -07003063 struct net_device *null_or_orig;
John Fastabend2df4a0f2010-05-12 21:31:11 +00003064 struct net_device *orig_or_bond;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003065 int ret = NET_RX_DROP;
Al Viro252e3342006-11-14 20:48:11 -08003066 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003067
Eric Dumazet3b098e22010-05-15 23:57:10 -07003068 if (!netdev_tstamp_prequeue)
3069 net_timestamp_check(skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07003070
Koki Sanagicf66ba52010-08-23 18:45:02 +09003071 trace_netif_receive_skb(skb);
Patrick McHardy9b22ea52008-11-04 14:49:57 -08003072
Linus Torvalds1da177e2005-04-16 15:20:36 -07003073 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003074 if (netpoll_receive_skb(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003075 return NET_RX_DROP;
3076
Eric Dumazet8964be42009-11-20 15:35:04 -08003077 if (!skb->skb_iif)
3078 skb->skb_iif = skb->dev->ifindex;
David S. Miller86e65da2005-08-09 19:36:29 -07003079
John Fastabend597a2642010-06-03 09:30:11 +00003080 /*
3081 * bonding note: skbs received on inactive slaves should only
3082 * be delivered to pkt handlers that are exact matches. Also
3083 * the deliver_no_wcard flag will be set. If packet handlers
3084	 * are sensitive to duplicate packets, these skbs will need to
Jesse Gross3701e512010-10-20 13:56:06 +00003085 * be dropped at the handler.
John Fastabend597a2642010-06-03 09:30:11 +00003086 */
Joe Eykholt0d7a3682008-07-02 18:22:01 -07003087 null_or_orig = NULL;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07003088 orig_dev = skb->dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07003089 master = ACCESS_ONCE(orig_dev->master);
John Fastabend597a2642010-06-03 09:30:11 +00003090 if (skb->deliver_no_wcard)
3091 null_or_orig = orig_dev;
3092 else if (master) {
3093 if (skb_bond_should_drop(skb, master)) {
3094 skb->deliver_no_wcard = 1;
Joe Eykholt0d7a3682008-07-02 18:22:01 -07003095 null_or_orig = orig_dev; /* deliver only exact match */
John Fastabend597a2642010-06-03 09:30:11 +00003096 } else
Eric Dumazet0641e4f2010-03-18 21:16:45 -07003097 skb->dev = master;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07003098 }
Jay Vosburgh8f903c72006-02-21 16:36:44 -08003099
Eric Dumazet27f39c73e2010-05-19 22:07:23 +00003100 __this_cpu_inc(softnet_data.processed);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003101 skb_reset_network_header(skb);
Arnaldo Carvalho de Melobadff6d2007-03-13 13:06:52 -03003102 skb_reset_transport_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07003103 skb->mac_len = skb->network_header - skb->mac_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003104
3105 pt_prev = NULL;
3106
3107 rcu_read_lock();
3108
3109#ifdef CONFIG_NET_CLS_ACT
3110 if (skb->tc_verd & TC_NCLS) {
3111 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3112 goto ncls;
3113 }
3114#endif
3115
3116 list_for_each_entry_rcu(ptype, &ptype_all, list) {
Joe Eykholtf9823072008-07-02 18:22:02 -07003117 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3118 ptype->dev == orig_dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003119 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003120 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003121 pt_prev = ptype;
3122 }
3123 }
3124
3125#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07003126 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3127 if (!skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003128 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003129ncls:
3130#endif
3131
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003132 /* Handle special case of bridge or macvlan */
3133 rx_handler = rcu_dereference(skb->dev->rx_handler);
3134 if (rx_handler) {
3135 if (pt_prev) {
3136 ret = deliver_skb(skb, pt_prev, orig_dev);
3137 pt_prev = NULL;
3138 }
3139 skb = rx_handler(skb);
3140 if (!skb)
3141 goto out;
3142 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003143
Jesse Gross3701e512010-10-20 13:56:06 +00003144 if (vlan_tx_tag_present(skb)) {
3145 if (pt_prev) {
3146 ret = deliver_skb(skb, pt_prev, orig_dev);
3147 pt_prev = NULL;
3148 }
3149 if (vlan_hwaccel_do_receive(&skb)) {
3150 ret = __netif_receive_skb(skb);
3151 goto out;
3152 } else if (unlikely(!skb))
3153 goto out;
3154 }
3155
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003156 /*
3157 * Make sure frames received on VLAN interfaces stacked on
3158 * bonding interfaces still make their way to any base bonding
3159 * device that may have registered for a specific ptype. The
3160 * handler may have to adjust skb->dev and orig_dev.
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003161 */
John Fastabend2df4a0f2010-05-12 21:31:11 +00003162 orig_or_bond = orig_dev;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003163 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3164 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
John Fastabend2df4a0f2010-05-12 21:31:11 +00003165 orig_or_bond = vlan_dev_real_dev(skb->dev);
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003166 }
3167
Linus Torvalds1da177e2005-04-16 15:20:36 -07003168 type = skb->protocol;
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003169 list_for_each_entry_rcu(ptype,
3170 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003171 if (ptype->type == type && (ptype->dev == null_or_orig ||
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00003172 ptype->dev == skb->dev || ptype->dev == orig_dev ||
John Fastabend2df4a0f2010-05-12 21:31:11 +00003173 ptype->dev == orig_or_bond)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003174 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003175 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003176 pt_prev = ptype;
3177 }
3178 }
3179
3180 if (pt_prev) {
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003181 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003182 } else {
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003183 atomic_long_inc(&skb->dev->rx_dropped);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003184 kfree_skb(skb);
3185 /* Jamal, now you will not able to escape explaining
3186 * me how you were going to use this. :-)
3187 */
3188 ret = NET_RX_DROP;
3189 }
3190
3191out:
3192 rcu_read_unlock();
3193 return ret;
3194}
Tom Herbert0a9627f2010-03-16 08:03:29 +00003195
3196/**
3197 * netif_receive_skb - process receive buffer from network
3198 * @skb: buffer to process
3199 *
3200 * netif_receive_skb() is the main receive data processing function.
3201 * It always succeeds. The buffer may be dropped during processing
3202 * for congestion control or by the protocol layers.
3203 *
3204 * This function may only be called from softirq context and interrupts
3205 * should be enabled.
3206 *
3207 * Return values (usually ignored):
3208 * NET_RX_SUCCESS: no congestion
3209 * NET_RX_DROP: packet was dropped
3210 */
3211int netif_receive_skb(struct sk_buff *skb)
3212{
Eric Dumazet3b098e22010-05-15 23:57:10 -07003213 if (netdev_tstamp_prequeue)
3214 net_timestamp_check(skb);
3215
Richard Cochranc1f19b52010-07-17 08:49:36 +00003216 if (skb_defer_rx_timestamp(skb))
3217 return NET_RX_SUCCESS;
3218
Eric Dumazetdf334542010-03-24 19:13:54 +00003219#ifdef CONFIG_RPS
Eric Dumazet3b098e22010-05-15 23:57:10 -07003220 {
3221 struct rps_dev_flow voidflow, *rflow = &voidflow;
3222 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003223
Eric Dumazet3b098e22010-05-15 23:57:10 -07003224 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00003225
Eric Dumazet3b098e22010-05-15 23:57:10 -07003226 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07003227
Eric Dumazet3b098e22010-05-15 23:57:10 -07003228 if (cpu >= 0) {
3229 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3230 rcu_read_unlock();
3231 } else {
3232 rcu_read_unlock();
3233 ret = __netif_receive_skb(skb);
3234 }
3235
3236 return ret;
Tom Herbertfec5e652010-04-16 16:01:27 -07003237 }
Tom Herbert1e94d722010-03-18 17:45:44 -07003238#else
3239 return __netif_receive_skb(skb);
3240#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00003241}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003242EXPORT_SYMBOL(netif_receive_skb);
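/*
 * For illustration: a NAPI driver feeds frames to netif_receive_skb()
 * (or to napi_gro_receive() further below) from its ->poll() callback.
 * A rough sketch, with my_poll() and my_clean_rx() as hypothetical
 * driver names:
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = my_clean_rx(napi, budget);
 *
 *		if (work < budget)
 *			napi_complete(napi);
 *		return work;
 *	}
 */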
Linus Torvalds1da177e2005-04-16 15:20:36 -07003243
Eric Dumazet88751272010-04-19 05:07:33 +00003244/* Network device is going away, flush any packets still pending
3245 * Called with irqs disabled.
3246 */
Changli Gao152102c2010-03-30 20:16:22 +00003247static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003248{
Changli Gao152102c2010-03-30 20:16:22 +00003249 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003250 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003251 struct sk_buff *skb, *tmp;
3252
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003253 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003254 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003255 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003256 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003257 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003258 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003259 }
Changli Gao6e7676c2010-04-27 15:07:33 -07003260 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003261 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003262
3263 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3264 if (skb->dev == dev) {
3265 __skb_unlink(skb, &sd->process_queue);
3266 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003267 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003268 }
3269 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003270}
3271
Herbert Xud565b0a2008-12-15 23:38:52 -08003272static int napi_gro_complete(struct sk_buff *skb)
3273{
3274 struct packet_type *ptype;
3275 __be16 type = skb->protocol;
3276 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3277 int err = -ENOENT;
3278
Herbert Xufc59f9a2009-04-14 15:11:06 -07003279 if (NAPI_GRO_CB(skb)->count == 1) {
3280 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003281 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07003282 }
Herbert Xud565b0a2008-12-15 23:38:52 -08003283
3284 rcu_read_lock();
3285 list_for_each_entry_rcu(ptype, head, list) {
3286 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3287 continue;
3288
3289 err = ptype->gro_complete(skb);
3290 break;
3291 }
3292 rcu_read_unlock();
3293
3294 if (err) {
3295 WARN_ON(&ptype->list == head);
3296 kfree_skb(skb);
3297 return NET_RX_SUCCESS;
3298 }
3299
3300out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003301 return netif_receive_skb(skb);
3302}
3303
Eric Dumazet86cac582010-08-31 18:25:32 +00003304inline void napi_gro_flush(struct napi_struct *napi)
Herbert Xud565b0a2008-12-15 23:38:52 -08003305{
3306 struct sk_buff *skb, *next;
3307
3308 for (skb = napi->gro_list; skb; skb = next) {
3309 next = skb->next;
3310 skb->next = NULL;
3311 napi_gro_complete(skb);
3312 }
3313
Herbert Xu4ae55442009-02-08 18:00:36 +00003314 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003315 napi->gro_list = NULL;
3316}
Eric Dumazet86cac582010-08-31 18:25:32 +00003317EXPORT_SYMBOL(napi_gro_flush);
Herbert Xud565b0a2008-12-15 23:38:52 -08003318
Ben Hutchings5b252f02009-10-29 07:17:09 +00003319enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003320{
3321 struct sk_buff **pp = NULL;
3322 struct packet_type *ptype;
3323 __be16 type = skb->protocol;
3324 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
Herbert Xu0da2afd52008-12-26 14:57:42 -08003325 int same_flow;
Herbert Xud565b0a2008-12-15 23:38:52 -08003326 int mac_len;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003327 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003328
Jarek Poplawskice9e76c2010-08-05 01:19:11 +00003329 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
Herbert Xud565b0a2008-12-15 23:38:52 -08003330 goto normal;
3331
David S. Miller21dc3302010-08-23 00:13:46 -07003332 if (skb_is_gso(skb) || skb_has_frag_list(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003333 goto normal;
3334
Herbert Xud565b0a2008-12-15 23:38:52 -08003335 rcu_read_lock();
3336 list_for_each_entry_rcu(ptype, head, list) {
Herbert Xud565b0a2008-12-15 23:38:52 -08003337 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3338 continue;
3339
Herbert Xu86911732009-01-29 14:19:50 +00003340 skb_set_network_header(skb, skb_gro_offset(skb));
Herbert Xud565b0a2008-12-15 23:38:52 -08003341 mac_len = skb->network_header - skb->mac_header;
3342 skb->mac_len = mac_len;
3343 NAPI_GRO_CB(skb)->same_flow = 0;
3344 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003345 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003346
Herbert Xud565b0a2008-12-15 23:38:52 -08003347 pp = ptype->gro_receive(&napi->gro_list, skb);
3348 break;
3349 }
3350 rcu_read_unlock();
3351
3352 if (&ptype->list == head)
3353 goto normal;
3354
Herbert Xu0da2afd52008-12-26 14:57:42 -08003355 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003356 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003357
Herbert Xud565b0a2008-12-15 23:38:52 -08003358 if (pp) {
3359 struct sk_buff *nskb = *pp;
3360
3361 *pp = nskb->next;
3362 nskb->next = NULL;
3363 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003364 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003365 }
3366
Herbert Xu0da2afd52008-12-26 14:57:42 -08003367 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003368 goto ok;
3369
Herbert Xu4ae55442009-02-08 18:00:36 +00003370 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003371 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003372
Herbert Xu4ae55442009-02-08 18:00:36 +00003373 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003374 NAPI_GRO_CB(skb)->count = 1;
Herbert Xu86911732009-01-29 14:19:50 +00003375 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003376 skb->next = napi->gro_list;
3377 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003378 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003379
Herbert Xuad0f9902009-02-01 01:24:55 -08003380pull:
Herbert Xucb189782009-05-26 18:50:31 +00003381 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3382 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3383
3384 BUG_ON(skb->end - skb->tail < grow);
3385
3386 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3387
3388 skb->tail += grow;
3389 skb->data_len -= grow;
3390
3391 skb_shinfo(skb)->frags[0].page_offset += grow;
3392 skb_shinfo(skb)->frags[0].size -= grow;
3393
3394 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3395 put_page(skb_shinfo(skb)->frags[0].page);
3396 memmove(skb_shinfo(skb)->frags,
3397 skb_shinfo(skb)->frags + 1,
Jarek Poplawskie5093ae2010-08-11 02:02:10 +00003398 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
Herbert Xucb189782009-05-26 18:50:31 +00003399 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003400 }
3401
Herbert Xud565b0a2008-12-15 23:38:52 -08003402ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003403 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003404
3405normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003406 ret = GRO_NORMAL;
3407 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003408}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003409EXPORT_SYMBOL(dev_gro_receive);
3410
Eric Dumazet40d08022010-08-26 22:03:08 -07003411static inline gro_result_t
Ben Hutchings5b252f02009-10-29 07:17:09 +00003412__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003413{
3414 struct sk_buff *p;
3415
3416 for (p = napi->gro_list; p; p = p->next) {
Eric Dumazet40d08022010-08-26 22:03:08 -07003417 unsigned long diffs;
3418
3419 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
Jesse Gross3701e512010-10-20 13:56:06 +00003420 diffs |= p->vlan_tci ^ skb->vlan_tci;
Eric Dumazet40d08022010-08-26 22:03:08 -07003421 diffs |= compare_ether_header(skb_mac_header(p),
Joe Perchesf64f9e72009-11-29 16:55:45 -08003422 skb_gro_mac_header(skb));
Eric Dumazet40d08022010-08-26 22:03:08 -07003423 NAPI_GRO_CB(p)->same_flow = !diffs;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003424 NAPI_GRO_CB(p)->flush = 0;
3425 }
3426
3427 return dev_gro_receive(napi, skb);
3428}
Herbert Xu5d38a072009-01-04 16:13:40 -08003429
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003430gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003431{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003432 switch (ret) {
3433 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003434 if (netif_receive_skb(skb))
3435 ret = GRO_DROP;
3436 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003437
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003438 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003439 case GRO_MERGED_FREE:
Herbert Xu5d38a072009-01-04 16:13:40 -08003440 kfree_skb(skb);
3441 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003442
3443 case GRO_HELD:
3444 case GRO_MERGED:
3445 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003446 }
3447
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003448 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003449}
3450EXPORT_SYMBOL(napi_skb_finish);
3451
Herbert Xu78a478d2009-05-26 18:50:21 +00003452void skb_gro_reset_offset(struct sk_buff *skb)
3453{
3454 NAPI_GRO_CB(skb)->data_offset = 0;
3455 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003456 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003457
Herbert Xu78d3fd02009-05-26 18:50:23 +00003458 if (skb->mac_header == skb->tail &&
Herbert Xu74895942009-05-26 18:50:27 +00003459 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
Herbert Xu78a478d2009-05-26 18:50:21 +00003460 NAPI_GRO_CB(skb)->frag0 =
3461 page_address(skb_shinfo(skb)->frags[0].page) +
3462 skb_shinfo(skb)->frags[0].page_offset;
Herbert Xu74895942009-05-26 18:50:27 +00003463 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3464 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003465}
3466EXPORT_SYMBOL(skb_gro_reset_offset);
3467
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003468gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003469{
Herbert Xu86911732009-01-29 14:19:50 +00003470 skb_gro_reset_offset(skb);
3471
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003472 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003473}
3474EXPORT_SYMBOL(napi_gro_receive);
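/*
 * For illustration: a GRO-aware NAPI driver simply substitutes
 * napi_gro_receive() for netif_receive_skb() in its ->poll() path,
 * e.g. (sketch only, adapter->napi being an assumed driver field):
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	napi_gro_receive(&adapter->napi, skb);
 */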
3475
stephen hemmingerd0c2b0d2010-10-19 07:12:10 +00003476static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003477{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003478 __skb_pull(skb, skb_headlen(skb));
3479 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
Jesse Gross3701e512010-10-20 13:56:06 +00003480 skb->vlan_tci = 0;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003481
3482 napi->skb = skb;
3483}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003484
Herbert Xu76620aa2009-04-16 02:02:07 -07003485struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003486{
Herbert Xu5d38a072009-01-04 16:13:40 -08003487 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003488
3489 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003490 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3491 if (skb)
3492 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003493 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003494 return skb;
3495}
Herbert Xu76620aa2009-04-16 02:02:07 -07003496EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003497
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003498gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3499 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003500{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003501 switch (ret) {
3502 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003503 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003504 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003505
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003506 if (ret == GRO_HELD)
3507 skb_gro_pull(skb, -ETH_HLEN);
3508 else if (netif_receive_skb(skb))
3509 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003510 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003511
3512 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003513 case GRO_MERGED_FREE:
3514 napi_reuse_skb(napi, skb);
3515 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003516
3517 case GRO_MERGED:
3518 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003519 }
3520
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003521 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003522}
3523EXPORT_SYMBOL(napi_frags_finish);
3524
Herbert Xu76620aa2009-04-16 02:02:07 -07003525struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003526{
Herbert Xu76620aa2009-04-16 02:02:07 -07003527 struct sk_buff *skb = napi->skb;
3528 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00003529 unsigned int hlen;
3530 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07003531
3532 napi->skb = NULL;
3533
3534 skb_reset_mac_header(skb);
3535 skb_gro_reset_offset(skb);
3536
Herbert Xua5b1cf22009-05-26 18:50:28 +00003537 off = skb_gro_offset(skb);
3538 hlen = off + sizeof(*eth);
3539 eth = skb_gro_header_fast(skb, off);
3540 if (skb_gro_header_hard(skb, hlen)) {
3541 eth = skb_gro_header_slow(skb, hlen, off);
3542 if (unlikely(!eth)) {
3543 napi_reuse_skb(napi, skb);
3544 skb = NULL;
3545 goto out;
3546 }
Herbert Xu76620aa2009-04-16 02:02:07 -07003547 }
3548
3549 skb_gro_pull(skb, sizeof(*eth));
3550
3551 /*
3552 * This works because the only protocols we care about don't require
3553 * special handling. We'll fix it up properly at the end.
3554 */
3555 skb->protocol = eth->h_proto;
3556
3557out:
3558 return skb;
3559}
3560EXPORT_SYMBOL(napi_frags_skb);
3561
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003562gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07003563{
3564 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003565
3566 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003567 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003568
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003569 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08003570}
3571EXPORT_SYMBOL(napi_gro_frags);
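/*
 * For illustration: drivers that DMA packet data straight into pages
 * use the napi_get_frags()/napi_gro_frags() pair instead of building
 * a full skb themselves. A rough sketch of the per-packet pattern,
 * with the frag-filling step left abstract:
 *
 *	skb = napi_get_frags(napi);
 *	if (!skb)
 *		goto drop;
 *	... attach the received pages to skb_shinfo(skb)->frags[] and
 *	... adjust skb->len, skb->data_len and skb->truesize ...
 *	napi_gro_frags(napi);
 *
 * napi_frags_skb() above then recovers the Ethernet header so GRO can
 * classify the packet.
 */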
3572
Eric Dumazete326bed2010-04-22 00:22:45 -07003573/*
3574 * net_rps_action sends any pending IPI's for rps.
3575 * Note: called with local irq disabled, but exits with local irq enabled.
3576 */
3577static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3578{
3579#ifdef CONFIG_RPS
3580 struct softnet_data *remsd = sd->rps_ipi_list;
3581
3582 if (remsd) {
3583 sd->rps_ipi_list = NULL;
3584
3585 local_irq_enable();
3586
3587 /* Send pending IPI's to kick RPS processing on remote cpus. */
3588 while (remsd) {
3589 struct softnet_data *next = remsd->rps_ipi_next;
3590
3591 if (cpu_online(remsd->cpu))
3592 __smp_call_function_single(remsd->cpu,
3593 &remsd->csd, 0);
3594 remsd = next;
3595 }
3596 } else
3597#endif
3598 local_irq_enable();
3599}
3600
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003601static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003602{
3603 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003604 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003605
Eric Dumazete326bed2010-04-22 00:22:45 -07003606#ifdef CONFIG_RPS
3607	/* Check if we have pending IPIs; it's better to send them now
3608	 * rather than waiting for net_rx_action() to end.
3609 */
3610 if (sd->rps_ipi_list) {
3611 local_irq_disable();
3612 net_rps_action_and_irq_enable(sd);
3613 }
3614#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003615 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07003616 local_irq_disable();
3617 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003618 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07003619 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003620
Changli Gao6e7676c2010-04-27 15:07:33 -07003621 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07003622 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07003623 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07003624 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00003625 input_queue_head_incr(sd);
3626 if (++work >= quota) {
3627 local_irq_enable();
3628 return work;
3629 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003630 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003631
Changli Gao6e7676c2010-04-27 15:07:33 -07003632 rps_lock(sd);
3633 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003634 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07003635 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3636 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003637
Changli Gao6e7676c2010-04-27 15:07:33 -07003638 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003639 /*
3640 * Inline a custom version of __napi_complete().
3641			 * Only the current cpu owns and manipulates this napi,
3642			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3643			 * We can use a plain write instead of clear_bit(),
3644			 * and we don't need an smp_mb() memory barrier.
3645 */
3646 list_del(&napi->poll_list);
3647 napi->state = 0;
3648
Changli Gao6e7676c2010-04-27 15:07:33 -07003649 quota = work + qlen;
3650 }
3651 rps_unlock(sd);
3652 }
3653 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003654
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003655 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003656}
3657
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003658/**
3659 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07003660 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003661 *
3662 * The entry's receive function will be scheduled to run
3663 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08003664void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003665{
3666 unsigned long flags;
3667
3668 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003669 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003670 local_irq_restore(flags);
3671}
3672EXPORT_SYMBOL(__napi_schedule);
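/*
 * For illustration: drivers rarely call __napi_schedule() directly;
 * the device interrupt handler normally goes through the
 * napi_schedule_prep()/__napi_schedule() pair (or the napi_schedule()
 * wrapper), roughly as below, where adapter->napi is an assumed
 * driver-private field:
 *
 *	if (napi_schedule_prep(&adapter->napi)) {
 *		... mask further RX interrupts on the device ...
 *		__napi_schedule(&adapter->napi);
 *	}
 */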
3673
Herbert Xud565b0a2008-12-15 23:38:52 -08003674void __napi_complete(struct napi_struct *n)
3675{
3676 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3677 BUG_ON(n->gro_list);
3678
3679 list_del(&n->poll_list);
3680 smp_mb__before_clear_bit();
3681 clear_bit(NAPI_STATE_SCHED, &n->state);
3682}
3683EXPORT_SYMBOL(__napi_complete);
3684
3685void napi_complete(struct napi_struct *n)
3686{
3687 unsigned long flags;
3688
3689 /*
3690 * don't let napi dequeue from the cpu poll list
3691 * just in case its running on a different cpu
3692 */
3693 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3694 return;
3695
3696 napi_gro_flush(n);
3697 local_irq_save(flags);
3698 __napi_complete(n);
3699 local_irq_restore(flags);
3700}
3701EXPORT_SYMBOL(napi_complete);
3702
3703void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3704 int (*poll)(struct napi_struct *, int), int weight)
3705{
3706 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00003707 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003708 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08003709 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003710 napi->poll = poll;
3711 napi->weight = weight;
3712 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08003713 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08003714#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08003715 spin_lock_init(&napi->poll_lock);
3716 napi->poll_owner = -1;
3717#endif
3718 set_bit(NAPI_STATE_SCHED, &napi->state);
3719}
3720EXPORT_SYMBOL(netif_napi_add);
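/*
 * For illustration, with my_poll() and adapter as assumed driver
 * names: a driver typically sets its NAPI context up at probe time
 * and tears it down on removal,
 *
 *	netif_napi_add(netdev, &adapter->napi, my_poll, 64);
 *	...
 *	netif_napi_del(&adapter->napi);
 *
 * The weight (64 here) is the budget later passed to ->poll(), i.e.
 * how many packets one poll round may consume before yielding back to
 * net_rx_action().
 */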
3721
3722void netif_napi_del(struct napi_struct *napi)
3723{
3724 struct sk_buff *skb, *next;
3725
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08003726 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07003727 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08003728
3729 for (skb = napi->gro_list; skb; skb = next) {
3730 next = skb->next;
3731 skb->next = NULL;
3732 kfree_skb(skb);
3733 }
3734
3735 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00003736 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003737}
3738EXPORT_SYMBOL(netif_napi_del);
3739
Linus Torvalds1da177e2005-04-16 15:20:36 -07003740static void net_rx_action(struct softirq_action *h)
3741{
Eric Dumazete326bed2010-04-22 00:22:45 -07003742 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003743 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07003744 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07003745 void *have;
3746
Linus Torvalds1da177e2005-04-16 15:20:36 -07003747 local_irq_disable();
3748
Eric Dumazete326bed2010-04-22 00:22:45 -07003749 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003750 struct napi_struct *n;
3751 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003752
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003753		/* If the softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003754		 * Allow this to run for 2 jiffies, which will allow
3755 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003756 */
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003757 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003758 goto softnet_break;
3759
3760 local_irq_enable();
3761
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003762 /* Even though interrupts have been re-enabled, this
3763 * access is safe because interrupts can only add new
3764 * entries to the tail of this list, and only ->poll()
3765 * calls can remove this head entry from the list.
3766 */
Eric Dumazete326bed2010-04-22 00:22:45 -07003767 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003768
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003769 have = netpoll_poll_lock(n);
3770
3771 weight = n->weight;
3772
David S. Miller0a7606c2007-10-29 21:28:47 -07003773 /* This NAPI_STATE_SCHED test is for avoiding a race
3774 * with netpoll's poll_napi(). Only the entity which
3775 * obtains the lock and sees NAPI_STATE_SCHED set will
3776 * actually make the ->poll() call. Therefore we avoid
3777 * accidently calling ->poll() when NAPI is not scheduled.
3778 */
3779 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00003780 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07003781 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00003782 trace_napi_poll(n);
3783 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003784
3785 WARN_ON_ONCE(work > weight);
3786
3787 budget -= work;
3788
3789 local_irq_disable();
3790
3791 /* Drivers must not modify the NAPI state if they
3792 * consume the entire weight. In such cases this code
3793 * still "owns" the NAPI instance and therefore can
3794 * move the instance around on the list at-will.
3795 */
David S. Millerfed17f32008-01-07 21:00:40 -08003796 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07003797 if (unlikely(napi_disable_pending(n))) {
3798 local_irq_enable();
3799 napi_complete(n);
3800 local_irq_disable();
3801 } else
Eric Dumazete326bed2010-04-22 00:22:45 -07003802 list_move_tail(&n->poll_list, &sd->poll_list);
David S. Millerfed17f32008-01-07 21:00:40 -08003803 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003804
3805 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003806 }
3807out:
Eric Dumazete326bed2010-04-22 00:22:45 -07003808 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003809
Chris Leechdb217332006-06-17 21:24:58 -07003810#ifdef CONFIG_NET_DMA
3811 /*
3812 * There may not be any more sk_buffs coming right now, so push
3813 * any pending DMA copies to hardware
3814 */
Dan Williams2ba05622009-01-06 11:38:14 -07003815 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07003816#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003817
Linus Torvalds1da177e2005-04-16 15:20:36 -07003818 return;
3819
3820softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00003821 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003822 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3823 goto out;
3824}
3825
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003826static gifconf_func_t *gifconf_list[NPROTO];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003827
3828/**
3829 * register_gifconf - register a SIOCGIF handler
3830 * @family: Address family
3831 * @gifconf: Function handler
3832 *
3833 * Register protocol dependent address dumping routines. The handler
3834 * that is passed must not be freed or reused until it has been replaced
3835 * by another handler.
3836 */
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003837int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003838{
3839 if (family >= NPROTO)
3840 return -EINVAL;
3841 gifconf_list[family] = gifconf;
3842 return 0;
3843}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003844EXPORT_SYMBOL(register_gifconf);
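/*
 * For illustration: an address family registers its handler once at
 * init time; IPv4, for instance, does so roughly as
 *
 *	register_gifconf(PF_INET, inet_gifconf);
 *
 * so that the SIOCGIFCONF code below can ask each family to describe
 * every device.
 */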
Linus Torvalds1da177e2005-04-16 15:20:36 -07003845
3846
3847/*
3848 * Map an interface index to its name (SIOCGIFNAME)
3849 */
3850
3851/*
3852 * We need this ioctl for efficient implementation of the
3853 * if_indextoname() function required by the IPv6 API. Without
3854 * it, we would have to search all the interfaces to find a
3855 * match. --pb
3856 */
3857
Eric W. Biederman881d9662007-09-17 11:56:21 -07003858static int dev_ifname(struct net *net, struct ifreq __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003859{
3860 struct net_device *dev;
3861 struct ifreq ifr;
3862
3863 /*
3864 * Fetch the caller's info block.
3865 */
3866
3867 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3868 return -EFAULT;
3869
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003870 rcu_read_lock();
3871 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003872 if (!dev) {
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003873 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003874 return -ENODEV;
3875 }
3876
3877 strcpy(ifr.ifr_name, dev->name);
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003878 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003879
3880 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3881 return -EFAULT;
3882 return 0;
3883}
3884
3885/*
3886 * Perform a SIOCGIFCONF call. This structure will change
3887 * size eventually, and there is nothing I can do about it.
3888 * Thus we will need a 'compatibility mode'.
3889 */
3890
Eric W. Biederman881d9662007-09-17 11:56:21 -07003891static int dev_ifconf(struct net *net, char __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003892{
3893 struct ifconf ifc;
3894 struct net_device *dev;
3895 char __user *pos;
3896 int len;
3897 int total;
3898 int i;
3899
3900 /*
3901 * Fetch the caller's info block.
3902 */
3903
3904 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3905 return -EFAULT;
3906
3907 pos = ifc.ifc_buf;
3908 len = ifc.ifc_len;
3909
3910 /*
3911 * Loop over the interfaces, and write an info block for each.
3912 */
3913
3914 total = 0;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003915 for_each_netdev(net, dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003916 for (i = 0; i < NPROTO; i++) {
3917 if (gifconf_list[i]) {
3918 int done;
3919 if (!pos)
3920 done = gifconf_list[i](dev, NULL, 0);
3921 else
3922 done = gifconf_list[i](dev, pos + total,
3923 len - total);
3924 if (done < 0)
3925 return -EFAULT;
3926 total += done;
3927 }
3928 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003929 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003930
3931 /*
3932 * All done. Write the updated control block back to the caller.
3933 */
3934 ifc.ifc_len = total;
3935
3936 /*
3937 * Both BSD and Solaris return 0 here, so we do too.
3938 */
3939 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3940}
3941
3942#ifdef CONFIG_PROC_FS
3943/*
3944 * This is invoked by the /proc filesystem handler to display a device
3945 * in detail.
3946 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003947void *dev_seq_start(struct seq_file *seq, loff_t *pos)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003948 __acquires(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003949{
Denis V. Luneve372c412007-11-19 22:31:54 -08003950 struct net *net = seq_file_net(seq);
Pavel Emelianov7562f872007-05-03 15:13:45 -07003951 loff_t off;
3952 struct net_device *dev;
3953
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003954 rcu_read_lock();
Pavel Emelianov7562f872007-05-03 15:13:45 -07003955 if (!*pos)
3956 return SEQ_START_TOKEN;
3957
3958 off = 1;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003959 for_each_netdev_rcu(net, dev)
Pavel Emelianov7562f872007-05-03 15:13:45 -07003960 if (off++ == *pos)
3961 return dev;
3962
3963 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003964}
3965
3966void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3967{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003968 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3969 first_net_device(seq_file_net(seq)) :
3970 next_net_device((struct net_device *)v);
3971
Linus Torvalds1da177e2005-04-16 15:20:36 -07003972 ++*pos;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003973 return rcu_dereference(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003974}
3975
3976void dev_seq_stop(struct seq_file *seq, void *v)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003977 __releases(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003978{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003979 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003980}
3981
3982static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3983{
Eric Dumazet28172732010-07-07 14:58:56 -07003984 struct rtnl_link_stats64 temp;
3985 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003986
Ben Hutchingsbe1f3c22010-06-08 07:19:54 +00003987 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3988 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
Rusty Russell5a1b5892007-04-28 21:04:03 -07003989 dev->name, stats->rx_bytes, stats->rx_packets,
3990 stats->rx_errors,
3991 stats->rx_dropped + stats->rx_missed_errors,
3992 stats->rx_fifo_errors,
3993 stats->rx_length_errors + stats->rx_over_errors +
3994 stats->rx_crc_errors + stats->rx_frame_errors,
3995 stats->rx_compressed, stats->multicast,
3996 stats->tx_bytes, stats->tx_packets,
3997 stats->tx_errors, stats->tx_dropped,
3998 stats->tx_fifo_errors, stats->collisions,
3999 stats->tx_carrier_errors +
4000 stats->tx_aborted_errors +
4001 stats->tx_window_errors +
4002 stats->tx_heartbeat_errors,
4003 stats->tx_compressed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004004}
4005
4006/*
4007 * Called from the PROCfs module. This now uses the new arbitrary sized
4008 * /proc/net interface to create /proc/net/dev
4009 */
4010static int dev_seq_show(struct seq_file *seq, void *v)
4011{
4012 if (v == SEQ_START_TOKEN)
4013 seq_puts(seq, "Inter-| Receive "
4014 " | Transmit\n"
4015 " face |bytes packets errs drop fifo frame "
4016 "compressed multicast|bytes packets errs "
4017 "drop fifo colls carrier compressed\n");
4018 else
4019 dev_seq_printf_stats(seq, v);
4020 return 0;
4021}
4022
Changli Gaodee42872010-05-02 05:42:16 +00004023static struct softnet_data *softnet_get_online(loff_t *pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004024{
Changli Gaodee42872010-05-02 05:42:16 +00004025 struct softnet_data *sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004026
Mike Travis0c0b0ac2008-05-02 16:43:08 -07004027 while (*pos < nr_cpu_ids)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004028 if (cpu_online(*pos)) {
Changli Gaodee42872010-05-02 05:42:16 +00004029 sd = &per_cpu(softnet_data, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004030 break;
4031 } else
4032 ++*pos;
Changli Gaodee42872010-05-02 05:42:16 +00004033 return sd;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004034}
4035
4036static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4037{
4038 return softnet_get_online(pos);
4039}
4040
4041static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4042{
4043 ++*pos;
4044 return softnet_get_online(pos);
4045}
4046
4047static void softnet_seq_stop(struct seq_file *seq, void *v)
4048{
4049}
4050
4051static int softnet_seq_show(struct seq_file *seq, void *v)
4052{
Changli Gaodee42872010-05-02 05:42:16 +00004053 struct softnet_data *sd = v;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004054
Tom Herbert0a9627f2010-03-16 08:03:29 +00004055 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
Changli Gaodee42872010-05-02 05:42:16 +00004056 sd->processed, sd->dropped, sd->time_squeeze, 0,
Stephen Hemmingerc1ebcdb2005-06-23 20:08:59 -07004057 0, 0, 0, 0, /* was fastroute */
Changli Gaodee42872010-05-02 05:42:16 +00004058 sd->cpu_collision, sd->received_rps);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004059 return 0;
4060}
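/*
 * For reference, each /proc/net/softnet_stat line emitted above is ten hex
 * words for one online CPU, e.g. (made-up sample values):
 *
 *	0007c1e3 00000000 00000012 00000000 00000000 00000000 00000000 00000000 00000002 00000003
 *
 * In order these are sd->processed, sd->dropped, sd->time_squeeze, five
 * always-zero placeholder words (one unused slot plus the four former
 * fastroute counters), sd->cpu_collision and sd->received_rps.
 */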
4061
Stephen Hemmingerf6908082007-03-12 14:34:29 -07004062static const struct seq_operations dev_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004063 .start = dev_seq_start,
4064 .next = dev_seq_next,
4065 .stop = dev_seq_stop,
4066 .show = dev_seq_show,
4067};
4068
4069static int dev_seq_open(struct inode *inode, struct file *file)
4070{
Denis V. Luneve372c412007-11-19 22:31:54 -08004071 return seq_open_net(inode, file, &dev_seq_ops,
4072 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07004073}
4074
Arjan van de Ven9a321442007-02-12 00:55:35 -08004075static const struct file_operations dev_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004076 .owner = THIS_MODULE,
4077 .open = dev_seq_open,
4078 .read = seq_read,
4079 .llseek = seq_lseek,
Denis V. Luneve372c412007-11-19 22:31:54 -08004080 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07004081};
4082
Stephen Hemmingerf6908082007-03-12 14:34:29 -07004083static const struct seq_operations softnet_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004084 .start = softnet_seq_start,
4085 .next = softnet_seq_next,
4086 .stop = softnet_seq_stop,
4087 .show = softnet_seq_show,
4088};
4089
4090static int softnet_seq_open(struct inode *inode, struct file *file)
4091{
4092 return seq_open(file, &softnet_seq_ops);
4093}
4094
Arjan van de Ven9a321442007-02-12 00:55:35 -08004095static const struct file_operations softnet_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004096 .owner = THIS_MODULE,
4097 .open = softnet_seq_open,
4098 .read = seq_read,
4099 .llseek = seq_lseek,
4100 .release = seq_release,
4101};
4102
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004103static void *ptype_get_idx(loff_t pos)
4104{
4105 struct packet_type *pt = NULL;
4106 loff_t i = 0;
4107 int t;
4108
4109 list_for_each_entry_rcu(pt, &ptype_all, list) {
4110 if (i == pos)
4111 return pt;
4112 ++i;
4113 }
4114
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08004115 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004116 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4117 if (i == pos)
4118 return pt;
4119 ++i;
4120 }
4121 }
4122 return NULL;
4123}
4124
4125static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
Stephen Hemminger72348a42008-01-21 02:27:29 -08004126 __acquires(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004127{
4128 rcu_read_lock();
4129 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4130}
4131
4132static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4133{
4134 struct packet_type *pt;
4135 struct list_head *nxt;
4136 int hash;
4137
4138 ++*pos;
4139 if (v == SEQ_START_TOKEN)
4140 return ptype_get_idx(0);
4141
4142 pt = v;
4143 nxt = pt->list.next;
4144 if (pt->type == htons(ETH_P_ALL)) {
4145 if (nxt != &ptype_all)
4146 goto found;
4147 hash = 0;
4148 nxt = ptype_base[0].next;
4149 } else
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08004150 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004151
4152 while (nxt == &ptype_base[hash]) {
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08004153 if (++hash >= PTYPE_HASH_SIZE)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004154 return NULL;
4155 nxt = ptype_base[hash].next;
4156 }
4157found:
4158 return list_entry(nxt, struct packet_type, list);
4159}
4160
4161static void ptype_seq_stop(struct seq_file *seq, void *v)
Stephen Hemminger72348a42008-01-21 02:27:29 -08004162 __releases(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004163{
4164 rcu_read_unlock();
4165}
4166
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004167static int ptype_seq_show(struct seq_file *seq, void *v)
4168{
4169 struct packet_type *pt = v;
4170
4171 if (v == SEQ_START_TOKEN)
4172 seq_puts(seq, "Type Device Function\n");
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09004173 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004174 if (pt->type == htons(ETH_P_ALL))
4175 seq_puts(seq, "ALL ");
4176 else
4177 seq_printf(seq, "%04x", ntohs(pt->type));
4178
Alexey Dobriyan908cd2d2008-11-16 19:50:35 -08004179 seq_printf(seq, " %-8s %pF\n",
4180 pt->dev ? pt->dev->name : "", pt->func);
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004181 }
4182
4183 return 0;
4184}
4185
4186static const struct seq_operations ptype_seq_ops = {
4187 .start = ptype_seq_start,
4188 .next = ptype_seq_next,
4189 .stop = ptype_seq_stop,
4190 .show = ptype_seq_show,
4191};
4192
4193static int ptype_seq_open(struct inode *inode, struct file *file)
4194{
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07004195 return seq_open_net(inode, file, &ptype_seq_ops,
4196 sizeof(struct seq_net_private));
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004197}
4198
4199static const struct file_operations ptype_seq_fops = {
4200 .owner = THIS_MODULE,
4201 .open = ptype_seq_open,
4202 .read = seq_read,
4203 .llseek = seq_lseek,
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07004204 .release = seq_release_net,
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004205};
4206
4207
Pavel Emelyanov46650792007-10-08 20:38:39 -07004208static int __net_init dev_proc_net_init(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004209{
4210 int rc = -ENOMEM;
4211
Eric W. Biederman881d9662007-09-17 11:56:21 -07004212 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004213 goto out;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004214 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004215 goto out_dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004216 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02004217 goto out_softnet;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004218
Eric W. Biederman881d9662007-09-17 11:56:21 -07004219 if (wext_proc_init(net))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02004220 goto out_ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004221 rc = 0;
4222out:
4223 return rc;
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02004224out_ptype:
Eric W. Biederman881d9662007-09-17 11:56:21 -07004225 proc_net_remove(net, "ptype");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004226out_softnet:
Eric W. Biederman881d9662007-09-17 11:56:21 -07004227 proc_net_remove(net, "softnet_stat");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004228out_dev:
Eric W. Biederman881d9662007-09-17 11:56:21 -07004229 proc_net_remove(net, "dev");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004230 goto out;
4231}
Eric W. Biederman881d9662007-09-17 11:56:21 -07004232
Pavel Emelyanov46650792007-10-08 20:38:39 -07004233static void __net_exit dev_proc_net_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07004234{
4235 wext_proc_exit(net);
4236
4237 proc_net_remove(net, "ptype");
4238 proc_net_remove(net, "softnet_stat");
4239 proc_net_remove(net, "dev");
4240}
4241
Denis V. Lunev022cbae2007-11-13 03:23:50 -08004242static struct pernet_operations __net_initdata dev_proc_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07004243 .init = dev_proc_net_init,
4244 .exit = dev_proc_net_exit,
4245};
4246
4247static int __init dev_proc_init(void)
4248{
4249 return register_pernet_subsys(&dev_proc_ops);
4250}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004251#else
4252#define dev_proc_init() 0
4253#endif /* CONFIG_PROC_FS */
4254
4255
4256/**
4257 * netdev_set_master - set up master/slave pair
4258 * @slave: slave device
4259 * @master: new master device
4260 *
4261 * Changes the master device of the slave. Pass %NULL to break the
4262 * bonding. The caller must hold the RTNL semaphore. On a failure
4263 * a negative errno code is returned. On success the reference counts
4264 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4265 * function returns zero.
4266 */
4267int netdev_set_master(struct net_device *slave, struct net_device *master)
4268{
4269 struct net_device *old = slave->master;
4270
4271 ASSERT_RTNL();
4272
4273 if (master) {
4274 if (old)
4275 return -EBUSY;
4276 dev_hold(master);
4277 }
4278
4279 slave->master = master;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004280
Eric Dumazet283f2fe2010-03-18 13:37:40 +00004281 if (old) {
4282 synchronize_net();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004283 dev_put(old);
Eric Dumazet283f2fe2010-03-18 13:37:40 +00004284 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004285 if (master)
4286 slave->flags |= IFF_SLAVE;
4287 else
4288 slave->flags &= ~IFF_SLAVE;
4289
4290 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4291 return 0;
4292}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004293EXPORT_SYMBOL(netdev_set_master);
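/*
 * Example (hedged sketch): a bonding-style driver enslaving a device would
 * typically call netdev_set_master() with RTNL already held, roughly as
 * below.  example_enslave and its arguments are hypothetical names.
 *
 *	static int example_enslave(struct net_device *bond_dev,
 *				   struct net_device *slave_dev)
 *	{
 *		int err;
 *
 *		ASSERT_RTNL();
 *		err = netdev_set_master(slave_dev, bond_dev);
 *		if (err)
 *			return err;	// slave already had a master
 *		return 0;
 *	}
 */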
Linus Torvalds1da177e2005-04-16 15:20:36 -07004294
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004295static void dev_change_rx_flags(struct net_device *dev, int flags)
4296{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004297 const struct net_device_ops *ops = dev->netdev_ops;
4298
4299 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4300 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004301}
4302
Wang Chendad9b332008-06-18 01:48:28 -07004303static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07004304{
4305 unsigned short old_flags = dev->flags;
David Howells8192b0c2008-11-14 10:39:10 +11004306 uid_t uid;
4307 gid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07004308
Patrick McHardy24023452007-07-14 18:51:31 -07004309 ASSERT_RTNL();
4310
Wang Chendad9b332008-06-18 01:48:28 -07004311 dev->flags |= IFF_PROMISC;
4312 dev->promiscuity += inc;
4313 if (dev->promiscuity == 0) {
4314 /*
4315 * Avoid overflow.
4316 * If inc causes overflow, untouch promisc and return error.
4317 */
4318 if (inc < 0)
4319 dev->flags &= ~IFF_PROMISC;
4320 else {
4321 dev->promiscuity -= inc;
4322 printk(KERN_WARNING "%s: promiscuity touches roof, "
4323 "set promiscuity failed, promiscuity feature "
4324 "of device might be broken.\n", dev->name);
4325 return -EOVERFLOW;
4326 }
4327 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004328 if (dev->flags != old_flags) {
4329 printk(KERN_INFO "device %s %s promiscuous mode\n",
4330 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4331 "left");
David Howells8192b0c2008-11-14 10:39:10 +11004332 if (audit_enabled) {
4333 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004334 audit_log(current->audit_context, GFP_ATOMIC,
4335 AUDIT_ANOM_PROMISCUOUS,
4336 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4337 dev->name, (dev->flags & IFF_PROMISC),
4338 (old_flags & IFF_PROMISC),
4339 audit_get_loginuid(current),
David Howells8192b0c2008-11-14 10:39:10 +11004340 uid, gid,
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004341 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11004342 }
Patrick McHardy24023452007-07-14 18:51:31 -07004343
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004344 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07004345 }
Wang Chendad9b332008-06-18 01:48:28 -07004346 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004347}
4348
Linus Torvalds1da177e2005-04-16 15:20:36 -07004349/**
4350 * dev_set_promiscuity - update promiscuity count on a device
4351 * @dev: device
4352 * @inc: modifier
4353 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07004354 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004355 * remains above zero the interface remains promiscuous. Once it hits zero
4356 * the device reverts to normal filtering operation. A negative @inc
4357 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07004358 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004359 */
Wang Chendad9b332008-06-18 01:48:28 -07004360int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004361{
4362 unsigned short old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07004363 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004364
Wang Chendad9b332008-06-18 01:48:28 -07004365 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07004366 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07004367 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07004368 if (dev->flags != old_flags)
4369 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07004370 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004371}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004372EXPORT_SYMBOL(dev_set_promiscuity);
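/*
 * Example (illustrative sketch): a feature that must see every frame while
 * it is active, e.g. a packet-tap-like facility, holds a promiscuity
 * reference for its lifetime.  Both calls run under RTNL; the helper names
 * are hypothetical.
 *
 *	static int example_tap_attach(struct net_device *dev)
 *	{
 *		return dev_set_promiscuity(dev, 1);
 *	}
 *
 *	static void example_tap_detach(struct net_device *dev)
 *	{
 *		dev_set_promiscuity(dev, -1);	// drop the reference taken above
 *	}
 */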
Linus Torvalds1da177e2005-04-16 15:20:36 -07004373
4374/**
4375 * dev_set_allmulti - update allmulti count on a device
4376 * @dev: device
4377 * @inc: modifier
4378 *
4379 * Add or remove reception of all multicast frames to a device. While the
4380 * count in the device remains above zero the interface remains listening
4381 * to all multicast frames. Once it hits zero the device reverts to normal
4382 * filtering operation. A negative @inc value is used to drop the counter
4383 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07004384 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004385 */
4386
Wang Chendad9b332008-06-18 01:48:28 -07004387int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004388{
4389 unsigned short old_flags = dev->flags;
4390
Patrick McHardy24023452007-07-14 18:51:31 -07004391 ASSERT_RTNL();
4392
Linus Torvalds1da177e2005-04-16 15:20:36 -07004393 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07004394 dev->allmulti += inc;
4395 if (dev->allmulti == 0) {
4396 /*
4397 * Avoid overflow.
4398 * If inc causes overflow, untouch allmulti and return error.
4399 */
4400 if (inc < 0)
4401 dev->flags &= ~IFF_ALLMULTI;
4402 else {
4403 dev->allmulti -= inc;
4404 printk(KERN_WARNING "%s: allmulti touches roof, "
4405 "set allmulti failed, allmulti feature of "
4406 "device might be broken.\n", dev->name);
4407 return -EOVERFLOW;
4408 }
4409 }
Patrick McHardy24023452007-07-14 18:51:31 -07004410 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004411 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07004412 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07004413 }
Wang Chendad9b332008-06-18 01:48:28 -07004414 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004415}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004416EXPORT_SYMBOL(dev_set_allmulti);
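/*
 * Example (illustrative sketch): a routing or tunnelling protocol that needs
 * every multicast frame while configured keeps an allmulti reference, again
 * under RTNL.  The helper name is hypothetical.
 *
 *	static int example_mcast_monitor_start(struct net_device *dev)
 *	{
 *		return dev_set_allmulti(dev, 1);	// paired with dev_set_allmulti(dev, -1) on stop
 *	}
 */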
Patrick McHardy4417da62007-06-27 01:28:10 -07004417
4418/*
4419 * Upload unicast and multicast address lists to device and
4420 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08004421 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07004422 * are present.
4423 */
4424void __dev_set_rx_mode(struct net_device *dev)
4425{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004426 const struct net_device_ops *ops = dev->netdev_ops;
4427
Patrick McHardy4417da62007-06-27 01:28:10 -07004428 /* dev_open will call this function so the list will stay sane. */
4429 if (!(dev->flags&IFF_UP))
4430 return;
4431
4432 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09004433 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07004434
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004435 if (ops->ndo_set_rx_mode)
4436 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004437 else {
4438 /* Unicast addresses changes may only happen under the rtnl,
4439 * therefore calling __dev_set_promiscuity here is safe.
4440 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004441 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004442 __dev_set_promiscuity(dev, 1);
4443 dev->uc_promisc = 1;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004444 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004445 __dev_set_promiscuity(dev, -1);
4446 dev->uc_promisc = 0;
4447 }
4448
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004449 if (ops->ndo_set_multicast_list)
4450 ops->ndo_set_multicast_list(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004451 }
4452}
4453
4454void dev_set_rx_mode(struct net_device *dev)
4455{
David S. Millerb9e40852008-07-15 00:15:08 -07004456 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004457 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07004458 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004459}
4460
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004461/**
4462 * dev_get_flags - get flags reported to userspace
4463 * @dev: device
4464 *
4465 * Get the combination of flag bits exported through APIs to userspace.
4466 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004467unsigned dev_get_flags(const struct net_device *dev)
4468{
4469 unsigned flags;
4470
4471 flags = (dev->flags & ~(IFF_PROMISC |
4472 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08004473 IFF_RUNNING |
4474 IFF_LOWER_UP |
4475 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07004476 (dev->gflags & (IFF_PROMISC |
4477 IFF_ALLMULTI));
4478
Stefan Rompfb00055a2006-03-20 17:09:11 -08004479 if (netif_running(dev)) {
4480 if (netif_oper_up(dev))
4481 flags |= IFF_RUNNING;
4482 if (netif_carrier_ok(dev))
4483 flags |= IFF_LOWER_UP;
4484 if (netif_dormant(dev))
4485 flags |= IFF_DORMANT;
4486 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004487
4488 return flags;
4489}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004490EXPORT_SYMBOL(dev_get_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004491
Patrick McHardybd380812010-02-26 06:34:53 +00004492int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004493{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004494 int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004495 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004496
Patrick McHardy24023452007-07-14 18:51:31 -07004497 ASSERT_RTNL();
4498
Linus Torvalds1da177e2005-04-16 15:20:36 -07004499 /*
4500 * Set the flags on our device.
4501 */
4502
4503 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4504 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4505 IFF_AUTOMEDIA)) |
4506 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4507 IFF_ALLMULTI));
4508
4509 /*
4510 * Load in the correct multicast list now the flags have changed.
4511 */
4512
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004513 if ((old_flags ^ flags) & IFF_MULTICAST)
4514 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07004515
Patrick McHardy4417da62007-06-27 01:28:10 -07004516 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004517
4518 /*
4519 * Have we downed the interface? We handle IFF_UP ourselves
4520 * according to user attempts to set it, rather than blindly
4521 * setting it.
4522 */
4523
4524 ret = 0;
4525 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00004526 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004527
4528 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07004529 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004530 }
4531
Linus Torvalds1da177e2005-04-16 15:20:36 -07004532 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004533 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4534
Linus Torvalds1da177e2005-04-16 15:20:36 -07004535 dev->gflags ^= IFF_PROMISC;
4536 dev_set_promiscuity(dev, inc);
4537 }
4538
4539 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4540 is important. Some (broken) drivers set IFF_PROMISC when
4541 IFF_ALLMULTI is requested, without asking us and without reporting.
4542 */
4543 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004544 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4545
Linus Torvalds1da177e2005-04-16 15:20:36 -07004546 dev->gflags ^= IFF_ALLMULTI;
4547 dev_set_allmulti(dev, inc);
4548 }
4549
Patrick McHardybd380812010-02-26 06:34:53 +00004550 return ret;
4551}
4552
4553void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4554{
4555 unsigned int changes = dev->flags ^ old_flags;
4556
4557 if (changes & IFF_UP) {
4558 if (dev->flags & IFF_UP)
4559 call_netdevice_notifiers(NETDEV_UP, dev);
4560 else
4561 call_netdevice_notifiers(NETDEV_DOWN, dev);
4562 }
4563
4564 if (dev->flags & IFF_UP &&
4565 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4566 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4567}
4568
4569/**
4570 * dev_change_flags - change device settings
4571 * @dev: device
4572 * @flags: device state flags
4573 *
4574 * Change settings on device based state flags. The flags are
4575 * in the userspace exported format.
4576 */
4577int dev_change_flags(struct net_device *dev, unsigned flags)
4578{
4579 int ret, changes;
4580 int old_flags = dev->flags;
4581
4582 ret = __dev_change_flags(dev, flags);
4583 if (ret < 0)
4584 return ret;
4585
4586 changes = old_flags ^ dev->flags;
Thomas Graf7c355f52007-06-05 16:03:03 -07004587 if (changes)
4588 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004589
Patrick McHardybd380812010-02-26 06:34:53 +00004590 __dev_notify_flags(dev, old_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004591 return ret;
4592}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004593EXPORT_SYMBOL(dev_change_flags);
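/*
 * Example (sketch, not an in-tree caller): bringing an interface up from
 * kernel code uses the same userspace-format flags and must run under
 * rtnl_lock().  example_bring_up is a hypothetical helper.
 *
 *	static int example_bring_up(struct net_device *dev)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = dev_change_flags(dev, dev->flags | IFF_UP);
 *		rtnl_unlock();
 *		return err;
 *	}
 */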
Linus Torvalds1da177e2005-04-16 15:20:36 -07004594
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004595/**
4596 * dev_set_mtu - Change maximum transfer unit
4597 * @dev: device
4598 * @new_mtu: new transfer unit
4599 *
4600 * Change the maximum transfer size of the network device.
4601 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004602int dev_set_mtu(struct net_device *dev, int new_mtu)
4603{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004604 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004605 int err;
4606
4607 if (new_mtu == dev->mtu)
4608 return 0;
4609
4610 /* MTU must not be negative. */
4611 if (new_mtu < 0)
4612 return -EINVAL;
4613
4614 if (!netif_device_present(dev))
4615 return -ENODEV;
4616
4617 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004618 if (ops->ndo_change_mtu)
4619 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004620 else
4621 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004622
Linus Torvalds1da177e2005-04-16 15:20:36 -07004623 if (!err && dev->flags & IFF_UP)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004624 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004625 return err;
4626}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004627EXPORT_SYMBOL(dev_set_mtu);
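/*
 * Example (sketch): a stacked driver propagating its MTU to a lower device.
 * dev_set_mtu() is normally called with RTNL held since it may invoke the
 * driver's ndo_change_mtu and raise NETDEV_CHANGEMTU.  The names below are
 * hypothetical.
 *
 *	static int example_sync_mtu(struct net_device *upper,
 *				    struct net_device *lower)
 *	{
 *		ASSERT_RTNL();
 *		return dev_set_mtu(lower, upper->mtu);
 *	}
 */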
Linus Torvalds1da177e2005-04-16 15:20:36 -07004628
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004629/**
Vlad Dogarucbda10f2011-01-13 23:38:30 +00004630 * dev_set_group - Change group this device belongs to
4631 * @dev: device
4632 * @new_group: group this device should belong to
4633 */
4634void dev_set_group(struct net_device *dev, int new_group)
4635{
4636 dev->group = new_group;
4637}
4638EXPORT_SYMBOL(dev_set_group);
4639
4640/**
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004641 * dev_set_mac_address - Change Media Access Control Address
4642 * @dev: device
4643 * @sa: new address
4644 *
4645 * Change the hardware (MAC) address of the device
4646 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004647int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4648{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004649 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004650 int err;
4651
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004652 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004653 return -EOPNOTSUPP;
4654 if (sa->sa_family != dev->type)
4655 return -EINVAL;
4656 if (!netif_device_present(dev))
4657 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004658 err = ops->ndo_set_mac_address(dev, sa);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004659 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004660 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004661 return err;
4662}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004663EXPORT_SYMBOL(dev_set_mac_address);
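/*
 * Example (sketch): setting a MAC address from kernel code.  The sockaddr
 * must carry the device's hardware type, and dev->addr_len is assumed to
 * fit in sa_data (true for Ethernet).  example_set_mac is hypothetical.
 *
 *	static int example_set_mac(struct net_device *dev, const u8 *addr)
 *	{
 *		struct sockaddr sa;
 *
 *		sa.sa_family = dev->type;
 *		memcpy(sa.sa_data, addr, dev->addr_len);
 *		return dev_set_mac_address(dev, &sa);
 *	}
 */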
Linus Torvalds1da177e2005-04-16 15:20:36 -07004664
4665/*
Eric Dumazet3710bec2009-11-01 19:42:09 +00004666 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07004667 */
Jeff Garzik14e3e072007-10-08 00:06:32 -07004668static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004669{
4670 int err;
Eric Dumazet3710bec2009-11-01 19:42:09 +00004671 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004672
4673 if (!dev)
4674 return -ENODEV;
4675
4676 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004677 case SIOCGIFFLAGS: /* Get interface flags */
4678 ifr->ifr_flags = (short) dev_get_flags(dev);
4679 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004680
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004681 case SIOCGIFMETRIC: /* Get the metric on the interface
4682 (currently unused) */
4683 ifr->ifr_metric = 0;
4684 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004685
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004686 case SIOCGIFMTU: /* Get the MTU of a device */
4687 ifr->ifr_mtu = dev->mtu;
4688 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004689
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004690 case SIOCGIFHWADDR:
4691 if (!dev->addr_len)
4692 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4693 else
4694 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4695 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4696 ifr->ifr_hwaddr.sa_family = dev->type;
4697 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004698
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004699 case SIOCGIFSLAVE:
4700 err = -EINVAL;
4701 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004702
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004703 case SIOCGIFMAP:
4704 ifr->ifr_map.mem_start = dev->mem_start;
4705 ifr->ifr_map.mem_end = dev->mem_end;
4706 ifr->ifr_map.base_addr = dev->base_addr;
4707 ifr->ifr_map.irq = dev->irq;
4708 ifr->ifr_map.dma = dev->dma;
4709 ifr->ifr_map.port = dev->if_port;
4710 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004711
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004712 case SIOCGIFINDEX:
4713 ifr->ifr_ifindex = dev->ifindex;
4714 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004715
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004716 case SIOCGIFTXQLEN:
4717 ifr->ifr_qlen = dev->tx_queue_len;
4718 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004719
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004720 default:
4721 /* dev_ioctl() should ensure this case
4722 * is never reached
4723 */
4724 WARN_ON(1);
4725 err = -EINVAL;
4726 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004727
4728 }
4729 return err;
4730}
4731
4732/*
4733 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4734 */
4735static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4736{
4737 int err;
4738 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004739 const struct net_device_ops *ops;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004740
4741 if (!dev)
4742 return -ENODEV;
4743
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004744 ops = dev->netdev_ops;
4745
Jeff Garzik14e3e072007-10-08 00:06:32 -07004746 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004747 case SIOCSIFFLAGS: /* Set interface flags */
4748 return dev_change_flags(dev, ifr->ifr_flags);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004749
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004750 case SIOCSIFMETRIC: /* Set the metric on the interface
4751 (currently unused) */
4752 return -EOPNOTSUPP;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004753
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004754 case SIOCSIFMTU: /* Set the MTU of a device */
4755 return dev_set_mtu(dev, ifr->ifr_mtu);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004756
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004757 case SIOCSIFHWADDR:
4758 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004759
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004760 case SIOCSIFHWBROADCAST:
4761 if (ifr->ifr_hwaddr.sa_family != dev->type)
4762 return -EINVAL;
4763 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4764 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4765 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4766 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004767
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004768 case SIOCSIFMAP:
4769 if (ops->ndo_set_config) {
4770 if (!netif_device_present(dev))
4771 return -ENODEV;
4772 return ops->ndo_set_config(dev, &ifr->ifr_map);
4773 }
4774 return -EOPNOTSUPP;
4775
4776 case SIOCADDMULTI:
4777 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4778 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4779 return -EINVAL;
4780 if (!netif_device_present(dev))
4781 return -ENODEV;
Jiri Pirko22bedad32010-04-01 21:22:57 +00004782 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004783
4784 case SIOCDELMULTI:
4785 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4786 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4787 return -EINVAL;
4788 if (!netif_device_present(dev))
4789 return -ENODEV;
Jiri Pirko22bedad32010-04-01 21:22:57 +00004790 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004791
4792 case SIOCSIFTXQLEN:
4793 if (ifr->ifr_qlen < 0)
4794 return -EINVAL;
4795 dev->tx_queue_len = ifr->ifr_qlen;
4796 return 0;
4797
4798 case SIOCSIFNAME:
4799 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4800 return dev_change_name(dev, ifr->ifr_newname);
4801
4802 /*
4803 * Unknown or private ioctl
4804 */
4805 default:
4806 if ((cmd >= SIOCDEVPRIVATE &&
4807 cmd <= SIOCDEVPRIVATE + 15) ||
4808 cmd == SIOCBONDENSLAVE ||
4809 cmd == SIOCBONDRELEASE ||
4810 cmd == SIOCBONDSETHWADDR ||
4811 cmd == SIOCBONDSLAVEINFOQUERY ||
4812 cmd == SIOCBONDINFOQUERY ||
4813 cmd == SIOCBONDCHANGEACTIVE ||
4814 cmd == SIOCGMIIPHY ||
4815 cmd == SIOCGMIIREG ||
4816 cmd == SIOCSMIIREG ||
4817 cmd == SIOCBRADDIF ||
4818 cmd == SIOCBRDELIF ||
4819 cmd == SIOCSHWTSTAMP ||
4820 cmd == SIOCWANDEV) {
4821 err = -EOPNOTSUPP;
4822 if (ops->ndo_do_ioctl) {
4823 if (netif_device_present(dev))
4824 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4825 else
4826 err = -ENODEV;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004827 }
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004828 } else
4829 err = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004830
4831 }
4832 return err;
4833}
4834
4835/*
4836 * This function handles all "interface"-type I/O control requests. The actual
4837 * 'doing' part of this is dev_ifsioc above.
4838 */
4839
4840/**
4841 * dev_ioctl - network device ioctl
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004842 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004843 * @cmd: command to issue
4844 * @arg: pointer to a struct ifreq in user space
4845 *
4846 * Issue ioctl functions to devices. This is normally called by the
4847 * user space syscall interfaces but can sometimes be useful for
4848 * other purposes. The return value is the return from the syscall if
4849 * positive or a negative errno code on error.
4850 */
4851
Eric W. Biederman881d9662007-09-17 11:56:21 -07004852int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004853{
4854 struct ifreq ifr;
4855 int ret;
4856 char *colon;
4857
4858 /* One special case: SIOCGIFCONF takes ifconf argument
4859 and requires shared lock, because it sleeps writing
4860 to user space.
4861 */
4862
4863 if (cmd == SIOCGIFCONF) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004864 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07004865 ret = dev_ifconf(net, (char __user *) arg);
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004866 rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004867 return ret;
4868 }
4869 if (cmd == SIOCGIFNAME)
Eric W. Biederman881d9662007-09-17 11:56:21 -07004870 return dev_ifname(net, (struct ifreq __user *)arg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004871
4872 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4873 return -EFAULT;
4874
4875 ifr.ifr_name[IFNAMSIZ-1] = 0;
4876
4877 colon = strchr(ifr.ifr_name, ':');
4878 if (colon)
4879 *colon = 0;
4880
4881 /*
4882 * See which interface the caller is talking about.
4883 */
4884
4885 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004886 /*
4887 * These ioctl calls:
4888 * - can be done by all.
4889 * - atomic and do not require locking.
4890 * - return a value
4891 */
4892 case SIOCGIFFLAGS:
4893 case SIOCGIFMETRIC:
4894 case SIOCGIFMTU:
4895 case SIOCGIFHWADDR:
4896 case SIOCGIFSLAVE:
4897 case SIOCGIFMAP:
4898 case SIOCGIFINDEX:
4899 case SIOCGIFTXQLEN:
4900 dev_load(net, ifr.ifr_name);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004901 rcu_read_lock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004902 ret = dev_ifsioc_locked(net, &ifr, cmd);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004903 rcu_read_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004904 if (!ret) {
4905 if (colon)
4906 *colon = ':';
4907 if (copy_to_user(arg, &ifr,
4908 sizeof(struct ifreq)))
4909 ret = -EFAULT;
4910 }
4911 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004912
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004913 case SIOCETHTOOL:
4914 dev_load(net, ifr.ifr_name);
4915 rtnl_lock();
4916 ret = dev_ethtool(net, &ifr);
4917 rtnl_unlock();
4918 if (!ret) {
4919 if (colon)
4920 *colon = ':';
4921 if (copy_to_user(arg, &ifr,
4922 sizeof(struct ifreq)))
4923 ret = -EFAULT;
4924 }
4925 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004926
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004927 /*
4928 * These ioctl calls:
4929 * - require superuser power.
4930 * - require strict serialization.
4931 * - return a value
4932 */
4933 case SIOCGMIIPHY:
4934 case SIOCGMIIREG:
4935 case SIOCSIFNAME:
4936 if (!capable(CAP_NET_ADMIN))
4937 return -EPERM;
4938 dev_load(net, ifr.ifr_name);
4939 rtnl_lock();
4940 ret = dev_ifsioc(net, &ifr, cmd);
4941 rtnl_unlock();
4942 if (!ret) {
4943 if (colon)
4944 *colon = ':';
4945 if (copy_to_user(arg, &ifr,
4946 sizeof(struct ifreq)))
4947 ret = -EFAULT;
4948 }
4949 return ret;
4950
4951 /*
4952 * These ioctl calls:
4953 * - require superuser power.
4954 * - require strict serialization.
4955 * - do not return a value
4956 */
4957 case SIOCSIFFLAGS:
4958 case SIOCSIFMETRIC:
4959 case SIOCSIFMTU:
4960 case SIOCSIFMAP:
4961 case SIOCSIFHWADDR:
4962 case SIOCSIFSLAVE:
4963 case SIOCADDMULTI:
4964 case SIOCDELMULTI:
4965 case SIOCSIFHWBROADCAST:
4966 case SIOCSIFTXQLEN:
4967 case SIOCSMIIREG:
4968 case SIOCBONDENSLAVE:
4969 case SIOCBONDRELEASE:
4970 case SIOCBONDSETHWADDR:
4971 case SIOCBONDCHANGEACTIVE:
4972 case SIOCBRADDIF:
4973 case SIOCBRDELIF:
4974 case SIOCSHWTSTAMP:
4975 if (!capable(CAP_NET_ADMIN))
4976 return -EPERM;
4977 /* fall through */
4978 case SIOCBONDSLAVEINFOQUERY:
4979 case SIOCBONDINFOQUERY:
4980 dev_load(net, ifr.ifr_name);
4981 rtnl_lock();
4982 ret = dev_ifsioc(net, &ifr, cmd);
4983 rtnl_unlock();
4984 return ret;
4985
4986 case SIOCGIFMEM:
4987 /* Get the per device memory space. We can add this but
4988 * currently do not support it */
4989 case SIOCSIFMEM:
4990 /* Set the per device memory buffer space.
4991 * Not applicable in our case */
4992 case SIOCSIFLINK:
4993 return -EINVAL;
4994
4995 /*
4996 * Unknown or private ioctl.
4997 */
4998 default:
4999 if (cmd == SIOCWANDEV ||
5000 (cmd >= SIOCDEVPRIVATE &&
5001 cmd <= SIOCDEVPRIVATE + 15)) {
Eric W. Biederman881d9662007-09-17 11:56:21 -07005002 dev_load(net, ifr.ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005003 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07005004 ret = dev_ifsioc(net, &ifr, cmd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005005 rtnl_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005006 if (!ret && copy_to_user(arg, &ifr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07005007 sizeof(struct ifreq)))
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005008 ret = -EFAULT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005009 return ret;
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005010 }
5011 /* Take care of Wireless Extensions */
5012 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5013 return wext_handle_ioctl(net, &ifr, cmd, arg);
5014 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005015 }
5016}
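/*
 * Example (userspace side, illustrative only): the usual consumer of the
 * ioctls dispatched above fills in a struct ifreq and issues the request on
 * any socket, e.g. reading an interface's MTU ("eth0" is a placeholder):
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *	if (fd >= 0 && ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu %d\n", ifr.ifr_mtu);
 */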
5017
5018
5019/**
5020 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07005021 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07005022 *
5023 * Returns a suitable unique value for a new device interface
5024 * number. The caller must hold the rtnl semaphore or the
5025 * dev_base_lock to be sure it remains unique.
5026 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07005027static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005028{
5029 static int ifindex;
5030 for (;;) {
5031 if (++ifindex <= 0)
5032 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005033 if (!__dev_get_by_index(net, ifindex))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005034 return ifindex;
5035 }
5036}
5037
Linus Torvalds1da177e2005-04-16 15:20:36 -07005038/* Delayed registration/unregisteration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08005039static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005040
Stephen Hemminger6f05f622007-03-08 20:46:03 -08005041static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005042{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005043 list_add_tail(&dev->todo_list, &net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005044}
5045
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005046static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005047{
Krishna Kumare93737b2009-12-08 22:26:02 +00005048 struct net_device *dev, *tmp;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005049
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005050 BUG_ON(dev_boot_phase);
5051 ASSERT_RTNL();
5052
Krishna Kumare93737b2009-12-08 22:26:02 +00005053 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005054 /* Some devices call without registering
Krishna Kumare93737b2009-12-08 22:26:02 +00005055 * for initialization unwind. Remove those
5056 * devices and proceed with the remaining.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005057 */
5058 if (dev->reg_state == NETREG_UNINITIALIZED) {
5059 pr_debug("unregister_netdevice: device %s/%p never "
5060 "was registered\n", dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005061
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005062 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00005063 list_del(&dev->unreg_list);
5064 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005065 }
5066
5067 BUG_ON(dev->reg_state != NETREG_REGISTERED);
Octavian Purdila44345722010-12-13 12:44:07 +00005068 }
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005069
Octavian Purdila44345722010-12-13 12:44:07 +00005070 /* If device is running, close it first. */
5071 dev_close_many(head);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005072
Octavian Purdila44345722010-12-13 12:44:07 +00005073 list_for_each_entry(dev, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005074 /* And unlink it from device chain. */
5075 unlist_netdevice(dev);
5076
5077 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005078 }
5079
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005080 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005081
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005082 list_for_each_entry(dev, head, unreg_list) {
5083 /* Shutdown queueing discipline. */
5084 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005085
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005086
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005087 /* Notify protocols, that we are about to destroy
5088 this device. They should clean all the things.
5089 */
5090 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5091
Patrick McHardya2835762010-02-26 06:34:51 +00005092 if (!dev->rtnl_link_ops ||
5093 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5094 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5095
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005096 /*
5097 * Flush the unicast and multicast chains
5098 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005099 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005100 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005101
5102 if (dev->netdev_ops->ndo_uninit)
5103 dev->netdev_ops->ndo_uninit(dev);
5104
5105 /* Notifier chain MUST detach us from master device. */
5106 WARN_ON(dev->master);
5107
5108 /* Remove entries from kobject tree */
5109 netdev_unregister_kobject(dev);
5110 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005111
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005112 /* Process any work delayed until the end of the batch */
stephen hemmingere5e26d72010-02-24 14:01:38 +00005113 dev = list_first_entry(head, struct net_device, unreg_list);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005114 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5115
Eric Dumazetef885af2010-09-13 12:24:54 +00005116 rcu_barrier();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005117
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005118 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005119 dev_put(dev);
5120}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005121
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005122static void rollback_registered(struct net_device *dev)
5123{
5124 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005125
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005126 list_add(&dev->unreg_list, &single);
5127 rollback_registered_many(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005128}
5129
Herbert Xub63365a2008-10-23 01:11:29 -07005130unsigned long netdev_fix_features(unsigned long features, const char *name)
5131{
5132 /* Fix illegal SG+CSUM combinations. */
5133 if ((features & NETIF_F_SG) &&
5134 !(features & NETIF_F_ALL_CSUM)) {
5135 if (name)
5136 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5137 "checksum feature.\n", name);
5138 features &= ~NETIF_F_SG;
5139 }
5140
5141 /* TSO requires that SG is present as well. */
5142 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5143 if (name)
5144 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5145 "SG feature.\n", name);
5146 features &= ~NETIF_F_TSO;
5147 }
5148
5149 if (features & NETIF_F_UFO) {
Michał Mirosław79032642010-11-30 06:38:00 +00005150 /* maybe split UFO into V4 and V6? */
5151 if (!((features & NETIF_F_GEN_CSUM) ||
5152 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5153 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Herbert Xub63365a2008-10-23 01:11:29 -07005154 if (name)
5155 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
Michał Mirosław79032642010-11-30 06:38:00 +00005156 "since no checksum offload features.\n",
Herbert Xub63365a2008-10-23 01:11:29 -07005157 name);
5158 features &= ~NETIF_F_UFO;
5159 }
5160
5161 if (!(features & NETIF_F_SG)) {
5162 if (name)
5163 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5164 "since no NETIF_F_SG feature.\n", name);
5165 features &= ~NETIF_F_UFO;
5166 }
5167 }
5168
5169 return features;
5170}
5171EXPORT_SYMBOL(netdev_fix_features);
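/*
 * Example (sketch): a driver recomputing its feature set, e.g. from an
 * ethtool flags handler, would pass the wanted mask through
 * netdev_fix_features() so the SG/CSUM/TSO/UFO dependencies enforced above
 * keep holding.  example_update_features is a hypothetical helper.
 *
 *	static void example_update_features(struct net_device *dev,
 *					    unsigned long wanted)
 *	{
 *		dev->features = netdev_fix_features(wanted, dev->name);
 *	}
 */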
5172
Linus Torvalds1da177e2005-04-16 15:20:36 -07005173/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005174 * netif_stacked_transfer_operstate - transfer operstate
5175 * @rootdev: the root or lower level device to transfer state from
5176 * @dev: the device to transfer operstate to
5177 *
5178 * Transfer operational state from root to device. This is normally
5179 * called when a stacking relationship exists between the root
5180 * device and the device (a leaf device).
5181 */
5182void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5183 struct net_device *dev)
5184{
5185 if (rootdev->operstate == IF_OPER_DORMANT)
5186 netif_dormant_on(dev);
5187 else
5188 netif_dormant_off(dev);
5189
5190 if (netif_carrier_ok(rootdev)) {
5191 if (!netif_carrier_ok(dev))
5192 netif_carrier_on(dev);
5193 } else {
5194 if (netif_carrier_ok(dev))
5195 netif_carrier_off(dev);
5196 }
5197}
5198EXPORT_SYMBOL(netif_stacked_transfer_operstate);
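/*
 * Example (sketch): a VLAN/macvlan-style driver mirrors the lower device's
 * operational state into its upper device, typically from a NETDEV_CHANGE
 * notifier.  example_lower_changed is a hypothetical helper.
 *
 *	static void example_lower_changed(struct net_device *lower,
 *					  struct net_device *upper)
 *	{
 *		netif_stacked_transfer_operstate(lower, upper);
 *	}
 */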
5199
Tom Herbertbf264142010-11-26 08:36:09 +00005200#ifdef CONFIG_RPS
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005201static int netif_alloc_rx_queues(struct net_device *dev)
5202{
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005203 unsigned int i, count = dev->num_rx_queues;
Tom Herbertbd25fa72010-10-18 18:00:16 +00005204 struct netdev_rx_queue *rx;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005205
Tom Herbertbd25fa72010-10-18 18:00:16 +00005206 BUG_ON(count < 1);
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005207
Tom Herbertbd25fa72010-10-18 18:00:16 +00005208 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5209 if (!rx) {
5210 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5211 return -ENOMEM;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005212 }
Tom Herbertbd25fa72010-10-18 18:00:16 +00005213 dev->_rx = rx;
5214
Tom Herbertbd25fa72010-10-18 18:00:16 +00005215 for (i = 0; i < count; i++)
Tom Herbertfe822242010-11-09 10:47:38 +00005216 rx[i].dev = dev;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005217 return 0;
5218}
Tom Herbertbf264142010-11-26 08:36:09 +00005219#endif
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005220
Changli Gaoaa942102010-12-04 02:31:41 +00005221static void netdev_init_one_queue(struct net_device *dev,
5222 struct netdev_queue *queue, void *_unused)
5223{
5224 /* Initialize queue lock */
5225 spin_lock_init(&queue->_xmit_lock);
5226 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5227 queue->xmit_lock_owner = -1;
Changli Gaob236da62010-12-14 03:09:15 +00005228 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
Changli Gaoaa942102010-12-04 02:31:41 +00005229 queue->dev = dev;
5230}
5231
Tom Herberte6484932010-10-18 18:04:39 +00005232static int netif_alloc_netdev_queues(struct net_device *dev)
5233{
5234 unsigned int count = dev->num_tx_queues;
5235 struct netdev_queue *tx;
5236
5237 BUG_ON(count < 1);
5238
5239 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5240 if (!tx) {
5241 pr_err("netdev: Unable to allocate %u tx queues.\n",
5242 count);
5243 return -ENOMEM;
5244 }
5245 dev->_tx = tx;
Tom Herbert1d24eb42010-11-21 13:17:27 +00005246
Tom Herberte6484932010-10-18 18:04:39 +00005247 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5248 spin_lock_init(&dev->tx_global_lock);
Changli Gaoaa942102010-12-04 02:31:41 +00005249
5250 return 0;
Tom Herberte6484932010-10-18 18:04:39 +00005251}
5252
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005253/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005254 * register_netdevice - register a network device
5255 * @dev: device to register
5256 *
5257 * Take a completed network device structure and add it to the kernel
5258 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5259 * chain. 0 is returned on success. A negative errno code is returned
5260 * on a failure to set up the device, or if the name is a duplicate.
5261 *
5262 * Callers must hold the rtnl semaphore. You may want
5263 * register_netdev() instead of this.
5264 *
5265 * BUGS:
5266 * The locking appears insufficient to guarantee two parallel registers
5267 * will not get the same name.
5268 */
5269
5270int register_netdevice(struct net_device *dev)
5271{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005272 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005273 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005274
5275 BUG_ON(dev_boot_phase);
5276 ASSERT_RTNL();
5277
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005278 might_sleep();
5279
Linus Torvalds1da177e2005-04-16 15:20:36 -07005280 /* When net_device's are persistent, this will be fatal. */
5281 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005282 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005283
David S. Millerf1f28aa2008-07-15 00:08:33 -07005284 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07005285 netdev_set_addr_lockdep_class(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005286
Linus Torvalds1da177e2005-04-16 15:20:36 -07005287 dev->iflink = -1;
5288
5289 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005290 if (dev->netdev_ops->ndo_init) {
5291 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005292 if (ret) {
5293 if (ret > 0)
5294 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08005295 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005296 }
5297 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005298
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00005299 ret = dev_get_valid_name(dev, dev->name, 0);
Octavian Purdilad9031022009-11-18 02:36:59 +00005300 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005301 goto err_uninit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005302
Eric W. Biederman881d9662007-09-17 11:56:21 -07005303 dev->ifindex = dev_new_index(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005304 if (dev->iflink == -1)
5305 dev->iflink = dev->ifindex;
5306
Stephen Hemmingerd212f872007-06-27 00:47:37 -07005307 /* Fix illegal checksum combinations */
5308 if ((dev->features & NETIF_F_HW_CSUM) &&
5309 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5310 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5311 dev->name);
5312 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5313 }
5314
5315 if ((dev->features & NETIF_F_NO_CSUM) &&
5316 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5317 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5318 dev->name);
5319 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5320 }
5321
Herbert Xub63365a2008-10-23 01:11:29 -07005322 dev->features = netdev_fix_features(dev->features, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005323
Lennert Buytenheke5a4a722008-08-03 01:23:10 -07005324 /* Enable software GSO if SG is supported. */
5325 if (dev->features & NETIF_F_SG)
5326 dev->features |= NETIF_F_GSO;
5327
Eric Dumazetc5256c52010-09-23 00:46:11 +00005328	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default;
5329	 * vlan_dev_init() will do the dev->features check, so these features
5330	 * are enabled only if supported by the underlying device.
Brandon Philips16c3ea72010-09-15 09:24:24 +00005331 */
Eric Dumazetc5256c52010-09-23 00:46:11 +00005332 dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
Brandon Philips16c3ea72010-09-15 09:24:24 +00005333
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005334 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5335 ret = notifier_to_errno(ret);
5336 if (ret)
5337 goto err_uninit;
5338
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005339 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005340 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005341 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005342 dev->reg_state = NETREG_REGISTERED;
5343
Linus Torvalds1da177e2005-04-16 15:20:36 -07005344 /*
5345 * Default initial state at registry is that the
5346 * device is present.
5347 */
5348
5349 set_bit(__LINK_STATE_PRESENT, &dev->state);
5350
Linus Torvalds1da177e2005-04-16 15:20:36 -07005351 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005352 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005353 list_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005354
5355	/* Notify protocols that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005356 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005357 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005358 if (ret) {
5359 rollback_registered(dev);
5360 dev->reg_state = NETREG_UNREGISTERED;
5361 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005362 /*
5363 * Prevent userspace races by waiting until the network
5364 * device is fully setup before sending notifications.
5365 */
Patrick McHardya2835762010-02-26 06:34:51 +00005366 if (!dev->rtnl_link_ops ||
5367 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5368 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005369
5370out:
5371 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005372
5373err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005374 if (dev->netdev_ops->ndo_uninit)
5375 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005376 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005377}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005378EXPORT_SYMBOL(register_netdevice);
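
/*
 * Example (illustrative sketch, not part of the original file): callers of
 * register_netdevice() already hold the RTNL lock, e.g. an rtnl_link_ops
 * ->newlink() handler.  The handler and helper names below are made up;
 * on error the rtnetlink core, not the handler, frees the device.
 *
 *	static int example_newlink(struct net *src_net, struct net_device *dev,
 *				   struct nlattr *tb[], struct nlattr *data[])
 *	{
 *		example_parse_attrs(dev, data);
 *		return register_netdevice(dev);
 *	}
 *
 * Use register_netdev() instead when the RTNL lock is not already held.
 */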
Linus Torvalds1da177e2005-04-16 15:20:36 -07005379
5380/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005381 * init_dummy_netdev - init a dummy network device for NAPI
5382 * @dev: device to init
5383 *
5384	 * This takes a network device structure and initializes the minimum
5385	 * number of fields so it can be used to schedule NAPI polls without
5386	 * registering a full-blown interface. This is to be used by drivers
5387 * that need to tie several hardware interfaces to a single NAPI
5388 * poll scheduler due to HW limitations.
5389 */
5390int init_dummy_netdev(struct net_device *dev)
5391{
5392 /* Clear everything. Note we don't initialize spinlocks
5393	 * as they aren't supposed to be taken by any of the
5394 * NAPI code and this dummy netdev is supposed to be
5395 * only ever used for NAPI polls
5396 */
5397 memset(dev, 0, sizeof(struct net_device));
5398
5399 /* make sure we BUG if trying to hit standard
5400 * register/unregister code path
5401 */
5402 dev->reg_state = NETREG_DUMMY;
5403
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005404 /* NAPI wants this */
5405 INIT_LIST_HEAD(&dev->napi_list);
5406
5407 /* a dummy interface is started by default */
5408 set_bit(__LINK_STATE_PRESENT, &dev->state);
5409 set_bit(__LINK_STATE_START, &dev->state);
5410
Eric Dumazet29b44332010-10-11 10:22:12 +00005411	/* Note: We don't allocate pcpu_refcnt for dummy devices,
5412	 * because users of this 'device' don't need to change
5413 * its refcount.
5414 */
5415
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005416 return 0;
5417}
5418EXPORT_SYMBOL_GPL(init_dummy_netdev);
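
/*
 * Example (illustrative sketch, not part of the original file): a driver
 * that needs one NAPI context shared by several hardware interfaces can
 * anchor it on a dummy netdev.  "example_adapter" and "example_poll" are
 * hypothetical names.
 *
 *	struct example_adapter {
 *		struct net_device napi_dev;	(never registered)
 *		struct napi_struct napi;
 *	};
 *
 *	static void example_setup_napi(struct example_adapter *ad)
 *	{
 *		init_dummy_netdev(&ad->napi_dev);
 *		netif_napi_add(&ad->napi_dev, &ad->napi, example_poll, 64);
 *		napi_enable(&ad->napi);
 *	}
 */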
5419
5420
5421/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005422 * register_netdev - register a network device
5423 * @dev: device to register
5424 *
5425 * Take a completed network device structure and add it to the kernel
5426 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5427 * chain. 0 is returned on success. A negative errno code is returned
5428 * on a failure to set up the device, or if the name is a duplicate.
5429 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005430 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005431 * and expands the device name if you passed a format string to
5432 * alloc_netdev.
5433 */
5434int register_netdev(struct net_device *dev)
5435{
5436 int err;
5437
5438 rtnl_lock();
5439
5440 /*
5441 * If the name is a format string the caller wants us to do a
5442 * name allocation.
5443 */
5444 if (strchr(dev->name, '%')) {
5445 err = dev_alloc_name(dev, dev->name);
5446 if (err < 0)
5447 goto out;
5448 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005449
Linus Torvalds1da177e2005-04-16 15:20:36 -07005450 err = register_netdevice(dev);
5451out:
5452 rtnl_unlock();
5453 return err;
5454}
5455EXPORT_SYMBOL(register_netdev);
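
/*
 * Example (illustrative sketch, not part of the original file): typical
 * module_init() usage; a '%d' in the name asks the core to pick a unit
 * number.  "example_setup" and "example_dev" are hypothetical.
 *
 *	static struct net_device *example_dev;
 *
 *	static int __init example_init(void)
 *	{
 *		int err;
 *
 *		example_dev = alloc_netdev(0, "example%d", example_setup);
 *		if (!example_dev)
 *			return -ENOMEM;
 *		err = register_netdev(example_dev);
 *		if (err)
 *			free_netdev(example_dev);
 *		return err;
 *	}
 */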
5456
Eric Dumazet29b44332010-10-11 10:22:12 +00005457int netdev_refcnt_read(const struct net_device *dev)
5458{
5459 int i, refcnt = 0;
5460
5461 for_each_possible_cpu(i)
5462 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5463 return refcnt;
5464}
5465EXPORT_SYMBOL(netdev_refcnt_read);
5466
Linus Torvalds1da177e2005-04-16 15:20:36 -07005467/*
5468 * netdev_wait_allrefs - wait until all references are gone.
5469 *
5470 * This is called when unregistering network devices.
5471 *
5472 * Any protocol or device that holds a reference should register
5473 * for netdevice notification, and cleanup and put back the
5474 * reference if they receive an UNREGISTER event.
5475 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005476 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005477 */
5478static void netdev_wait_allrefs(struct net_device *dev)
5479{
5480 unsigned long rebroadcast_time, warning_time;
Eric Dumazet29b44332010-10-11 10:22:12 +00005481 int refcnt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005482
Eric Dumazete014deb2009-11-17 05:59:21 +00005483 linkwatch_forget_dev(dev);
5484
Linus Torvalds1da177e2005-04-16 15:20:36 -07005485 rebroadcast_time = warning_time = jiffies;
Eric Dumazet29b44332010-10-11 10:22:12 +00005486 refcnt = netdev_refcnt_read(dev);
5487
5488 while (refcnt != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005489 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005490 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005491
5492 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005493 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005494 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
Octavian Purdila395264d2009-11-16 13:49:35 +00005495			 * should have already handled it the first time */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005496
5497 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5498 &dev->state)) {
5499 /* We must not have linkwatch events
5500 * pending on unregister. If this
5501 * happens, we simply run the queue
5502 * unscheduled, resulting in a noop
5503 * for this device.
5504 */
5505 linkwatch_run_queue();
5506 }
5507
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005508 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005509
5510 rebroadcast_time = jiffies;
5511 }
5512
5513 msleep(250);
5514
Eric Dumazet29b44332010-10-11 10:22:12 +00005515 refcnt = netdev_refcnt_read(dev);
5516
Linus Torvalds1da177e2005-04-16 15:20:36 -07005517 if (time_after(jiffies, warning_time + 10 * HZ)) {
5518 printk(KERN_EMERG "unregister_netdevice: "
5519 "waiting for %s to become free. Usage "
5520 "count = %d\n",
Eric Dumazet29b44332010-10-11 10:22:12 +00005521 dev->name, refcnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005522 warning_time = jiffies;
5523 }
5524 }
5525}
5526
5527/* The sequence is:
5528 *
5529 * rtnl_lock();
5530 * ...
5531 * register_netdevice(x1);
5532 * register_netdevice(x2);
5533 * ...
5534 * unregister_netdevice(y1);
5535 * unregister_netdevice(y2);
5536 * ...
5537 * rtnl_unlock();
5538 * free_netdev(y1);
5539 * free_netdev(y2);
5540 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07005541 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07005542 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005543 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07005544 * without deadlocking with linkwatch via keventd.
5545 * 2) Since we run with the RTNL semaphore not held, we can sleep
5546 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07005547 *
5548 * We must not return until all unregister events added during
5549 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005550 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005551void netdev_run_todo(void)
5552{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005553 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005554
Linus Torvalds1da177e2005-04-16 15:20:36 -07005555 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005556 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07005557
5558 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005559
Linus Torvalds1da177e2005-04-16 15:20:36 -07005560 while (!list_empty(&list)) {
5561 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00005562 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005563 list_del(&dev->todo_list);
5564
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005565 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005566 printk(KERN_ERR "network todo '%s' but state %d\n",
5567 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005568 dump_stack();
5569 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005570 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005571
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005572 dev->reg_state = NETREG_UNREGISTERED;
5573
Changli Gao152102c2010-03-30 20:16:22 +00005574 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07005575
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005576 netdev_wait_allrefs(dev);
5577
5578 /* paranoia */
Eric Dumazet29b44332010-10-11 10:22:12 +00005579 BUG_ON(netdev_refcnt_read(dev));
Eric Dumazet95ae6b22010-09-15 04:04:31 +00005580 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
Eric Dumazet198caec2010-10-24 21:32:05 +00005581 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07005582 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005583
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005584 if (dev->destructor)
5585 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07005586
5587 /* Free network device */
5588 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005589 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005590}
5591
Ben Hutchings3cfde792010-07-09 09:11:52 +00005592/* Convert net_device_stats to rtnl_link_stats64. They have the same
5593 * fields in the same order, with only the type differing.
5594 */
5595static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5596 const struct net_device_stats *netdev_stats)
5597{
5598#if BITS_PER_LONG == 64
5599 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5600 memcpy(stats64, netdev_stats, sizeof(*stats64));
5601#else
5602 size_t i, n = sizeof(*stats64) / sizeof(u64);
5603 const unsigned long *src = (const unsigned long *)netdev_stats;
5604 u64 *dst = (u64 *)stats64;
5605
5606 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5607 sizeof(*stats64) / sizeof(u64));
5608 for (i = 0; i < n; i++)
5609 dst[i] = src[i];
5610#endif
5611}
5612
Eric Dumazetd83345a2009-11-16 03:36:51 +00005613/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005614 * dev_get_stats - get network device statistics
5615 * @dev: device to get statistics from
Eric Dumazet28172732010-07-07 14:58:56 -07005616 * @storage: place to store stats
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005617 *
Ben Hutchingsd7753512010-07-09 09:12:41 +00005618 * Get network statistics from device. Return @storage.
5619 * The device driver may provide its own method by setting
5620 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5621 * otherwise the internal statistics structure is used.
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005622 */
Ben Hutchingsd7753512010-07-09 09:12:41 +00005623struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5624 struct rtnl_link_stats64 *storage)
Eric Dumazet7004bf22009-05-18 00:34:33 +00005625{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005626 const struct net_device_ops *ops = dev->netdev_ops;
5627
Eric Dumazet28172732010-07-07 14:58:56 -07005628 if (ops->ndo_get_stats64) {
5629 memset(storage, 0, sizeof(*storage));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005630 ops->ndo_get_stats64(dev, storage);
5631 } else if (ops->ndo_get_stats) {
Ben Hutchings3cfde792010-07-09 09:11:52 +00005632 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005633 } else {
5634 netdev_stats_to_stats64(storage, &dev->stats);
Eric Dumazet28172732010-07-07 14:58:56 -07005635 }
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005636 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
Eric Dumazet28172732010-07-07 14:58:56 -07005637 return storage;
Rusty Russellc45d2862007-03-28 14:29:08 -07005638}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005639EXPORT_SYMBOL(dev_get_stats);
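
/*
 * Example (illustrative sketch, not part of the original file):
 * dev_get_stats() fills the caller-supplied buffer and returns it, so an
 * on-stack struct is sufficient; readers such as the sysfs statistics
 * files follow the same pattern.
 *
 *	struct rtnl_link_stats64 temp;
 *	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 *
 *	pr_info("%s: %llu packets received\n", dev->name,
 *		(unsigned long long)stats->rx_packets);
 */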
Rusty Russellc45d2862007-03-28 14:29:08 -07005640
Eric Dumazet24824a02010-10-02 06:11:55 +00005641struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
David S. Millerdc2b4842008-07-08 17:18:23 -07005642{
Eric Dumazet24824a02010-10-02 06:11:55 +00005643 struct netdev_queue *queue = dev_ingress_queue(dev);
David S. Millerdc2b4842008-07-08 17:18:23 -07005644
Eric Dumazet24824a02010-10-02 06:11:55 +00005645#ifdef CONFIG_NET_CLS_ACT
5646 if (queue)
5647 return queue;
5648 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5649 if (!queue)
5650 return NULL;
5651 netdev_init_one_queue(dev, queue, NULL);
Eric Dumazet24824a02010-10-02 06:11:55 +00005652 queue->qdisc = &noop_qdisc;
5653 queue->qdisc_sleeping = &noop_qdisc;
5654 rcu_assign_pointer(dev->ingress_queue, queue);
5655#endif
5656 return queue;
David S. Millerbb949fb2008-07-08 16:55:56 -07005657}
5658
Linus Torvalds1da177e2005-04-16 15:20:36 -07005659/**
Tom Herbert36909ea2011-01-09 19:36:31 +00005660 * alloc_netdev_mqs - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005661 * @sizeof_priv: size of private data to allocate space for
5662 * @name: device name format string
5663 * @setup: callback to initialize device
Tom Herbert36909ea2011-01-09 19:36:31 +00005664 * @txqs: the number of TX subqueues to allocate
5665 * @rxqs: the number of RX subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005666 *
5667 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005668 * and performs basic initialization.  Also allocates subqueue structs
Tom Herbert36909ea2011-01-09 19:36:31 +00005669 * for each queue on the device.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005670 */
Tom Herbert36909ea2011-01-09 19:36:31 +00005671struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5672 void (*setup)(struct net_device *),
5673 unsigned int txqs, unsigned int rxqs)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005674{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005675 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07005676 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005677 struct net_device *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005678
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005679 BUG_ON(strlen(name) >= sizeof(dev->name));
5680
Tom Herbert36909ea2011-01-09 19:36:31 +00005681 if (txqs < 1) {
Tom Herbert55513fb2010-10-18 17:55:58 +00005682 pr_err("alloc_netdev: Unable to allocate device "
5683 "with zero queues.\n");
5684 return NULL;
5685 }
5686
Tom Herbert36909ea2011-01-09 19:36:31 +00005687#ifdef CONFIG_RPS
5688 if (rxqs < 1) {
5689 pr_err("alloc_netdev: Unable to allocate device "
5690 "with zero RX queues.\n");
5691 return NULL;
5692 }
5693#endif
5694
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005695 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005696 if (sizeof_priv) {
5697 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005698 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005699 alloc_size += sizeof_priv;
5700 }
5701 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005702 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005703
Paolo 'Blaisorblade' Giarrusso31380de2006-04-06 22:38:28 -07005704 p = kzalloc(alloc_size, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005705 if (!p) {
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005706 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005707 return NULL;
5708 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005709
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005710 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005711 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005712
Eric Dumazet29b44332010-10-11 10:22:12 +00005713 dev->pcpu_refcnt = alloc_percpu(int);
5714 if (!dev->pcpu_refcnt)
Tom Herberte6484932010-10-18 18:04:39 +00005715 goto free_p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005716
Linus Torvalds1da177e2005-04-16 15:20:36 -07005717 if (dev_addr_init(dev))
Eric Dumazet29b44332010-10-11 10:22:12 +00005718 goto free_pcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005719
Jiri Pirko22bedad32010-04-01 21:22:57 +00005720 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005721 dev_uc_init(dev);
Jiri Pirkoccffad252009-05-22 23:22:17 +00005722
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005723 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005724
Tom Herbert36909ea2011-01-09 19:36:31 +00005725 dev->num_tx_queues = txqs;
5726 dev->real_num_tx_queues = txqs;
Tom Herberted9af2e2010-11-09 10:47:30 +00005727 if (netif_alloc_netdev_queues(dev))
5728 goto free_pcpu;
David S. Millere8a04642008-07-17 00:34:19 -07005729
Eric Dumazetdf334542010-03-24 19:13:54 +00005730#ifdef CONFIG_RPS
Tom Herbert36909ea2011-01-09 19:36:31 +00005731 dev->num_rx_queues = rxqs;
5732 dev->real_num_rx_queues = rxqs;
Tom Herbertfe822242010-11-09 10:47:38 +00005733 if (netif_alloc_rx_queues(dev))
5734 goto free_pcpu;
Eric Dumazetdf334542010-03-24 19:13:54 +00005735#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00005736
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07005737 dev->gso_max_size = GSO_MAX_SIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005738
Peter P Waskiewicz Jr15682bc2010-02-10 20:03:05 -08005739 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5740 dev->ethtool_ntuple_list.count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08005741 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005742 INIT_LIST_HEAD(&dev->unreg_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00005743 INIT_LIST_HEAD(&dev->link_watch_list);
Eric Dumazet93f154b2009-05-18 22:19:19 -07005744 dev->priv_flags = IFF_XMIT_DST_RELEASE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005745 setup(dev);
5746 strcpy(dev->name, name);
Vlad Dogarucbda10f2011-01-13 23:38:30 +00005747 dev->group = INIT_NETDEV_GROUP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005748 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005749
Eric Dumazet29b44332010-10-11 10:22:12 +00005750free_pcpu:
5751 free_percpu(dev->pcpu_refcnt);
Tom Herberted9af2e2010-11-09 10:47:30 +00005752 kfree(dev->_tx);
Tom Herbertfe822242010-11-09 10:47:38 +00005753#ifdef CONFIG_RPS
5754 kfree(dev->_rx);
5755#endif
5756
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005757free_p:
5758 kfree(p);
5759 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005760}
Tom Herbert36909ea2011-01-09 19:36:31 +00005761EXPORT_SYMBOL(alloc_netdev_mqs);
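
/*
 * Example (illustrative sketch, not part of the original file): a
 * multiqueue Ethernet driver asking for eight TX and eight RX queues up
 * front.  "struct example_priv" is hypothetical; Ethernet drivers usually
 * go through the alloc_etherdev_mq() wrapper instead of calling this
 * directly.
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct example_priv), "eth%d",
 *			       ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */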
Linus Torvalds1da177e2005-04-16 15:20:36 -07005762
5763/**
5764 * free_netdev - free network device
5765 * @dev: device
5766 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005767 * This function does the last stage of destroying an allocated device
5768 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005769 * If this is the last reference then it will be freed.
5770 */
5771void free_netdev(struct net_device *dev)
5772{
Herbert Xud565b0a2008-12-15 23:38:52 -08005773 struct napi_struct *p, *n;
5774
Denis V. Lunevf3005d72008-04-16 02:02:18 -07005775 release_net(dev_net(dev));
5776
David S. Millere8a04642008-07-17 00:34:19 -07005777 kfree(dev->_tx);
Tom Herbertfe822242010-11-09 10:47:38 +00005778#ifdef CONFIG_RPS
5779 kfree(dev->_rx);
5780#endif
David S. Millere8a04642008-07-17 00:34:19 -07005781
Eric Dumazet24824a02010-10-02 06:11:55 +00005782 kfree(rcu_dereference_raw(dev->ingress_queue));
5783
Jiri Pirkof001fde2009-05-05 02:48:28 +00005784 /* Flush device addresses */
5785 dev_addr_flush(dev);
5786
Peter P Waskiewicz Jr15682bc2010-02-10 20:03:05 -08005787 /* Clear ethtool n-tuple list */
5788 ethtool_ntuple_flush(dev);
5789
Herbert Xud565b0a2008-12-15 23:38:52 -08005790 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5791 netif_napi_del(p);
5792
Eric Dumazet29b44332010-10-11 10:22:12 +00005793 free_percpu(dev->pcpu_refcnt);
5794 dev->pcpu_refcnt = NULL;
5795
Stephen Hemminger3041a062006-05-26 13:25:24 -07005796 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005797 if (dev->reg_state == NETREG_UNINITIALIZED) {
5798 kfree((char *)dev - dev->padded);
5799 return;
5800 }
5801
5802 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5803 dev->reg_state = NETREG_RELEASED;
5804
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07005805 /* will free via device release */
5806 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005807}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005808EXPORT_SYMBOL(free_netdev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005809
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005810/**
5811 * synchronize_net - Synchronize with packet receive processing
5812 *
5813 * Wait for packets currently being received to be done.
5814 * Does not block later packets from starting.
5815 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005816void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005817{
5818 might_sleep();
Paul E. McKenneyfbd568a3e2005-05-01 08:59:04 -07005819 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005820}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005821EXPORT_SYMBOL(synchronize_net);
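
/*
 * Example (illustrative sketch, not part of the original file): the usual
 * pattern around synchronize_net() is "unpublish, wait, free" for data
 * reached from the receive path under rcu_read_lock().  The lock, list and
 * entry names are hypothetical.
 *
 *	spin_lock(&example_lock);
 *	list_del_rcu(&entry->list);
 *	spin_unlock(&example_lock);
 *
 *	Once no new reader can find the entry, wait out the old ones:
 *
 *	synchronize_net();
 *	kfree(entry);
 */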
Linus Torvalds1da177e2005-04-16 15:20:36 -07005822
5823/**
Eric Dumazet44a08732009-10-27 07:03:04 +00005824 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07005825 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00005826 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08005827 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07005828 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005829 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00005830 *	If @head is not NULL, the device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005831 *
5832 * Callers must hold the rtnl semaphore. You may want
5833 * unregister_netdev() instead of this.
5834 */
5835
Eric Dumazet44a08732009-10-27 07:03:04 +00005836void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005837{
Herbert Xua6620712007-12-12 19:21:56 -08005838 ASSERT_RTNL();
5839
Eric Dumazet44a08732009-10-27 07:03:04 +00005840 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005841 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00005842 } else {
5843 rollback_registered(dev);
5844 /* Finish processing unregister after unlock */
5845 net_set_todo(dev);
5846 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005847}
Eric Dumazet44a08732009-10-27 07:03:04 +00005848EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005849
5850/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005851 * unregister_netdevice_many - unregister many devices
5852 * @head: list of devices
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005853 */
5854void unregister_netdevice_many(struct list_head *head)
5855{
5856 struct net_device *dev;
5857
5858 if (!list_empty(head)) {
5859 rollback_registered_many(head);
5860 list_for_each_entry(dev, head, unreg_list)
5861 net_set_todo(dev);
5862 }
5863}
Eric Dumazet63c80992009-10-27 07:06:49 +00005864EXPORT_SYMBOL(unregister_netdevice_many);
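
/*
 * Example (illustrative sketch, not part of the original file): batching
 * several unregisters under a single RTNL hold, as rtnl_link ->dellink()
 * handlers and the netns cleanup path do, is much cheaper than tearing the
 * devices down one at a time.  "example_owns()" is a hypothetical
 * predicate.
 *
 *	LIST_HEAD(kill_list);
 *	struct net_device *dev, *tmp;
 *
 *	rtnl_lock();
 *	for_each_netdev_safe(net, dev, tmp)
 *		if (example_owns(dev))
 *			unregister_netdevice_queue(dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */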
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005865
5866/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005867 * unregister_netdev - remove device from the kernel
5868 * @dev: device
5869 *
5870 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005871 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005872 *
5873 * This is just a wrapper for unregister_netdevice that takes
5874 * the rtnl semaphore. In general you want to use this and not
5875 * unregister_netdevice.
5876 */
5877void unregister_netdev(struct net_device *dev)
5878{
5879 rtnl_lock();
5880 unregister_netdevice(dev);
5881 rtnl_unlock();
5882}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005883EXPORT_SYMBOL(unregister_netdev);
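
/*
 * Example (illustrative sketch, not part of the original file): the
 * module_exit() counterpart of the register_netdev() example above.
 * unregister_netdev() takes the RTNL lock itself; free_netdev() must only
 * be called afterwards.
 *
 *	static void __exit example_exit(void)
 *	{
 *		unregister_netdev(example_dev);
 *		free_netdev(example_dev);
 *	}
 */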
5884
Eric W. Biedermance286d32007-09-12 13:53:49 +02005885/**
5886	 *	dev_change_net_namespace - move device to a different network namespace
5887 * @dev: device
5888 * @net: network namespace
5889 * @pat: If not NULL name pattern to try if the current device name
5890 * is already taken in the destination network namespace.
5891 *
5892 * This function shuts down a device interface and moves it
5893 * to a new network namespace. On success 0 is returned, on
5894	 *	a failure a negative errno code is returned.
5895 *
5896 * Callers must hold the rtnl semaphore.
5897 */
5898
5899int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5900{
Eric W. Biedermance286d32007-09-12 13:53:49 +02005901 int err;
5902
5903 ASSERT_RTNL();
5904
5905 /* Don't allow namespace local devices to be moved. */
5906 err = -EINVAL;
5907 if (dev->features & NETIF_F_NETNS_LOCAL)
5908 goto out;
5909
5910	/* Ensure the device has been registered */
5911 err = -EINVAL;
5912 if (dev->reg_state != NETREG_REGISTERED)
5913 goto out;
5914
5915	/* Get out if there is nothing to do */
5916 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09005917 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005918 goto out;
5919
5920 /* Pick the destination device name, and ensure
5921 * we can use it in the destination network namespace.
5922 */
5923 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00005924 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005925 /* We get here if we can't use the current device name */
5926 if (!pat)
5927 goto out;
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00005928 if (dev_get_valid_name(dev, pat, 1))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005929 goto out;
5930 }
5931
5932 /*
5933	 * And now a mini version of register_netdevice and unregister_netdevice.
5934 */
5935
5936 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07005937 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005938
5939 /* And unlink it from device chain */
5940 err = -ENODEV;
5941 unlist_netdevice(dev);
5942
5943 synchronize_net();
5944
5945 /* Shutdown queueing discipline. */
5946 dev_shutdown(dev);
5947
5948	/* Notify protocols that we are about to destroy
5949	   this device. They should clean up all their state.
David Lamparter3b27e102010-09-17 03:22:19 +00005950
5951 Note that dev->reg_state stays at NETREG_REGISTERED.
5952 This is wanted because this way 8021q and macvlan know
5953 the device is just moving and can keep their slaves up.
Eric W. Biedermance286d32007-09-12 13:53:49 +02005954 */
5955 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005956 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005957
5958 /*
5959 * Flush the unicast and multicast chains
5960 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005961 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005962 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005963
5964 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005965 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005966
Eric W. Biedermance286d32007-09-12 13:53:49 +02005967 /* If there is an ifindex conflict assign a new one */
5968 if (__dev_get_by_index(net, dev->ifindex)) {
5969 int iflink = (dev->iflink == dev->ifindex);
5970 dev->ifindex = dev_new_index(net);
5971 if (iflink)
5972 dev->iflink = dev->ifindex;
5973 }
5974
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005975 /* Fixup kobjects */
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07005976 err = device_rename(&dev->dev, dev->name);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005977 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005978
5979 /* Add the device back in the hashes */
5980 list_netdevice(dev);
5981
5982	/* Notify protocols that a new device appeared. */
5983 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5984
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005985 /*
5986 * Prevent userspace races by waiting until the network
5987 * device is fully setup before sending notifications.
5988 */
5989 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5990
Eric W. Biedermance286d32007-09-12 13:53:49 +02005991 synchronize_net();
5992 err = 0;
5993out:
5994 return err;
5995}
Johannes Berg463d0182009-07-14 00:33:35 +02005996EXPORT_SYMBOL_GPL(dev_change_net_namespace);
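
/*
 * Example (illustrative sketch, not part of the original file): moving a
 * device into another namespace, with a fallback name pattern in case
 * dev->name is already taken there.  How "net" was obtained (e.g. via
 * get_net_ns_by_pid()) is left out.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, net, "eth%d");
 *	rtnl_unlock();
 */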
Eric W. Biedermance286d32007-09-12 13:53:49 +02005997
Linus Torvalds1da177e2005-04-16 15:20:36 -07005998static int dev_cpu_callback(struct notifier_block *nfb,
5999 unsigned long action,
6000 void *ocpu)
6001{
6002 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006003 struct sk_buff *skb;
6004 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6005 struct softnet_data *sd, *oldsd;
6006
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006007 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006008 return NOTIFY_OK;
6009
6010 local_irq_disable();
6011 cpu = smp_processor_id();
6012 sd = &per_cpu(softnet_data, cpu);
6013 oldsd = &per_cpu(softnet_data, oldcpu);
6014
6015 /* Find end of our completion_queue. */
6016 list_skb = &sd->completion_queue;
6017 while (*list_skb)
6018 list_skb = &(*list_skb)->next;
6019 /* Append completion queue from offline CPU. */
6020 *list_skb = oldsd->completion_queue;
6021 oldsd->completion_queue = NULL;
6022
Linus Torvalds1da177e2005-04-16 15:20:36 -07006023 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00006024 if (oldsd->output_queue) {
6025 *sd->output_queue_tailp = oldsd->output_queue;
6026 sd->output_queue_tailp = oldsd->output_queue_tailp;
6027 oldsd->output_queue = NULL;
6028 oldsd->output_queue_tailp = &oldsd->output_queue;
6029 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006030
6031 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6032 local_irq_enable();
6033
6034 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00006035 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6036 netif_rx(skb);
6037 input_queue_head_incr(oldsd);
6038 }
Tom Herbertfec5e652010-04-16 16:01:27 -07006039 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006040 netif_rx(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00006041 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07006042 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006043
6044 return NOTIFY_OK;
6045}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006046
6047
Herbert Xu7f353bf2007-08-10 15:47:58 -07006048/**
Herbert Xub63365a2008-10-23 01:11:29 -07006049 * netdev_increment_features - increment feature set by one
6050 * @all: current feature set
6051 * @one: new feature set
6052 * @mask: mask feature set
Herbert Xu7f353bf2007-08-10 15:47:58 -07006053 *
6054 * Computes a new feature set after adding a device with feature set
Herbert Xub63365a2008-10-23 01:11:29 -07006055 * @one to the master device with current feature set @all. Will not
6056 * enable anything that is off in @mask. Returns the new feature set.
Herbert Xu7f353bf2007-08-10 15:47:58 -07006057 */
Herbert Xub63365a2008-10-23 01:11:29 -07006058unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6059 unsigned long mask)
Herbert Xu7f353bf2007-08-10 15:47:58 -07006060{
Herbert Xub63365a2008-10-23 01:11:29 -07006061 /* If device needs checksumming, downgrade to it. */
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006062 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
Herbert Xub63365a2008-10-23 01:11:29 -07006063 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6064 else if (mask & NETIF_F_ALL_CSUM) {
6065 /* If one device supports v4/v6 checksumming, set for all. */
6066 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6067 !(all & NETIF_F_GEN_CSUM)) {
6068 all &= ~NETIF_F_ALL_CSUM;
6069 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6070 }
Herbert Xu7f353bf2007-08-10 15:47:58 -07006071
Herbert Xub63365a2008-10-23 01:11:29 -07006072 /* If one device supports hw checksumming, set for all. */
6073 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6074 all &= ~NETIF_F_ALL_CSUM;
6075 all |= NETIF_F_HW_CSUM;
6076 }
6077 }
Herbert Xu7f353bf2007-08-10 15:47:58 -07006078
Herbert Xub63365a2008-10-23 01:11:29 -07006079 one |= NETIF_F_ALL_CSUM;
Herbert Xu7f353bf2007-08-10 15:47:58 -07006080
Herbert Xub63365a2008-10-23 01:11:29 -07006081 one |= all & NETIF_F_ONE_FOR_ALL;
Sridhar Samudralad9f59502009-10-07 12:24:25 +00006082 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
Herbert Xub63365a2008-10-23 01:11:29 -07006083 all |= one & mask & NETIF_F_ONE_FOR_ALL;
Herbert Xu7f353bf2007-08-10 15:47:58 -07006084
6085 return all;
6086}
Herbert Xub63365a2008-10-23 01:11:29 -07006087EXPORT_SYMBOL(netdev_increment_features);
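
/*
 * Example (illustrative sketch, not part of the original file): a master
 * device (bonding/bridge style) folding in its slaves' feature sets one at
 * a time.  The starting value, list walk and mask shown here are
 * illustrative rather than copied from any in-tree driver.
 *
 *	unsigned long features = NETIF_F_ALL_CSUM | NETIF_F_SG;
 *	struct example_slave *slave;
 *
 *	list_for_each_entry(slave, &priv->slaves, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     NETIF_F_ONE_FOR_ALL);
 *	master->features = netdev_fix_features(features, master->name);
 */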
Herbert Xu7f353bf2007-08-10 15:47:58 -07006088
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006089static struct hlist_head *netdev_create_hash(void)
6090{
6091 int i;
6092 struct hlist_head *hash;
6093
6094 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6095 if (hash != NULL)
6096 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6097 INIT_HLIST_HEAD(&hash[i]);
6098
6099 return hash;
6100}
6101
Eric W. Biederman881d9662007-09-17 11:56:21 -07006102/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07006103static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006104{
Eric W. Biederman881d9662007-09-17 11:56:21 -07006105 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07006106
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006107 net->dev_name_head = netdev_create_hash();
6108 if (net->dev_name_head == NULL)
6109 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006110
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006111 net->dev_index_head = netdev_create_hash();
6112 if (net->dev_index_head == NULL)
6113 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006114
6115 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006116
6117err_idx:
6118 kfree(net->dev_name_head);
6119err_name:
6120 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006121}
6122
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006123/**
6124 * netdev_drivername - network driver for the device
6125 * @dev: network device
6126 * @buffer: buffer for resulting name
6127 * @len: size of buffer
6128 *
6129 * Determine network driver for device.
6130 */
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07006131char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
Arjan van de Ven6579e572008-07-21 13:31:48 -07006132{
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07006133 const struct device_driver *driver;
6134 const struct device *parent;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006135
6136 if (len <= 0 || !buffer)
6137 return buffer;
6138 buffer[0] = 0;
6139
6140 parent = dev->dev.parent;
6141
6142 if (!parent)
6143 return buffer;
6144
6145 driver = parent->driver;
6146 if (driver && driver->name)
6147 strlcpy(buffer, driver->name, len);
6148 return buffer;
6149}
6150
Joe Perches256df2f2010-06-27 01:02:35 +00006151static int __netdev_printk(const char *level, const struct net_device *dev,
6152 struct va_format *vaf)
6153{
6154 int r;
6155
6156 if (dev && dev->dev.parent)
6157 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6158 netdev_name(dev), vaf);
6159 else if (dev)
6160 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6161 else
6162 r = printk("%s(NULL net_device): %pV", level, vaf);
6163
6164 return r;
6165}
6166
6167int netdev_printk(const char *level, const struct net_device *dev,
6168 const char *format, ...)
6169{
6170 struct va_format vaf;
6171 va_list args;
6172 int r;
6173
6174 va_start(args, format);
6175
6176 vaf.fmt = format;
6177 vaf.va = &args;
6178
6179 r = __netdev_printk(level, dev, &vaf);
6180 va_end(args);
6181
6182 return r;
6183}
6184EXPORT_SYMBOL(netdev_printk);
6185
6186#define define_netdev_printk_level(func, level) \
6187int func(const struct net_device *dev, const char *fmt, ...) \
6188{ \
6189 int r; \
6190 struct va_format vaf; \
6191 va_list args; \
6192 \
6193 va_start(args, fmt); \
6194 \
6195 vaf.fmt = fmt; \
6196 vaf.va = &args; \
6197 \
6198 r = __netdev_printk(level, dev, &vaf); \
6199 va_end(args); \
6200 \
6201 return r; \
6202} \
6203EXPORT_SYMBOL(func);
6204
6205define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6206define_netdev_printk_level(netdev_alert, KERN_ALERT);
6207define_netdev_printk_level(netdev_crit, KERN_CRIT);
6208define_netdev_printk_level(netdev_err, KERN_ERR);
6209define_netdev_printk_level(netdev_warn, KERN_WARNING);
6210define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6211define_netdev_printk_level(netdev_info, KERN_INFO);
6212
Pavel Emelyanov46650792007-10-08 20:38:39 -07006213static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006214{
6215 kfree(net->dev_name_head);
6216 kfree(net->dev_index_head);
6217}
6218
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006219static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07006220 .init = netdev_init,
6221 .exit = netdev_exit,
6222};
6223
Pavel Emelyanov46650792007-10-08 20:38:39 -07006224static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02006225{
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006226 struct net_device *dev, *aux;
Eric W. Biedermance286d32007-09-12 13:53:49 +02006227 /*
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006228 * Push all migratable network devices back to the
Eric W. Biedermance286d32007-09-12 13:53:49 +02006229 * initial network namespace
6230 */
6231 rtnl_lock();
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006232 for_each_netdev_safe(net, dev, aux) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006233 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006234 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02006235
6236		/* Ignore unmovable devices (e.g. loopback) */
6237 if (dev->features & NETIF_F_NETNS_LOCAL)
6238 continue;
6239
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006240 /* Leave virtual devices for the generic cleanup */
6241 if (dev->rtnl_link_ops)
6242 continue;
Eric W. Biedermand0c082c2008-11-05 15:59:38 -08006243
Eric W. Biedermance286d32007-09-12 13:53:49 +02006244		/* Push remaining network devices to init_net */
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006245 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6246 err = dev_change_net_namespace(dev, &init_net, fb_name);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006247 if (err) {
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006248 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
Eric W. Biedermance286d32007-09-12 13:53:49 +02006249 __func__, dev->name, err);
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006250 BUG();
Eric W. Biedermance286d32007-09-12 13:53:49 +02006251 }
6252 }
6253 rtnl_unlock();
6254}
6255
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006256static void __net_exit default_device_exit_batch(struct list_head *net_list)
6257{
6258	/* At exit all network devices must be removed from a network
Uwe Kleine-Königb5950762010-11-01 15:38:34 -04006259 * namespace. Do this in the reverse order of registration.
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006260 * Do this across as many network namespaces as possible to
6261 * improve batching efficiency.
6262 */
6263 struct net_device *dev;
6264 struct net *net;
6265 LIST_HEAD(dev_kill_list);
6266
6267 rtnl_lock();
6268 list_for_each_entry(net, net_list, exit_list) {
6269 for_each_netdev_reverse(net, dev) {
6270 if (dev->rtnl_link_ops)
6271 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6272 else
6273 unregister_netdevice_queue(dev, &dev_kill_list);
6274 }
6275 }
6276 unregister_netdevice_many(&dev_kill_list);
6277 rtnl_unlock();
6278}
6279
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006280static struct pernet_operations __net_initdata default_device_ops = {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006281 .exit = default_device_exit,
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006282 .exit_batch = default_device_exit_batch,
Eric W. Biedermance286d32007-09-12 13:53:49 +02006283};
6284
Linus Torvalds1da177e2005-04-16 15:20:36 -07006285/*
6286 * Initialize the DEV module. At boot time this walks the device list and
6287 * unhooks any devices that fail to initialise (normally hardware not
6288 * present) and leaves us with a valid list of present and active devices.
6289 *
6290 */
6291
6292/*
6293 * This is called single threaded during boot, so no need
6294 * to take the rtnl semaphore.
6295 */
6296static int __init net_dev_init(void)
6297{
6298 int i, rc = -ENOMEM;
6299
6300 BUG_ON(!dev_boot_phase);
6301
Linus Torvalds1da177e2005-04-16 15:20:36 -07006302 if (dev_proc_init())
6303 goto out;
6304
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006305 if (netdev_kobject_init())
Linus Torvalds1da177e2005-04-16 15:20:36 -07006306 goto out;
6307
6308 INIT_LIST_HEAD(&ptype_all);
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08006309 for (i = 0; i < PTYPE_HASH_SIZE; i++)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006310 INIT_LIST_HEAD(&ptype_base[i]);
6311
Eric W. Biederman881d9662007-09-17 11:56:21 -07006312 if (register_pernet_subsys(&netdev_net_ops))
6313 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006314
6315 /*
6316 * Initialise the packet receive queues.
6317 */
6318
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07006319 for_each_possible_cpu(i) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006320 struct softnet_data *sd = &per_cpu(softnet_data, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006321
Changli Gaodee42872010-05-02 05:42:16 +00006322 memset(sd, 0, sizeof(*sd));
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006323 skb_queue_head_init(&sd->input_pkt_queue);
Changli Gao6e7676c2010-04-27 15:07:33 -07006324 skb_queue_head_init(&sd->process_queue);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006325 sd->completion_queue = NULL;
6326 INIT_LIST_HEAD(&sd->poll_list);
Changli Gaoa9cbd582010-04-26 23:06:24 +00006327 sd->output_queue = NULL;
6328 sd->output_queue_tailp = &sd->output_queue;
Eric Dumazetdf334542010-03-24 19:13:54 +00006329#ifdef CONFIG_RPS
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006330 sd->csd.func = rps_trigger_softirq;
6331 sd->csd.info = sd;
6332 sd->csd.flags = 0;
6333 sd->cpu = i;
Tom Herbert1e94d722010-03-18 17:45:44 -07006334#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00006335
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006336 sd->backlog.poll = process_backlog;
6337 sd->backlog.weight = weight_p;
6338 sd->backlog.gro_list = NULL;
6339 sd->backlog.gro_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006340 }
6341
Linus Torvalds1da177e2005-04-16 15:20:36 -07006342 dev_boot_phase = 0;
6343
Eric W. Biederman505d4f72008-11-07 22:54:20 -08006344	/* The loopback device is special: if any other network device
6345	 * is present in a network namespace, the loopback device must
6346	 * be present. Since we now dynamically allocate and free the
6347	 * loopback device, ensure this invariant is maintained by
6348	 * keeping the loopback device as the first device on the
6349	 * list of network devices, ensuring the loopback device
6350	 * is the first device that appears and the last network device
6351	 * that disappears.
6352 */
6353 if (register_pernet_device(&loopback_net_ops))
6354 goto out;
6355
6356 if (register_pernet_device(&default_device_ops))
6357 goto out;
6358
Carlos R. Mafra962cf362008-05-15 11:15:37 -03006359 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6360 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006361
6362 hotcpu_notifier(dev_cpu_callback, 0);
6363 dst_init();
6364 dev_mcast_init();
6365 rc = 0;
6366out:
6367 return rc;
6368}
6369
6370subsys_initcall(net_dev_init);
6371
Krishna Kumare88721f2009-02-18 17:55:02 -08006372static int __init initialize_hashrnd(void)
6373{
Tom Herbert0a9627f2010-03-16 08:03:29 +00006374 get_random_bytes(&hashrnd, sizeof(hashrnd));
Krishna Kumare88721f2009-02-18 17:55:02 -08006375 return 0;
6376}
6377
6378late_initcall_sync(initialize_hashrnd);
6379