/*
 * 	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 * 		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain :	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *	       sure which should go first, but I bet it won't make much
 *	       difference if we are running VLANs.  The good news is that
 *	       this protocol won't be in the list unless compiled in, so
 *	       the average user (w/out VLANs) will not be adversely affected.
 *	       --BLG
 *
 *		0800	IP
 *		8100	802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

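/*
 * Example sketch (hypothetical, not used by the core): a pure reader can
 * walk the device list under rcu_read_lock() as described above, taking no
 * references and never blocking.
 */
static void __maybe_unused example_count_up_devices(struct net *net)
{
	struct net_device *dev;
	int up = 0;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->flags & IFF_UP)
			up++;	/* dev is only valid inside this RCU section */
	rcu_read_unlock();

	pr_debug("%d devices are up\n", up);
}
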
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it can not guarantee that all
 *	CPUs that are in the middle of receiving packets will see the new
 *	packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

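/*
 * Example sketch (hypothetical): how a tap module might register and later
 * unregister a packet handler with dev_add_pack()/dev_remove_pack().  The
 * handler name and structure below are illustrative only.
 */
static int example_pkt_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* A tap receives its own reference to the skb; consume it here. */
	kfree_skb(skb);
	return 0;
}

static struct packet_type example_ptype __maybe_unused = {
	.type = cpu_to_be16(ETH_P_ALL),	/* ETH_P_ALL handlers go on the ptype_all tap list */
	.func = example_pkt_rcv,
};

/*
 * A module would call dev_add_pack(&example_ptype) from its init path and
 * dev_remove_pack(&example_ptype) on exit; only after dev_remove_pack()
 * returns may the structure be freed.
 */
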
/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

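/*
 * Example sketch (hypothetical values): from the parsing above, a boot
 * entry has the form
 *
 *	netdev=irq,base_addr,mem_start,mem_end,name
 *
 * so booting with "netdev=9,0x300,0,0,eth0" would record IRQ 9 and I/O
 * base 0x300 for eth0, to be picked up later by netdev_boot_setup_check().
 */
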
/*******************************************************************************

			Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

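/*
 * Example sketch (hypothetical helper): a typical refcounted lookup using
 * dev_get_by_name().  The handle may be used from any context but must be
 * released with dev_put() when no longer needed.
 */
static bool __maybe_unused example_is_loopback_up(struct net *net)
{
	struct net_device *dev;
	bool up = false;

	dev = dev_get_by_name(net, "lo");
	if (dev) {
		up = !!(dev->flags & IFF_UP);
		dev_put(dev);	/* drop the reference taken by the lookup */
	}
	return up;
}
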
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

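/*
 * Example sketch (hypothetical helper): looking up an Ethernet device by
 * MAC address under the RCU-only contract documented above.  No reference
 * is taken, so the result is only used inside the RCU section.
 */
static bool __maybe_unused example_mac_is_local(struct net *net, const char *mac)
{
	struct net_device *dev;
	bool found;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	found = dev != NULL;
	rcu_read_unlock();

	return found;
}
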
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
					unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

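/*
 * Example sketch (hypothetical helper): finding any running device with
 * dev_get_by_flags_rcu().  The result carries no reference, so it is only
 * dereferenced inside the rcu_read_lock() section.
 */
static int __maybe_unused example_first_up_ifindex(struct net *net)
{
	struct net_device *dev;
	int ifindex = 0;

	rcu_read_lock();
	dev = dev_get_by_flags_rcu(net, IFF_UP, IFF_UP);
	if (dev)
		ifindex = dev->ifindex;
	rcu_read_unlock();

	return ifindex;
}
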
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

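/*
 * Example sketch (hypothetical helper and name pattern): a driver-style use
 * of dev_alloc_name() under RTNL, picking the first free "foo%d" slot for a
 * freshly allocated net_device.  Most drivers simply leave a format string
 * in dev->name and let register_netdevice() do this implicitly.
 */
static int __maybe_unused example_name_device(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_alloc_name(dev, "foo%d");	/* e.g. becomes "foo0" */
	rtnl_unlock();

	return err < 0 ? err : 0;
}
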
static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
{
	struct net *net;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return dev_alloc_name(dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device. A format string such as "eth%d" can be
 *	passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}

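/*
 * Example sketch (hypothetical helper and name): renaming an interface from
 * kernel code, as an ioctl handler would after validating the request.  The
 * "%d" in the new name lets the core pick a free unit number.
 */
static int __maybe_unused example_rename_device(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_name(dev, "uplink%d");
	rtnl_unlock();

	return err < 0 ? err : 0;
}
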
/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *	dev_load 	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Enable NET_DMA
		 */
		net_dmaengine_get();

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/*
	 *	Is it already up?
	 */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Open device
	 */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/*
	 *	... and announce new interface.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

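/*
 * Example sketch (hypothetical helper): administratively bouncing an
 * interface from kernel code.  Both dev_open() and dev_close() must be
 * called with RTNL held.
 */
static int __maybe_unused example_bounce_device(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);		/* no-op if the device is already up */
	if (!err)
		err = dev_close(dev);	/* sends NETDEV_GOING_DOWN / NETDEV_DOWN */
	rtnl_unlock();

	return err;
}
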
Octavian Purdila44345722010-12-13 12:44:07 +00001225static int __dev_close_many(struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001226{
Octavian Purdila44345722010-12-13 12:44:07 +00001227 struct net_device *dev;
Patrick McHardybd380812010-02-26 06:34:53 +00001228
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001229 ASSERT_RTNL();
David S. Miller9d5010d2007-09-12 14:33:25 +02001230 might_sleep();
1231
Octavian Purdila44345722010-12-13 12:44:07 +00001232 list_for_each_entry(dev, head, unreg_list) {
1233 /*
1234 * Tell people we are going down, so that they can
1235 * prepare to death, when device is still operating.
1236 */
1237 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001238
Octavian Purdila44345722010-12-13 12:44:07 +00001239 clear_bit(__LINK_STATE_START, &dev->state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001240
Octavian Purdila44345722010-12-13 12:44:07 +00001241 /* Synchronize to scheduled poll. We cannot touch poll list, it
1242 * can be even on different cpu. So just clear netif_running().
1243 *
1244 * dev->stop() will invoke napi_disable() on all of it's
1245 * napi_struct instances on this device.
1246 */
1247 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001249
Octavian Purdila44345722010-12-13 12:44:07 +00001250 dev_deactivate_many(head);
1251
1252 list_for_each_entry(dev, head, unreg_list) {
1253 const struct net_device_ops *ops = dev->netdev_ops;
1254
1255 /*
1256 * Call the device specific close. This cannot fail.
1257 * Only if device is UP
1258 *
1259 * We allow it to be called even after a DETACH hot-plug
1260 * event.
1261 */
1262 if (ops->ndo_stop)
1263 ops->ndo_stop(dev);
1264
1265 /*
1266 * Device is now down.
1267 */
1268
1269 dev->flags &= ~IFF_UP;
1270
1271 /*
1272 * Shutdown NET_DMA
1273 */
1274 net_dmaengine_put();
1275 }
1276
1277 return 0;
1278}
1279
1280static int __dev_close(struct net_device *dev)
1281{
1282 LIST_HEAD(single);
1283
1284 list_add(&dev->unreg_list, &single);
1285 return __dev_close_many(&single);
1286}
1287
1288int dev_close_many(struct list_head *head)
1289{
1290 struct net_device *dev, *tmp;
1291 LIST_HEAD(tmp_list);
1292
1293 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1294 if (!(dev->flags & IFF_UP))
1295 list_move(&dev->unreg_list, &tmp_list);
1296
1297 __dev_close_many(head);
Matti Linnanvuorid8b2a4d2008-02-12 23:10:11 -08001298
Linus Torvalds1da177e2005-04-16 15:20:36 -07001299 /*
Octavian Purdila44345722010-12-13 12:44:07 +00001300 * Tell people we are down
Linus Torvalds1da177e2005-04-16 15:20:36 -07001301 */
Octavian Purdila44345722010-12-13 12:44:07 +00001302 list_for_each_entry(dev, head, unreg_list) {
1303 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1304 call_netdevice_notifiers(NETDEV_DOWN, dev);
1305 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001306
Octavian Purdila44345722010-12-13 12:44:07 +00001307 /* rollback_registered_many needs the complete original list */
1308 list_splice(&tmp_list, head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001309 return 0;
1310}
Patrick McHardybd380812010-02-26 06:34:53 +00001311
1312/**
1313 * dev_close - shutdown an interface.
1314 * @dev: device to shutdown
1315 *
1316 * This function moves an active device into down state. A
1317 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1318 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1319 * chain.
1320 */
1321int dev_close(struct net_device *dev)
1322{
Octavian Purdila44345722010-12-13 12:44:07 +00001323 LIST_HEAD(single);
Patrick McHardybd380812010-02-26 06:34:53 +00001324
Octavian Purdila44345722010-12-13 12:44:07 +00001325 list_add(&dev->unreg_list, &single);
1326 dev_close_many(&single);
Patrick McHardybd380812010-02-26 06:34:53 +00001327
1328 return 0;
1329}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001330EXPORT_SYMBOL(dev_close);
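/*
 * Example (illustrative sketch): dev_close() must be called with the RTNL
 * held.  A caller that looks the device up by name could shut it down like
 * this; the "eth0" name is only an example.
 *
 *	struct net_device *dev;
 *
 *	rtnl_lock();
 *	dev = __dev_get_by_name(&init_net, "eth0");
 *	if (dev)
 *		dev_close(dev);
 *	rtnl_unlock();
 */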
Linus Torvalds1da177e2005-04-16 15:20:36 -07001331
1332
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001333/**
1334 * dev_disable_lro - disable Large Receive Offload on a device
1335 * @dev: device
1336 *
1337 * Disable Large Receive Offload (LRO) on a net device. Must be
1338 * called under RTNL. This is needed if received packets may be
1339 * forwarded to another interface.
1340 */
1341void dev_disable_lro(struct net_device *dev)
1342{
1343 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1344 dev->ethtool_ops->set_flags) {
1345 u32 flags = dev->ethtool_ops->get_flags(dev);
1346 if (flags & ETH_FLAG_LRO) {
1347 flags &= ~ETH_FLAG_LRO;
1348 dev->ethtool_ops->set_flags(dev, flags);
1349 }
1350 }
1351 WARN_ON(dev->features & NETIF_F_LRO);
1352}
1353EXPORT_SYMBOL(dev_disable_lro);
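/*
 * Example (illustrative sketch): a control path that turns @dev into a
 * forwarding interface and already holds the RTNL would simply do:
 *
 *	ASSERT_RTNL();
 *	dev_disable_lro(dev);
 */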
1354
1355
Eric W. Biederman881d9662007-09-17 11:56:21 -07001356static int dev_boot_phase = 1;
1357
Linus Torvalds1da177e2005-04-16 15:20:36 -07001358/*
1359 * Device change register/unregister. These are not inline or static
1360 * as we export them to the world.
1361 */
1362
1363/**
1364 * register_netdevice_notifier - register a network notifier block
1365 * @nb: notifier
1366 *
1367 * Register a notifier to be called when network device events occur.
1368 * The notifier passed is linked into the kernel structures and must
1369 * not be reused until it has been unregistered. A negative errno code
1370 * is returned on a failure.
1371 *
 1372 * When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001373 * to the new notifier to give it a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001374 * view of the network device list.
1375 */
1376
1377int register_netdevice_notifier(struct notifier_block *nb)
1378{
1379 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001380 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001381 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001382 int err;
1383
1384 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001385 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001386 if (err)
1387 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001388 if (dev_boot_phase)
1389 goto unlock;
1390 for_each_net(net) {
1391 for_each_netdev(net, dev) {
1392 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1393 err = notifier_to_errno(err);
1394 if (err)
1395 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001396
Eric W. Biederman881d9662007-09-17 11:56:21 -07001397 if (!(dev->flags & IFF_UP))
1398 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001399
Eric W. Biederman881d9662007-09-17 11:56:21 -07001400 nb->notifier_call(nb, NETDEV_UP, dev);
1401 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001402 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001403
1404unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001405 rtnl_unlock();
1406 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001407
1408rollback:
1409 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001410 for_each_net(net) {
1411 for_each_netdev(net, dev) {
1412 if (dev == last)
1413 break;
Herbert Xufcc5a032007-07-30 17:03:38 -07001414
Eric W. Biederman881d9662007-09-17 11:56:21 -07001415 if (dev->flags & IFF_UP) {
1416 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1417 nb->notifier_call(nb, NETDEV_DOWN, dev);
1418 }
1419 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00001420 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001421 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001422 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001423
1424 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001425 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001426}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001427EXPORT_SYMBOL(register_netdevice_notifier);
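/*
 * Example (illustrative sketch): a typical user defines a notifier_block
 * and registers it once, usually at module init.  The names used below
 * are hypothetical.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *			break;
 *		case NETDEV_DOWN:
 *			printk(KERN_INFO "%s is down\n", dev->name);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_netdev_notifier = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&example_netdev_notifier);
 */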
Linus Torvalds1da177e2005-04-16 15:20:36 -07001428
1429/**
1430 * unregister_netdevice_notifier - unregister a network notifier block
1431 * @nb: notifier
1432 *
1433 * Unregister a notifier previously registered by
 1434 * register_netdevice_notifier(). The notifier is unlinked from the
1435 * kernel structures and may then be reused. A negative errno code
1436 * is returned on a failure.
1437 */
1438
1439int unregister_netdevice_notifier(struct notifier_block *nb)
1440{
Herbert Xu9f514952006-03-25 01:24:25 -08001441 int err;
1442
1443 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001444 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xu9f514952006-03-25 01:24:25 -08001445 rtnl_unlock();
1446 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001447}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001448EXPORT_SYMBOL(unregister_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001449
1450/**
1451 * call_netdevice_notifiers - call all network notifier blocks
1452 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001453 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454 *
1455 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001456 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001457 */
1458
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001459int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001460{
Jiri Pirkoab930472010-04-20 01:45:37 -07001461 ASSERT_RTNL();
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001462 return raw_notifier_call_chain(&netdev_chain, val, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001463}
1464
1465/* When > 0 there are consumers of rx skb time stamps */
1466static atomic_t netstamp_needed = ATOMIC_INIT(0);
1467
1468void net_enable_timestamp(void)
1469{
1470 atomic_inc(&netstamp_needed);
1471}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001472EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001473
1474void net_disable_timestamp(void)
1475{
1476 atomic_dec(&netstamp_needed);
1477}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001478EXPORT_SYMBOL(net_disable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479
Eric Dumazet3b098e22010-05-15 23:57:10 -07001480static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001481{
1482 if (atomic_read(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001483 __net_timestamp(skb);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001484 else
1485 skb->tstamp.tv64 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001486}
1487
Eric Dumazet3b098e22010-05-15 23:57:10 -07001488static inline void net_timestamp_check(struct sk_buff *skb)
1489{
1490 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1491 __net_timestamp(skb);
1492}
1493
Arnd Bergmann44540962009-11-26 06:07:08 +00001494/**
1495 * dev_forward_skb - loopback an skb to another netif
1496 *
1497 * @dev: destination network device
1498 * @skb: buffer to forward
1499 *
1500 * return values:
1501 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001502 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001503 *
1504 * dev_forward_skb can be used for injecting an skb from the
1505 * start_xmit function of one device into the receive queue
1506 * of another device.
1507 *
1508 * The receiving device may be in another namespace, so
1509 * we have to clear all information in the skb that could
1510 * impact namespace isolation.
1511 */
1512int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1513{
1514 skb_orphan(skb);
Ben Greearc736eef2010-07-22 09:54:47 +00001515 nf_reset(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001516
Eric Dumazetcaf586e2010-09-30 21:06:55 +00001517 if (unlikely(!(dev->flags & IFF_UP) ||
David S. Miller2198a102010-10-21 08:43:05 -07001518 (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
Eric Dumazetcaf586e2010-09-30 21:06:55 +00001519 atomic_long_inc(&dev->rx_dropped);
Eric Dumazet6ec82562010-05-06 00:53:53 -07001520 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001521 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001522 }
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001523 skb_set_dev(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001524 skb->tstamp.tv64 = 0;
1525 skb->pkt_type = PACKET_HOST;
1526 skb->protocol = eth_type_trans(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001527 return netif_rx(skb);
1528}
1529EXPORT_SYMBOL_GPL(dev_forward_skb);
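/*
 * Example (illustrative sketch): a virtual device pair can hand a packet
 * to its peer from its ndo_start_xmit() hook, much as veth does.
 * example_priv() and the "peer" pointer are hypothetical driver state.
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct net_device *peer = example_priv(dev)->peer;
 *
 *		dev_forward_skb(peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 */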
1530
Changli Gao71d9dec2010-12-15 19:57:25 +00001531static inline int deliver_skb(struct sk_buff *skb,
1532 struct packet_type *pt_prev,
1533 struct net_device *orig_dev)
1534{
1535 atomic_inc(&skb->users);
1536 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1537}
1538
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539/*
1540 * Support routine. Sends outgoing frames to any network
1541 * taps currently in use.
1542 */
1543
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001544static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545{
1546 struct packet_type *ptype;
Changli Gao71d9dec2010-12-15 19:57:25 +00001547 struct sk_buff *skb2 = NULL;
1548 struct packet_type *pt_prev = NULL;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001549
Linus Torvalds1da177e2005-04-16 15:20:36 -07001550 rcu_read_lock();
1551 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1552 /* Never send packets back to the socket
1553 * they originated from - MvS (miquels@drinkel.ow.org)
1554 */
1555 if ((ptype->dev == dev || !ptype->dev) &&
1556 (ptype->af_packet_priv == NULL ||
1557 (struct sock *)ptype->af_packet_priv != skb->sk)) {
Changli Gao71d9dec2010-12-15 19:57:25 +00001558 if (pt_prev) {
1559 deliver_skb(skb2, pt_prev, skb->dev);
1560 pt_prev = ptype;
1561 continue;
1562 }
1563
1564 skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001565 if (!skb2)
1566 break;
1567
Eric Dumazet70978182010-12-20 21:22:51 +00001568 net_timestamp_set(skb2);
1569
Linus Torvalds1da177e2005-04-16 15:20:36 -07001570 /* skb->nh should be correctly
1571 set by sender, so that the second statement is
1572 just protection against buggy protocols.
1573 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001574 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001575
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001576 if (skb_network_header(skb2) < skb2->data ||
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001577 skb2->network_header > skb2->tail) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001578 if (net_ratelimit())
1579 printk(KERN_CRIT "protocol %04x is "
1580 "buggy, dev %s\n",
Sebastian Andrzej Siewior70777d02010-06-30 10:39:19 -07001581 ntohs(skb2->protocol),
1582 dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001583 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001584 }
1585
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001586 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001587 skb2->pkt_type = PACKET_OUTGOING;
Changli Gao71d9dec2010-12-15 19:57:25 +00001588 pt_prev = ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589 }
1590 }
Changli Gao71d9dec2010-12-15 19:57:25 +00001591 if (pt_prev)
1592 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001593 rcu_read_unlock();
1594}
1595
John Fastabendf0796d52010-07-01 13:21:57 +00001596/*
1597 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 1598 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1599 */
Tom Herberte6484932010-10-18 18:04:39 +00001600int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
John Fastabendf0796d52010-07-01 13:21:57 +00001601{
Tom Herbert1d24eb42010-11-21 13:17:27 +00001602 int rc;
1603
Tom Herberte6484932010-10-18 18:04:39 +00001604 if (txq < 1 || txq > dev->num_tx_queues)
1605 return -EINVAL;
John Fastabendf0796d52010-07-01 13:21:57 +00001606
Tom Herberte6484932010-10-18 18:04:39 +00001607 if (dev->reg_state == NETREG_REGISTERED) {
1608 ASSERT_RTNL();
1609
Tom Herbert1d24eb42010-11-21 13:17:27 +00001610 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1611 txq);
Tom Herbertbf264142010-11-26 08:36:09 +00001612 if (rc)
1613 return rc;
1614
Tom Herberte6484932010-10-18 18:04:39 +00001615 if (txq < dev->real_num_tx_queues)
1616 qdisc_reset_all_tx_gt(dev, txq);
John Fastabendf0796d52010-07-01 13:21:57 +00001617 }
Tom Herberte6484932010-10-18 18:04:39 +00001618
1619 dev->real_num_tx_queues = txq;
1620 return 0;
John Fastabendf0796d52010-07-01 13:21:57 +00001621}
1622EXPORT_SYMBOL(netif_set_real_num_tx_queues);
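/*
 * Example (illustrative sketch): a multiqueue driver that discovers the
 * hardware enabled fewer TX rings than it advertised can shrink the
 * active set under the RTNL; "hw_tx_rings" is a hypothetical value read
 * from the device.
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, hw_tx_rings);
 *	rtnl_unlock();
 */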
Denis Vlasenko56079432006-03-29 15:57:29 -08001623
Ben Hutchings62fe0b42010-09-27 08:24:33 +00001624#ifdef CONFIG_RPS
1625/**
1626 * netif_set_real_num_rx_queues - set actual number of RX queues used
1627 * @dev: Network device
1628 * @rxq: Actual number of RX queues
1629 *
1630 * This must be called either with the rtnl_lock held or before
1631 * registration of the net device. Returns 0 on success, or a
Ben Hutchings4e7f7952010-10-08 10:33:39 -07001632 * negative error code. If called before registration, it always
1633 * succeeds.
Ben Hutchings62fe0b42010-09-27 08:24:33 +00001634 */
1635int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1636{
1637 int rc;
1638
Tom Herbertbd25fa72010-10-18 18:00:16 +00001639 if (rxq < 1 || rxq > dev->num_rx_queues)
1640 return -EINVAL;
1641
Ben Hutchings62fe0b42010-09-27 08:24:33 +00001642 if (dev->reg_state == NETREG_REGISTERED) {
1643 ASSERT_RTNL();
1644
Ben Hutchings62fe0b42010-09-27 08:24:33 +00001645 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1646 rxq);
1647 if (rc)
1648 return rc;
Ben Hutchings62fe0b42010-09-27 08:24:33 +00001649 }
1650
1651 dev->real_num_rx_queues = rxq;
1652 return 0;
1653}
1654EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1655#endif
1656
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001657static inline void __netif_reschedule(struct Qdisc *q)
1658{
1659 struct softnet_data *sd;
1660 unsigned long flags;
1661
1662 local_irq_save(flags);
1663 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00001664 q->next_sched = NULL;
1665 *sd->output_queue_tailp = q;
1666 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001667 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1668 local_irq_restore(flags);
1669}
1670
David S. Miller37437bb2008-07-16 02:15:04 -07001671void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08001672{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001673 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1674 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08001675}
1676EXPORT_SYMBOL(__netif_schedule);
1677
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001678void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08001679{
David S. Miller3578b0c2010-08-03 00:24:04 -07001680 if (atomic_dec_and_test(&skb->users)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001681 struct softnet_data *sd;
1682 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08001683
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001684 local_irq_save(flags);
1685 sd = &__get_cpu_var(softnet_data);
1686 skb->next = sd->completion_queue;
1687 sd->completion_queue = skb;
1688 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1689 local_irq_restore(flags);
1690 }
Denis Vlasenko56079432006-03-29 15:57:29 -08001691}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001692EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08001693
1694void dev_kfree_skb_any(struct sk_buff *skb)
1695{
1696 if (in_irq() || irqs_disabled())
1697 dev_kfree_skb_irq(skb);
1698 else
1699 dev_kfree_skb(skb);
1700}
1701EXPORT_SYMBOL(dev_kfree_skb_any);
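/*
 * Example (illustrative sketch): a TX completion handler that may run in
 * hardirq context must not use dev_kfree_skb() directly; "tx_ring" is
 * hypothetical driver state.
 *
 *	// in the device's interrupt handler, for each completed descriptor
 *	dev_kfree_skb_any(tx_ring->skb[i]);
 *	tx_ring->skb[i] = NULL;
 */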
1702
1703
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001704/**
1705 * netif_device_detach - mark device as removed
1706 * @dev: network device
1707 *
1708 * Mark device as removed from system and therefore no longer available.
1709 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001710void netif_device_detach(struct net_device *dev)
1711{
1712 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1713 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001714 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001715 }
1716}
1717EXPORT_SYMBOL(netif_device_detach);
1718
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001719/**
1720 * netif_device_attach - mark device as attached
1721 * @dev: network device
1722 *
 1723 * Mark device as attached to the system and restart if needed.
1724 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001725void netif_device_attach(struct net_device *dev)
1726{
1727 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1728 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001729 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001730 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001731 }
1732}
1733EXPORT_SYMBOL(netif_device_attach);
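/*
 * Example (illustrative sketch): PCI drivers commonly pair detach/attach
 * in their power-management hooks so the stack stops handing them packets
 * while the hardware is away.  The function names below are hypothetical.
 *
 *	static int example_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		// ... save state and power the hardware down ...
 *		return 0;
 *	}
 *
 *	static int example_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		// ... power the hardware up and restore state ...
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */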
1734
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001735/**
 1736 * skb_set_dev - assign a new device to a buffer
1737 * @skb: buffer for the new device
1738 * @dev: network device
1739 *
1740 * If an skb is owned by a device already, we have to reset
1741 * all data private to the namespace a device belongs to
1742 * before assigning it a new device.
1743 */
1744#ifdef CONFIG_NET_NS
1745void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1746{
1747 skb_dst_drop(skb);
1748 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1749 secpath_reset(skb);
1750 nf_reset(skb);
1751 skb_init_secmark(skb);
1752 skb->mark = 0;
1753 skb->priority = 0;
1754 skb->nf_trace = 0;
1755 skb->ipvs_property = 0;
1756#ifdef CONFIG_NET_SCHED
1757 skb->tc_index = 0;
1758#endif
1759 }
1760 skb->dev = dev;
1761}
1762EXPORT_SYMBOL(skb_set_dev);
1763#endif /* CONFIG_NET_NS */
1764
Linus Torvalds1da177e2005-04-16 15:20:36 -07001765/*
1766 * Invalidate hardware checksum when packet is to be mangled, and
1767 * complete checksum manually on outgoing path.
1768 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07001769int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001770{
Al Virod3bc23e2006-11-14 21:24:49 -08001771 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07001772 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001773
Patrick McHardy84fa7932006-08-29 16:44:56 -07001774 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07001775 goto out_set_summed;
1776
1777 if (unlikely(skb_shinfo(skb)->gso_size)) {
Herbert Xua430a432006-07-08 13:34:56 -07001778 /* Let GSO fix up the checksum. */
1779 goto out_set_summed;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001780 }
1781
Michał Mirosław55508d62010-12-14 15:24:08 +00001782 offset = skb_checksum_start_offset(skb);
Herbert Xua0308472007-10-15 01:47:15 -07001783 BUG_ON(offset >= skb_headlen(skb));
1784 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1785
1786 offset += skb->csum_offset;
1787 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1788
1789 if (skb_cloned(skb) &&
1790 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001791 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1792 if (ret)
1793 goto out;
1794 }
1795
Herbert Xua0308472007-10-15 01:47:15 -07001796 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07001797out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001798 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001799out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001800 return ret;
1801}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001802EXPORT_SYMBOL(skb_checksum_help);
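/*
 * Example (illustrative sketch): a driver whose hardware cannot checksum
 * a particular packet can fall back to a software checksum in its xmit
 * path; hw_can_csum() is a hypothetical capability test.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 */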
Linus Torvalds1da177e2005-04-16 15:20:36 -07001803
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001804/**
1805 * skb_gso_segment - Perform segmentation on skb.
1806 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07001807 * @features: features for the output path (see dev->features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001808 *
1809 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07001810 *
1811 * It may return NULL if the skb requires no segmentation. This is
1812 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001813 */
Herbert Xu576a30e2006-06-27 13:22:38 -07001814struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001815{
1816 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1817 struct packet_type *ptype;
Al Viro252e3342006-11-14 20:48:11 -08001818 __be16 type = skb->protocol;
Jesse Grossc8d5bcd2010-10-29 12:14:54 +00001819 int vlan_depth = ETH_HLEN;
Herbert Xua430a432006-07-08 13:34:56 -07001820 int err;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001821
Jesse Grossc8d5bcd2010-10-29 12:14:54 +00001822 while (type == htons(ETH_P_8021Q)) {
1823 struct vlan_hdr *vh;
Jesse Gross7b9c6092010-10-20 13:56:04 +00001824
Jesse Grossc8d5bcd2010-10-29 12:14:54 +00001825 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
Jesse Gross7b9c6092010-10-20 13:56:04 +00001826 return ERR_PTR(-EINVAL);
1827
Jesse Grossc8d5bcd2010-10-29 12:14:54 +00001828 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1829 type = vh->h_vlan_encapsulated_proto;
1830 vlan_depth += VLAN_HLEN;
Jesse Gross7b9c6092010-10-20 13:56:04 +00001831 }
1832
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001833 skb_reset_mac_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001834 skb->mac_len = skb->network_header - skb->mac_header;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001835 __skb_pull(skb, skb->mac_len);
1836
Herbert Xu67fd1a72009-01-19 16:26:44 -08001837 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1838 struct net_device *dev = skb->dev;
1839 struct ethtool_drvinfo info = {};
1840
1841 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1842 dev->ethtool_ops->get_drvinfo(dev, &info);
1843
Joe Perchesb194a362010-10-30 11:08:52 +00001844 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
Herbert Xu67fd1a72009-01-19 16:26:44 -08001845 info.driver, dev ? dev->features : 0L,
1846 skb->sk ? skb->sk->sk_route_caps : 0L,
1847 skb->len, skb->data_len, skb->ip_summed);
1848
Herbert Xua430a432006-07-08 13:34:56 -07001849 if (skb_header_cloned(skb) &&
1850 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1851 return ERR_PTR(err);
1852 }
1853
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001854 rcu_read_lock();
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08001855 list_for_each_entry_rcu(ptype,
1856 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001857 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
Patrick McHardy84fa7932006-08-29 16:44:56 -07001858 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
Herbert Xua430a432006-07-08 13:34:56 -07001859 err = ptype->gso_send_check(skb);
1860 segs = ERR_PTR(err);
1861 if (err || skb_gso_ok(skb, features))
1862 break;
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001863 __skb_push(skb, (skb->data -
1864 skb_network_header(skb)));
Herbert Xua430a432006-07-08 13:34:56 -07001865 }
Herbert Xu576a30e2006-06-27 13:22:38 -07001866 segs = ptype->gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001867 break;
1868 }
1869 }
1870 rcu_read_unlock();
1871
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001872 __skb_push(skb, skb->data - skb_mac_header(skb));
Herbert Xu576a30e2006-06-27 13:22:38 -07001873
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001874 return segs;
1875}
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001876EXPORT_SYMBOL(skb_gso_segment);
1877
Herbert Xufb286bb2005-11-10 13:01:24 -08001878/* Take action when hardware reception checksum errors are detected. */
1879#ifdef CONFIG_BUG
1880void netdev_rx_csum_fault(struct net_device *dev)
1881{
1882 if (net_ratelimit()) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001883 printk(KERN_ERR "%s: hw csum failure.\n",
Stephen Hemminger246a4212005-12-08 15:21:39 -08001884 dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08001885 dump_stack();
1886 }
1887}
1888EXPORT_SYMBOL(netdev_rx_csum_fault);
1889#endif
1890
Linus Torvalds1da177e2005-04-16 15:20:36 -07001891/* Actually, we should eliminate this check as soon as we know that:
 1892 * 1. An IOMMU is present and is able to map all of the memory.
1893 * 2. No high memory really exists on this machine.
1894 */
1895
Eric Dumazet9092c652010-04-02 13:34:49 -07001896static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001897{
Herbert Xu3d3a8532006-06-27 13:33:10 -07001898#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07001899 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001900 if (!(dev->features & NETIF_F_HIGHDMA)) {
1901 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1902 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1903 return 1;
1904 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001905
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001906 if (PCI_DMA_BUS_IS_PHYS) {
1907 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001908
Eric Dumazet9092c652010-04-02 13:34:49 -07001909 if (!pdev)
1910 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001911 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1912 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1913 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1914 return 1;
1915 }
1916 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07001917#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918 return 0;
1919}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001920
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001921struct dev_gso_cb {
1922 void (*destructor)(struct sk_buff *skb);
1923};
1924
1925#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1926
1927static void dev_gso_skb_destructor(struct sk_buff *skb)
1928{
1929 struct dev_gso_cb *cb;
1930
1931 do {
1932 struct sk_buff *nskb = skb->next;
1933
1934 skb->next = nskb->next;
1935 nskb->next = NULL;
1936 kfree_skb(nskb);
1937 } while (skb->next);
1938
1939 cb = DEV_GSO_CB(skb);
1940 if (cb->destructor)
1941 cb->destructor(skb);
1942}
1943
1944/**
1945 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1946 * @skb: buffer to segment
Jesse Gross91ecb632011-01-09 06:23:33 +00001947 * @features: device features as applicable to this skb
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001948 *
1949 * This function segments the given skb and stores the list of segments
1950 * in skb->next.
1951 */
Jesse Gross91ecb632011-01-09 06:23:33 +00001952static int dev_gso_segment(struct sk_buff *skb, int features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001953{
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001954 struct sk_buff *segs;
1955
Herbert Xu576a30e2006-06-27 13:22:38 -07001956 segs = skb_gso_segment(skb, features);
1957
1958 /* Verifying header integrity only. */
1959 if (!segs)
1960 return 0;
1961
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07001962 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001963 return PTR_ERR(segs);
1964
1965 skb->next = segs;
1966 DEV_GSO_CB(skb)->destructor = skb->destructor;
1967 skb->destructor = dev_gso_skb_destructor;
1968
1969 return 0;
1970}
1971
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001972/*
1973 * Try to orphan skb early, right before transmission by the device.
Oliver Hartkopp2244d072010-08-17 08:59:14 +00001974 * We cannot orphan the skb if a tx timestamp is requested or the sk reference
 1975 * is needed at driver level for other reasons, e.g. see net/can/raw.c
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001976 */
1977static inline void skb_orphan_try(struct sk_buff *skb)
1978{
Eric Dumazet87fd3082010-07-13 05:24:20 +00001979 struct sock *sk = skb->sk;
1980
Oliver Hartkopp2244d072010-08-17 08:59:14 +00001981 if (sk && !skb_shinfo(skb)->tx_flags) {
Eric Dumazet87fd3082010-07-13 05:24:20 +00001982 /* skb_tx_hash() wont be able to get sk.
1983 * We copy sk_hash into skb->rxhash
1984 */
1985 if (!skb->rxhash)
1986 skb->rxhash = sk->sk_hash;
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001987 skb_orphan(skb);
Eric Dumazet87fd3082010-07-13 05:24:20 +00001988 }
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001989}
1990
Jesse Gross03634662011-01-09 06:23:35 +00001991static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1992{
1993 return ((features & NETIF_F_GEN_CSUM) ||
1994 ((features & NETIF_F_V4_CSUM) &&
1995 protocol == htons(ETH_P_IP)) ||
1996 ((features & NETIF_F_V6_CSUM) &&
1997 protocol == htons(ETH_P_IPV6)) ||
1998 ((features & NETIF_F_FCOE_CRC) &&
1999 protocol == htons(ETH_P_FCOE)));
2000}
2001
Jesse Grossf01a5232011-01-09 06:23:31 +00002002static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2003{
 2004	if (!can_checksum_protocol(features, protocol)) {
2005 features &= ~NETIF_F_ALL_CSUM;
2006 features &= ~NETIF_F_SG;
2007 } else if (illegal_highdma(skb->dev, skb)) {
2008 features &= ~NETIF_F_SG;
2009 }
2010
2011 return features;
2012}
2013
2014int netif_skb_features(struct sk_buff *skb)
Jesse Gross58e998c2010-10-29 12:14:55 +00002015{
2016 __be16 protocol = skb->protocol;
Jesse Grossf01a5232011-01-09 06:23:31 +00002017 int features = skb->dev->features;
Jesse Gross58e998c2010-10-29 12:14:55 +00002018
2019 if (protocol == htons(ETH_P_8021Q)) {
2020 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2021 protocol = veh->h_vlan_encapsulated_proto;
Jesse Grossf01a5232011-01-09 06:23:31 +00002022 } else if (!vlan_tx_tag_present(skb)) {
2023 return harmonize_features(skb, protocol, features);
2024 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002025
Jesse Gross6ee400a2011-01-17 20:46:00 +00002026 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
Jesse Grossf01a5232011-01-09 06:23:31 +00002027
2028 if (protocol != htons(ETH_P_8021Q)) {
2029 return harmonize_features(skb, protocol, features);
2030 } else {
2031 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
Jesse Gross6ee400a2011-01-17 20:46:00 +00002032 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
Jesse Grossf01a5232011-01-09 06:23:31 +00002033 return harmonize_features(skb, protocol, features);
2034 }
Jesse Gross58e998c2010-10-29 12:14:55 +00002035}
Jesse Grossf01a5232011-01-09 06:23:31 +00002036EXPORT_SYMBOL(netif_skb_features);
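/*
 * Example (illustrative sketch): the returned mask is meant to be tested
 * bit by bit against the skb, as dev_hard_start_xmit() does below:
 *
 *	int features = netif_skb_features(skb);
 *
 *	if (!(features & NETIF_F_SG))
 *		; // scatter-gather is not safe for this skb
 *	if (!(features & NETIF_F_ALL_CSUM))
 *		; // checksum must be completed in software
 */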
Jesse Gross58e998c2010-10-29 12:14:55 +00002037
John Fastabend6afff0c2010-06-16 14:18:12 +00002038/*
2039 * Returns true if either:
2040 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2041 * 2. skb is fragmented and the device does not support SG, or if
 2042 *	at least one of the fragments is in highmem and the device does not
2043 * support DMA from it.
2044 */
2045static inline int skb_needs_linearize(struct sk_buff *skb,
Jesse Gross02932ce2011-01-09 06:23:34 +00002046 int features)
John Fastabend6afff0c2010-06-16 14:18:12 +00002047{
Jesse Gross02932ce2011-01-09 06:23:34 +00002048 return skb_is_nonlinear(skb) &&
2049 ((skb_has_frag_list(skb) &&
2050 !(features & NETIF_F_FRAGLIST)) ||
Jesse Grosse1e78db2010-10-29 12:14:53 +00002051 (skb_shinfo(skb)->nr_frags &&
Jesse Gross02932ce2011-01-09 06:23:34 +00002052 !(features & NETIF_F_SG)));
John Fastabend6afff0c2010-06-16 14:18:12 +00002053}
2054
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002055int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2056 struct netdev_queue *txq)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002057{
Stephen Hemminger00829822008-11-20 20:14:53 -08002058 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00002059 int rc = NETDEV_TX_OK;
Stephen Hemminger00829822008-11-20 20:14:53 -08002060
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002061 if (likely(!skb->next)) {
Jesse Grossfc741212011-01-09 06:23:32 +00002062 int features;
2063
Eric Dumazet93f154b2009-05-18 22:19:19 -07002064 /*
 2065		 * If the device doesn't need skb->dst, release it right now while
 2066		 * it's hot in this cpu's cache.
2067 */
Eric Dumazetadf30902009-06-02 05:19:30 +00002068 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2069 skb_dst_drop(skb);
2070
Eric Dumazet15c2d75f2010-12-07 00:30:37 +00002071 if (!list_empty(&ptype_all))
2072 dev_queue_xmit_nit(skb, dev);
2073
Eric Dumazetfc6055a2010-04-16 12:18:22 +00002074 skb_orphan_try(skb);
David S. Miller9ccb8972010-04-22 01:02:07 -07002075
Jesse Grossfc741212011-01-09 06:23:32 +00002076 features = netif_skb_features(skb);
2077
Jesse Gross7b9c6092010-10-20 13:56:04 +00002078 if (vlan_tx_tag_present(skb) &&
Jesse Grossfc741212011-01-09 06:23:32 +00002079 !(features & NETIF_F_HW_VLAN_TX)) {
Jesse Gross7b9c6092010-10-20 13:56:04 +00002080 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2081 if (unlikely(!skb))
2082 goto out;
2083
2084 skb->vlan_tci = 0;
2085 }
2086
Jesse Grossfc741212011-01-09 06:23:32 +00002087 if (netif_needs_gso(skb, features)) {
Jesse Gross91ecb632011-01-09 06:23:33 +00002088 if (unlikely(dev_gso_segment(skb, features)))
David S. Miller9ccb8972010-04-22 01:02:07 -07002089 goto out_kfree_skb;
2090 if (skb->next)
2091 goto gso;
John Fastabend6afff0c2010-06-16 14:18:12 +00002092 } else {
Jesse Gross02932ce2011-01-09 06:23:34 +00002093 if (skb_needs_linearize(skb, features) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002094 __skb_linearize(skb))
2095 goto out_kfree_skb;
2096
2097 /* If packet is not checksummed and device does not
2098 * support checksumming for this protocol, complete
2099 * checksumming here.
2100 */
2101 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Michał Mirosław55508d62010-12-14 15:24:08 +00002102 skb_set_transport_header(skb,
2103 skb_checksum_start_offset(skb));
Jesse Gross03634662011-01-09 06:23:35 +00002104 if (!(features & NETIF_F_ALL_CSUM) &&
John Fastabend6afff0c2010-06-16 14:18:12 +00002105 skb_checksum_help(skb))
2106 goto out_kfree_skb;
2107 }
David S. Miller9ccb8972010-04-22 01:02:07 -07002108 }
2109
Patrick Ohlyac45f602009-02-12 05:03:37 +00002110 rc = ops->ndo_start_xmit(skb, dev);
Koki Sanagicf66ba52010-08-23 18:45:02 +09002111 trace_net_dev_xmit(skb, rc);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002112 if (rc == NETDEV_TX_OK)
Eric Dumazet08baf562009-05-25 22:58:01 -07002113 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00002114 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002115 }
2116
Herbert Xu576a30e2006-06-27 13:22:38 -07002117gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002118 do {
2119 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002120
2121 skb->next = nskb->next;
2122 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00002123
2124 /*
 2125		 * If the device doesn't need nskb->dst, release it right now while
 2126		 * it's hot in this cpu's cache.
2127 */
2128 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2129 skb_dst_drop(nskb);
2130
Stephen Hemminger00829822008-11-20 20:14:53 -08002131 rc = ops->ndo_start_xmit(nskb, dev);
Koki Sanagicf66ba52010-08-23 18:45:02 +09002132 trace_net_dev_xmit(nskb, rc);
Patrick McHardyec634fe2009-07-05 19:23:38 -07002133 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002134 if (rc & ~NETDEV_TX_MASK)
2135 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07002136 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002137 skb->next = nskb;
2138 return rc;
2139 }
Eric Dumazet08baf562009-05-25 22:58:01 -07002140 txq_trans_update(txq);
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002141 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07002142 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002143 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002144
Patrick McHardy572a9d72009-11-10 06:14:14 +00002145out_kfree_gso_skb:
2146 if (likely(skb->next == NULL))
2147 skb->destructor = DEV_GSO_CB(skb)->destructor;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002148out_kfree_skb:
2149 kfree_skb(skb);
Jesse Gross7b9c6092010-10-20 13:56:04 +00002150out:
Patrick McHardy572a9d72009-11-10 06:14:14 +00002151 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002152}
2153
Tom Herbert0a9627f2010-03-16 08:03:29 +00002154static u32 hashrnd __read_mostly;
David S. Millerb6b2fed2008-07-21 09:48:06 -07002155
Vladislav Zolotarova3d22a62010-12-13 06:27:10 +00002156/*
 2157 * Returns a Tx hash based on the given packet descriptor and the number of
 2158 * Tx queues to be used as a distribution range.
2159 */
2160u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2161 unsigned int num_tx_queues)
David S. Miller8f0f2222008-07-15 03:47:03 -07002162{
David S. Miller70192982009-01-27 16:34:47 -08002163 u32 hash;
David S. Millerb6b2fed2008-07-21 09:48:06 -07002164
David S. Miller513de112009-05-03 14:43:10 -07002165 if (skb_rx_queue_recorded(skb)) {
2166 hash = skb_get_rx_queue(skb);
Vladislav Zolotarova3d22a62010-12-13 06:27:10 +00002167 while (unlikely(hash >= num_tx_queues))
2168 hash -= num_tx_queues;
David S. Miller513de112009-05-03 14:43:10 -07002169 return hash;
2170 }
Eric Dumazetec581f62009-05-01 09:05:06 -07002171
2172 if (skb->sk && skb->sk->sk_hash)
David S. Miller70192982009-01-27 16:34:47 -08002173 hash = skb->sk->sk_hash;
Eric Dumazetec581f62009-05-01 09:05:06 -07002174 else
Eric Dumazet87fd3082010-07-13 05:24:20 +00002175 hash = (__force u16) skb->protocol ^ skb->rxhash;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002176 hash = jhash_1word(hash, hashrnd);
David S. Millerd5a9e242009-01-27 16:22:11 -08002177
Vladislav Zolotarova3d22a62010-12-13 06:27:10 +00002178 return (u16) (((u64) hash * num_tx_queues) >> 32);
David S. Miller8f0f2222008-07-15 03:47:03 -07002179}
Vladislav Zolotarova3d22a62010-12-13 06:27:10 +00002180EXPORT_SYMBOL(__skb_tx_hash);
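/*
 * Example (illustrative sketch): a driver's ndo_select_queue() hook can
 * fall back to the stack's hash via the skb_tx_hash() wrapper when it
 * has no better policy of its own.
 *
 *	static u16 example_select_queue(struct net_device *dev,
 *					struct sk_buff *skb)
 *	{
 *		return skb_tx_hash(dev, skb);
 *	}
 */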
David S. Miller8f0f2222008-07-15 03:47:03 -07002181
Eric Dumazeted046422009-11-13 21:54:04 +00002182static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2183{
2184 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2185 if (net_ratelimit()) {
Eric Dumazet7a161ea2010-04-08 21:26:13 +00002186 pr_warning("%s selects TX queue %d, but "
2187 "real number of TX queues is %d\n",
2188 dev->name, queue_index, dev->real_num_tx_queues);
Eric Dumazeted046422009-11-13 21:54:04 +00002189 }
2190 return 0;
2191 }
2192 return queue_index;
2193}
2194
Tom Herbert1d24eb42010-11-21 13:17:27 +00002195static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2196{
Tom Herbertbf264142010-11-26 08:36:09 +00002197#ifdef CONFIG_XPS
Tom Herbert1d24eb42010-11-21 13:17:27 +00002198 struct xps_dev_maps *dev_maps;
2199 struct xps_map *map;
2200 int queue_index = -1;
2201
2202 rcu_read_lock();
2203 dev_maps = rcu_dereference(dev->xps_maps);
2204 if (dev_maps) {
2205 map = rcu_dereference(
2206 dev_maps->cpu_map[raw_smp_processor_id()]);
2207 if (map) {
2208 if (map->len == 1)
2209 queue_index = map->queues[0];
2210 else {
2211 u32 hash;
2212 if (skb->sk && skb->sk->sk_hash)
2213 hash = skb->sk->sk_hash;
2214 else
2215 hash = (__force u16) skb->protocol ^
2216 skb->rxhash;
2217 hash = jhash_1word(hash, hashrnd);
2218 queue_index = map->queues[
2219 ((u64)hash * map->len) >> 32];
2220 }
2221 if (unlikely(queue_index >= dev->real_num_tx_queues))
2222 queue_index = -1;
2223 }
2224 }
2225 rcu_read_unlock();
2226
2227 return queue_index;
2228#else
2229 return -1;
2230#endif
2231}
2232
David S. Millere8a04642008-07-17 00:34:19 -07002233static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2234 struct sk_buff *skb)
2235{
Tom Herbertb0f77d02010-07-14 20:50:29 -07002236 int queue_index;
Helmut Schaadeabc772010-09-03 02:39:56 +00002237 const struct net_device_ops *ops = dev->netdev_ops;
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002238
Tom Herbert3853b582010-11-21 13:17:29 +00002239 if (dev->real_num_tx_queues == 1)
2240 queue_index = 0;
2241 else if (ops->ndo_select_queue) {
Helmut Schaadeabc772010-09-03 02:39:56 +00002242 queue_index = ops->ndo_select_queue(dev, skb);
2243 queue_index = dev_cap_txqueue(dev, queue_index);
2244 } else {
2245 struct sock *sk = skb->sk;
2246 queue_index = sk_tx_queue_get(sk);
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002247
Tom Herbert3853b582010-11-21 13:17:29 +00002248 if (queue_index < 0 || skb->ooo_okay ||
2249 queue_index >= dev->real_num_tx_queues) {
2250 int old_index = queue_index;
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002251
Tom Herbert1d24eb42010-11-21 13:17:27 +00002252 queue_index = get_xps_queue(dev, skb);
2253 if (queue_index < 0)
2254 queue_index = skb_tx_hash(dev, skb);
Tom Herbert3853b582010-11-21 13:17:29 +00002255
2256 if (queue_index != old_index && sk) {
2257 struct dst_entry *dst =
2258 rcu_dereference_check(sk->sk_dst_cache, 1);
Eric Dumazet8728c542010-04-11 21:18:17 +00002259
2260 if (dst && skb_dst(skb) == dst)
2261 sk_tx_queue_set(sk, queue_index);
2262 }
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002263 }
2264 }
David S. Millereae792b2008-07-15 03:03:33 -07002265
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002266 skb_set_queue_mapping(skb, queue_index);
2267 return netdev_get_tx_queue(dev, queue_index);
David S. Millere8a04642008-07-17 00:34:19 -07002268}
2269
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002270static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2271 struct net_device *dev,
2272 struct netdev_queue *txq)
2273{
2274 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002275 bool contended = qdisc_is_running(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002276 int rc;
2277
Eric Dumazet79640a42010-06-02 05:09:29 -07002278 /*
2279 * Heuristic to force contended enqueues to serialize on a
2280 * separate lock before trying to get qdisc main lock.
2281 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2282 * and dequeue packets faster.
2283 */
2284 if (unlikely(contended))
2285 spin_lock(&q->busylock);
2286
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002287 spin_lock(root_lock);
2288 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2289 kfree_skb(skb);
2290 rc = NET_XMIT_DROP;
2291 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07002292 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002293 /*
2294 * This is a work-conserving queue; there are no old skbs
2295 * waiting to be sent out; and the qdisc is not running -
2296 * xmit the skb directly.
2297 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002298 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2299 skb_dst_force(skb);
Eric Dumazetbfe0d022011-01-09 08:30:54 +00002300
2301 qdisc_skb_cb(skb)->pkt_len = skb->len;
2302 qdisc_bstats_update(q, skb);
2303
Eric Dumazet79640a42010-06-02 05:09:29 -07002304 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2305 if (unlikely(contended)) {
2306 spin_unlock(&q->busylock);
2307 contended = false;
2308 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002309 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002310 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07002311 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002312
2313 rc = NET_XMIT_SUCCESS;
2314 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002315 skb_dst_force(skb);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002316 rc = qdisc_enqueue_root(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002317 if (qdisc_run_begin(q)) {
2318 if (unlikely(contended)) {
2319 spin_unlock(&q->busylock);
2320 contended = false;
2321 }
2322 __qdisc_run(q);
2323 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002324 }
2325 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07002326 if (unlikely(contended))
2327 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002328 return rc;
2329}
2330
Eric Dumazet745e20f2010-09-29 13:23:09 -07002331static DEFINE_PER_CPU(int, xmit_recursion);
David S. Miller11a766c2010-10-25 12:51:55 -07002332#define RECURSION_LIMIT 10
Eric Dumazet745e20f2010-09-29 13:23:09 -07002333
Dave Jonesd29f7492008-07-22 14:09:06 -07002334/**
2335 * dev_queue_xmit - transmit a buffer
2336 * @skb: buffer to transmit
2337 *
2338 * Queue a buffer for transmission to a network device. The caller must
2339 * have set the device and priority and built the buffer before calling
2340 * this function. The function can be called from an interrupt.
2341 *
2342 * A negative errno code is returned on a failure. A success does not
2343 * guarantee the frame will be transmitted as it may be dropped due
2344 * to congestion or traffic shaping.
2345 *
2346 * -----------------------------------------------------------------------------------
2347 * I notice this method can also return errors from the queue disciplines,
2348 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2349 * be positive.
2350 *
2351 * Regardless of the return value, the skb is consumed, so it is currently
2352 * difficult to retry a send to this method. (You can bump the ref count
2353 * before sending to hold a reference for retry if you are careful.)
2354 *
2355 * When calling this method, interrupts MUST be enabled. This is because
2356 * the BH enable code must have IRQs enabled so that it will not deadlock.
2357 * --BLG
2358 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002359int dev_queue_xmit(struct sk_buff *skb)
2360{
2361 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002362 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002363 struct Qdisc *q;
2364 int rc = -ENOMEM;
2365
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002366 /* Disable soft irqs for various locks below. Also
2367 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002368 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002369 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002370
David S. Millereae792b2008-07-15 03:03:33 -07002371 txq = dev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002372 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002373
Linus Torvalds1da177e2005-04-16 15:20:36 -07002374#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002375 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002376#endif
Koki Sanagicf66ba52010-08-23 18:45:02 +09002377 trace_net_dev_queue(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002378 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002379 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002380 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002381 }
2382
 2383	/* The device has no queue. Common case for software devices:
 2384	   loopback, all sorts of tunnels...
 2385
Herbert Xu932ff272006-06-09 12:20:56 -07002386	   Really, it is unlikely that netif_tx_lock protection is necessary
 2387	   here. (f.e. loopback and IP tunnels are clean, ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002388	   counters.)
 2389	   However, it is possible that they rely on the protection
 2390	   we provide here.
 2391
 2392	   Check this and take the lock. It is not prone to deadlocks.
 2393	   Either way, the noqueue qdisc path is even simpler 8)
2394 */
2395 if (dev->flags & IFF_UP) {
2396 int cpu = smp_processor_id(); /* ok because BHs are off */
2397
David S. Millerc773e842008-07-08 23:13:53 -07002398 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002399
Eric Dumazet745e20f2010-09-29 13:23:09 -07002400 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2401 goto recursion_alert;
2402
David S. Millerc773e842008-07-08 23:13:53 -07002403 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002404
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002405 if (!netif_tx_queue_stopped(txq)) {
Eric Dumazet745e20f2010-09-29 13:23:09 -07002406 __this_cpu_inc(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002407 rc = dev_hard_start_xmit(skb, dev, txq);
Eric Dumazet745e20f2010-09-29 13:23:09 -07002408 __this_cpu_dec(xmit_recursion);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002409 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002410 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002411 goto out;
2412 }
2413 }
David S. Millerc773e842008-07-08 23:13:53 -07002414 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002415 if (net_ratelimit())
2416 printk(KERN_CRIT "Virtual device %s asks to "
2417 "queue packet!\n", dev->name);
2418 } else {
2419 /* Recursion is detected! It is possible,
Eric Dumazet745e20f2010-09-29 13:23:09 -07002420 * unfortunately
2421 */
2422recursion_alert:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002423 if (net_ratelimit())
2424 printk(KERN_CRIT "Dead loop on virtual device "
2425 "%s, fix it urgently!\n", dev->name);
2426 }
2427 }
2428
2429 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002430 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002431
Linus Torvalds1da177e2005-04-16 15:20:36 -07002432 kfree_skb(skb);
2433 return rc;
2434out:
Herbert Xud4828d82006-06-22 02:28:18 -07002435 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002436 return rc;
2437}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002438EXPORT_SYMBOL(dev_queue_xmit);
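/*
 * Example (illustrative sketch): a sender that has built a complete
 * frame only needs to fill in skb->dev (and normally skb->protocol)
 * before queueing it; "stats" is a hypothetical counter structure.
 *
 *	skb->dev = dev;
 *	skb->protocol = htons(ETH_P_IP);
 *	if (dev_queue_xmit(skb) != NET_XMIT_SUCCESS)
 *		stats->tx_dropped++;	// skb was consumed either way
 */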
Linus Torvalds1da177e2005-04-16 15:20:36 -07002439
2440
2441/*=======================================================================
2442 Receiver routines
2443 =======================================================================*/
2444
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002445int netdev_max_backlog __read_mostly = 1000;
Eric Dumazet3b098e22010-05-15 23:57:10 -07002446int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002447int netdev_budget __read_mostly = 300;
2448int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002449
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002450/* Called with irq disabled */
2451static inline void ____napi_schedule(struct softnet_data *sd,
2452 struct napi_struct *napi)
2453{
2454 list_add_tail(&napi->poll_list, &sd->poll_list);
2455 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2456}
2457
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002458/*
2459 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2460 * and src/dst port numbers. Returns a non-zero hash number on success
2461 * and 0 on failure.
2462 */
2463__u32 __skb_get_rxhash(struct sk_buff *skb)
2464{
Changli Gao12fcdef2010-08-17 19:04:32 +00002465 int nhoff, hash = 0, poff;
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002466 struct ipv6hdr *ip6;
2467 struct iphdr *ip;
2468 u8 ip_proto;
2469 u32 addr1, addr2, ihl;
2470 union {
2471 u32 v32;
2472 u16 v16[2];
2473 } ports;
2474
2475 nhoff = skb_network_offset(skb);
2476
2477 switch (skb->protocol) {
2478 case __constant_htons(ETH_P_IP):
2479 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2480 goto done;
2481
Changli Gao10034892010-08-21 06:13:28 +00002482 ip = (struct iphdr *) (skb->data + nhoff);
Changli Gaodbe57752010-08-17 19:01:38 +00002483 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2484 ip_proto = 0;
2485 else
2486 ip_proto = ip->protocol;
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002487 addr1 = (__force u32) ip->saddr;
2488 addr2 = (__force u32) ip->daddr;
2489 ihl = ip->ihl;
2490 break;
2491 case __constant_htons(ETH_P_IPV6):
2492 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2493 goto done;
2494
Changli Gao10034892010-08-21 06:13:28 +00002495 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002496 ip_proto = ip6->nexthdr;
2497 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2498 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2499 ihl = (40 >> 2);
2500 break;
2501 default:
2502 goto done;
2503 }
2504
Changli Gao12fcdef2010-08-17 19:04:32 +00002505 ports.v32 = 0;
2506 poff = proto_ports_offset(ip_proto);
2507 if (poff >= 0) {
2508 nhoff += ihl * 4 + poff;
2509 if (pskb_may_pull(skb, nhoff + 4)) {
2510 ports.v32 = * (__force u32 *) (skb->data + nhoff);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002511 if (ports.v16[1] < ports.v16[0])
2512 swap(ports.v16[0], ports.v16[1]);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002513 }
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002514 }
2515
2516 /* get a consistent hash (same value on both flow directions) */
2517 if (addr2 < addr1)
2518 swap(addr1, addr2);
2519
2520 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2521 if (!hash)
2522 hash = 1;
2523
2524done:
2525 return hash;
2526}
2527EXPORT_SYMBOL(__skb_get_rxhash);
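/*
 * Example (illustrative sketch): callers normally go through the
 * skb_get_rxhash() wrapper, which caches the result in skb->rxhash:
 *
 *	u32 hash = skb_get_rxhash(skb);
 *	if (hash)
 *		; // use the hash to pick a flow bucket or queue
 */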
2528
Eric Dumazetdf334542010-03-24 19:13:54 +00002529#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002530
2531/* One global table that all flow-based protocols share. */
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002532struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002533EXPORT_SYMBOL(rps_sock_flow_table);
2534
Tom Herbert0a9627f2010-03-16 08:03:29 +00002535/*
2536 * get_rps_cpu is called from netif_receive_skb and returns the target
2537 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002538 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002539 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002540static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2541 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002542{
Tom Herbert0a9627f2010-03-16 08:03:29 +00002543 struct netdev_rx_queue *rxqueue;
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002544 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002545 struct rps_dev_flow_table *flow_table;
2546 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002547 int cpu = -1;
Tom Herbertfec5e652010-04-16 16:01:27 -07002548 u16 tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002549
Tom Herbert0a9627f2010-03-16 08:03:29 +00002550 if (skb_rx_queue_recorded(skb)) {
2551 u16 index = skb_get_rx_queue(skb);
Ben Hutchings62fe0b42010-09-27 08:24:33 +00002552 if (unlikely(index >= dev->real_num_rx_queues)) {
2553 WARN_ONCE(dev->real_num_rx_queues > 1,
2554 "%s received packet on queue %u, but number "
2555 "of RX queues is %u\n",
2556 dev->name, index, dev->real_num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002557 goto done;
2558 }
2559 rxqueue = dev->_rx + index;
2560 } else
2561 rxqueue = dev->_rx;
2562
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002563 map = rcu_dereference(rxqueue->rps_map);
2564 if (map) {
2565 if (map->len == 1) {
Changli Gao6febfca2010-09-03 23:12:37 +00002566 tcpu = map->cpus[0];
2567 if (cpu_online(tcpu))
2568 cpu = tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002569 goto done;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002570 }
Eric Dumazet6e3f7fa2010-10-25 03:02:02 +00002571 } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002572 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002573 }
2574
Changli Gao2d47b452010-08-17 19:00:56 +00002575 skb_reset_network_header(skb);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002576 if (!skb_get_rxhash(skb))
Tom Herbert0a9627f2010-03-16 08:03:29 +00002577 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002578
Tom Herbertfec5e652010-04-16 16:01:27 -07002579 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2580 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2581 if (flow_table && sock_flow_table) {
2582 u16 next_cpu;
2583 struct rps_dev_flow *rflow;
2584
2585 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2586 tcpu = rflow->cpu;
2587
2588 next_cpu = sock_flow_table->ents[skb->rxhash &
2589 sock_flow_table->mask];
2590
2591 /*
2592 * If the desired CPU (where last recvmsg was done) is
2593	 * different from the current CPU (the one in the rx-queue flow
2594 * table entry), switch if one of the following holds:
2595 * - Current CPU is unset (equal to RPS_NO_CPU).
2596 * - Current CPU is offline.
2597 * - The current CPU's queue tail has advanced beyond the
2598 * last packet that was enqueued using this table entry.
2599 * This guarantees that all previous packets for the flow
2600	 * have been dequeued, thus preserving in-order delivery.
2601 */
2602 if (unlikely(tcpu != next_cpu) &&
2603 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2604 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2605 rflow->last_qtail)) >= 0)) {
2606 tcpu = rflow->cpu = next_cpu;
2607 if (tcpu != RPS_NO_CPU)
2608 rflow->last_qtail = per_cpu(softnet_data,
2609 tcpu).input_queue_head;
2610 }
2611 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2612 *rflowp = rflow;
2613 cpu = tcpu;
2614 goto done;
2615 }
2616 }
2617
Tom Herbert0a9627f2010-03-16 08:03:29 +00002618 if (map) {
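		/*
		 * Scale the 32-bit flow hash onto [0, map->len) without a
		 * modulo: (hash * len) >> 32 picks an index in the RPS CPU
		 * map with roughly uniform probability.
		 */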
Tom Herbertfec5e652010-04-16 16:01:27 -07002619 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00002620
2621 if (cpu_online(tcpu)) {
2622 cpu = tcpu;
2623 goto done;
2624 }
2625 }
2626
2627done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00002628 return cpu;
2629}
2630
Tom Herbert0a9627f2010-03-16 08:03:29 +00002631/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002632static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002633{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002634 struct softnet_data *sd = data;
2635
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002636 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00002637 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002638}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002639
Tom Herbertfec5e652010-04-16 16:01:27 -07002640#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00002641
2642/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002643 * Check if this softnet_data structure belongs to another CPU.
2644 * If so, queue it on our IPI list and return 1;
2645 * otherwise return 0.
2646 */
2647static int rps_ipi_queued(struct softnet_data *sd)
2648{
2649#ifdef CONFIG_RPS
2650 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2651
2652 if (sd != mysd) {
2653 sd->rps_ipi_next = mysd->rps_ipi_list;
2654 mysd->rps_ipi_list = sd;
2655
2656 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2657 return 1;
2658 }
2659#endif /* CONFIG_RPS */
2660 return 0;
2661}
2662
2663/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00002664 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2665 * queue (may be a remote CPU queue).
2666 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002667static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2668 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002669{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002670 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002671 unsigned long flags;
2672
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002673 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002674
2675 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002676
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002677 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002678 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2679 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002680enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002681 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002682 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002683 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00002684 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002685 return NET_RX_SUCCESS;
2686 }
2687
Eric Dumazetebda37c22010-05-06 23:51:21 +00002688 /* Schedule NAPI for backlog device
2689	 * We can use a non-atomic operation since we own the queue lock.
2690 */
2691 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002692 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002693 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002694 }
2695 goto enqueue;
2696 }
2697
Changli Gaodee42872010-05-02 05:42:16 +00002698 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002699 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002700
Tom Herbert0a9627f2010-03-16 08:03:29 +00002701 local_irq_restore(flags);
2702
Eric Dumazetcaf586e2010-09-30 21:06:55 +00002703 atomic_long_inc(&skb->dev->rx_dropped);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002704 kfree_skb(skb);
2705 return NET_RX_DROP;
2706}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002707
Linus Torvalds1da177e2005-04-16 15:20:36 -07002708/**
2709 * netif_rx - post buffer to the network code
2710 * @skb: buffer to post
2711 *
2712 * This function receives a packet from a device driver and queues it for
2713 * the upper (protocol) levels to process. It always succeeds. The buffer
2714 * may be dropped during processing for congestion control or by the
2715 * protocol layers.
2716 *
2717 * return values:
2718 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002719 * NET_RX_DROP (packet was dropped)
2720 *
2721 */
2722
2723int netif_rx(struct sk_buff *skb)
2724{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002725 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002726
2727 /* if netpoll wants it, pretend we never saw it */
2728 if (netpoll_rx(skb))
2729 return NET_RX_DROP;
2730
Eric Dumazet3b098e22010-05-15 23:57:10 -07002731 if (netdev_tstamp_prequeue)
2732 net_timestamp_check(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002733
Koki Sanagicf66ba52010-08-23 18:45:02 +09002734 trace_netif_rx(skb);
Eric Dumazetdf334542010-03-24 19:13:54 +00002735#ifdef CONFIG_RPS
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002736 {
Tom Herbertfec5e652010-04-16 16:01:27 -07002737 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002738 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002739
Changli Gaocece1942010-08-07 20:35:43 -07002740 preempt_disable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002741 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07002742
2743 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002744 if (cpu < 0)
2745 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07002746
2747 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2748
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002749 rcu_read_unlock();
Changli Gaocece1942010-08-07 20:35:43 -07002750 preempt_enable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002751 }
2752#else
Tom Herbertfec5e652010-04-16 16:01:27 -07002753 {
2754 unsigned int qtail;
2755 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2756 put_cpu();
2757 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002758#endif
2759 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002760}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002761EXPORT_SYMBOL(netif_rx);
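
/*
 * Illustrative (hypothetical) non-NAPI driver receive path; the names
 * below are invented for this sketch and are not part of this file:
 *
 *	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *	...copy the received frame into the skb...
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * NAPI drivers should instead call netif_receive_skb() from their
 * ->poll() handler (see further below).
 */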
Linus Torvalds1da177e2005-04-16 15:20:36 -07002762
2763int netif_rx_ni(struct sk_buff *skb)
2764{
2765 int err;
2766
2767 preempt_disable();
2768 err = netif_rx(skb);
2769 if (local_softirq_pending())
2770 do_softirq();
2771 preempt_enable();
2772
2773 return err;
2774}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002775EXPORT_SYMBOL(netif_rx_ni);
2776
Linus Torvalds1da177e2005-04-16 15:20:36 -07002777static void net_tx_action(struct softirq_action *h)
2778{
2779 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2780
2781 if (sd->completion_queue) {
2782 struct sk_buff *clist;
2783
2784 local_irq_disable();
2785 clist = sd->completion_queue;
2786 sd->completion_queue = NULL;
2787 local_irq_enable();
2788
2789 while (clist) {
2790 struct sk_buff *skb = clist;
2791 clist = clist->next;
2792
Ilpo Järvinen547b7922008-07-25 21:43:18 -07002793 WARN_ON(atomic_read(&skb->users));
Koki Sanagi07dc22e2010-08-23 18:46:12 +09002794 trace_kfree_skb(skb, net_tx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002795 __kfree_skb(skb);
2796 }
2797 }
2798
2799 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07002800 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002801
2802 local_irq_disable();
2803 head = sd->output_queue;
2804 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00002805 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002806 local_irq_enable();
2807
2808 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07002809 struct Qdisc *q = head;
2810 spinlock_t *root_lock;
2811
Linus Torvalds1da177e2005-04-16 15:20:36 -07002812 head = head->next_sched;
2813
David S. Miller5fb66222008-08-02 20:02:43 -07002814 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07002815 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002816 smp_mb__before_clear_bit();
2817 clear_bit(__QDISC_STATE_SCHED,
2818 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07002819 qdisc_run(q);
2820 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002821 } else {
David S. Miller195648b2008-08-19 04:00:36 -07002822 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002823 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07002824 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002825 } else {
2826 smp_mb__before_clear_bit();
2827 clear_bit(__QDISC_STATE_SCHED,
2828 &q->state);
2829 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002830 }
2831 }
2832 }
2833}
2834
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002835#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2836 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
Michał Mirosławda678292009-06-05 05:35:28 +00002837/* This hook is defined here for ATM LANE */
2838int (*br_fdb_test_addr_hook)(struct net_device *dev,
2839 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07002840EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00002841#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002842
Linus Torvalds1da177e2005-04-16 15:20:36 -07002843#ifdef CONFIG_NET_CLS_ACT
2844/* TODO: Maybe we should just force sch_ingress to be compiled in
2845 * when CONFIG_NET_CLS_ACT is? Otherwise we currently pay for some
2846 * useless instructions (a compare and two extra stores) when it is
2847 * not built but CONFIG_NET_CLS_ACT is enabled.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002848 * NOTE: This doesn't remove any functionality; if you don't have
Linus Torvalds1da177e2005-04-16 15:20:36 -07002849 * the ingress scheduler, you just can't add policies on ingress.
2850 *
2851 */
Eric Dumazet24824a02010-10-02 06:11:55 +00002852static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002853{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002854 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07002855 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07002856 int result = TC_ACT_OK;
2857 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002858
Stephen Hemmingerde384832010-08-01 00:33:23 -07002859 if (unlikely(MAX_RED_LOOP < ttl++)) {
2860 if (net_ratelimit())
2861			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2862 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07002863 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002864 }
2865
Herbert Xuf697c3e2007-10-14 00:38:47 -07002866 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2867 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2868
David S. Miller83874002008-07-17 00:53:03 -07002869 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07002870 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07002871 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07002872 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2873 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07002874 spin_unlock(qdisc_lock(q));
2875 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07002876
Linus Torvalds1da177e2005-04-16 15:20:36 -07002877 return result;
2878}
Herbert Xuf697c3e2007-10-14 00:38:47 -07002879
2880static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2881 struct packet_type **pt_prev,
2882 int *ret, struct net_device *orig_dev)
2883{
Eric Dumazet24824a02010-10-02 06:11:55 +00002884 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2885
2886 if (!rxq || rxq->qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07002887 goto out;
2888
2889 if (*pt_prev) {
2890 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2891 *pt_prev = NULL;
Herbert Xuf697c3e2007-10-14 00:38:47 -07002892 }
2893
Eric Dumazet24824a02010-10-02 06:11:55 +00002894 switch (ing_filter(skb, rxq)) {
Herbert Xuf697c3e2007-10-14 00:38:47 -07002895 case TC_ACT_SHOT:
2896 case TC_ACT_STOLEN:
2897 kfree_skb(skb);
2898 return NULL;
2899 }
2900
2901out:
2902 skb->tc_verd = 0;
2903 return skb;
2904}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002905#endif
2906
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002907/**
2908 * netdev_rx_handler_register - register receive handler
2909 * @dev: device to register a handler for
2910 * @rx_handler: receive handler to register
Jiri Pirko93e2c322010-06-10 03:34:59 +00002911 * @rx_handler_data: data pointer that is used by rx handler
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002912 *
2913 *	Register a receive handler for a device. This handler will then be
2914 * called from __netif_receive_skb. A negative errno code is returned
2915 * on a failure.
2916 *
2917 * The caller must hold the rtnl_mutex.
2918 */
2919int netdev_rx_handler_register(struct net_device *dev,
Jiri Pirko93e2c322010-06-10 03:34:59 +00002920 rx_handler_func_t *rx_handler,
2921 void *rx_handler_data)
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002922{
2923 ASSERT_RTNL();
2924
2925 if (dev->rx_handler)
2926 return -EBUSY;
2927
Jiri Pirko93e2c322010-06-10 03:34:59 +00002928 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002929 rcu_assign_pointer(dev->rx_handler, rx_handler);
2930
2931 return 0;
2932}
2933EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
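
/*
 * Rough usage sketch (hypothetical names, illustration only): a stacked
 * device such as a bridge or macvlan attaches its handler to a lower
 * device from a context that already holds RTNL, e.g. when enslaving
 * a port:
 *
 *	err = netdev_rx_handler_register(port_dev, my_handle_frame,
 *					 my_port_priv);
 *	if (err)
 *		return err;
 *	...
 *	netdev_rx_handler_unregister(port_dev);
 *
 * my_handle_frame() is then invoked from __netif_receive_skb() for
 * every packet received on port_dev and may consume the skb by
 * returning NULL.
 */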
2934
2935/**
2936 * netdev_rx_handler_unregister - unregister receive handler
2937 * @dev: device to unregister a handler from
2938 *
2939 *	Unregister a receive handler from a device.
2940 *
2941 * The caller must hold the rtnl_mutex.
2942 */
2943void netdev_rx_handler_unregister(struct net_device *dev)
2944{
2945
2946 ASSERT_RTNL();
2947 rcu_assign_pointer(dev->rx_handler, NULL);
Jiri Pirko93e2c322010-06-10 03:34:59 +00002948 rcu_assign_pointer(dev->rx_handler_data, NULL);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002949}
2950EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2951
Eric Dumazetacbbc072010-04-11 06:56:11 +00002952static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2953 struct net_device *master)
2954{
2955 if (skb->pkt_type == PACKET_HOST) {
2956 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2957
2958 memcpy(dest, master->dev_addr, ETH_ALEN);
2959 }
2960}
2961
2962/* On bonding slaves other than the currently active slave, suppress
2963 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2964 * ARP on active-backup slaves with arp_validate enabled.
2965 */
2966int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2967{
2968 struct net_device *dev = skb->dev;
2969
2970 if (master->priv_flags & IFF_MASTER_ARPMON)
2971 dev->last_rx = jiffies;
2972
Jiri Pirkof350a0a82010-06-15 06:50:45 +00002973 if ((master->priv_flags & IFF_MASTER_ALB) &&
2974 (master->priv_flags & IFF_BRIDGE_PORT)) {
Eric Dumazetacbbc072010-04-11 06:56:11 +00002975 /* Do address unmangle. The local destination address
2976 * will be always the one master has. Provides the right
2977 * functionality in a bridge.
2978 */
2979 skb_bond_set_mac_by_master(skb, master);
2980 }
2981
2982 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2983 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2984 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2985 return 0;
2986
2987 if (master->priv_flags & IFF_MASTER_ALB) {
2988 if (skb->pkt_type != PACKET_BROADCAST &&
2989 skb->pkt_type != PACKET_MULTICAST)
2990 return 0;
2991 }
2992 if (master->priv_flags & IFF_MASTER_8023AD &&
2993 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2994 return 0;
2995
2996 return 1;
2997 }
2998 return 0;
2999}
3000EXPORT_SYMBOL(__skb_bond_should_drop);
3001
Eric Dumazet10f744d2010-03-28 23:07:20 -07003002static int __netif_receive_skb(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003003{
3004 struct packet_type *ptype, *pt_prev;
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003005 rx_handler_func_t *rx_handler;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003006 struct net_device *orig_dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07003007 struct net_device *master;
Joe Eykholt0d7a3682008-07-02 18:22:01 -07003008 struct net_device *null_or_orig;
John Fastabend2df4a0f2010-05-12 21:31:11 +00003009 struct net_device *orig_or_bond;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003010 int ret = NET_RX_DROP;
Al Viro252e3342006-11-14 20:48:11 -08003011 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003012
Eric Dumazet3b098e22010-05-15 23:57:10 -07003013 if (!netdev_tstamp_prequeue)
3014 net_timestamp_check(skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07003015
Koki Sanagicf66ba52010-08-23 18:45:02 +09003016 trace_netif_receive_skb(skb);
Patrick McHardy9b22ea52008-11-04 14:49:57 -08003017
Linus Torvalds1da177e2005-04-16 15:20:36 -07003018 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003019 if (netpoll_receive_skb(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003020 return NET_RX_DROP;
3021
Eric Dumazet8964be42009-11-20 15:35:04 -08003022 if (!skb->skb_iif)
3023 skb->skb_iif = skb->dev->ifindex;
David S. Miller86e65da2005-08-09 19:36:29 -07003024
John Fastabend597a2642010-06-03 09:30:11 +00003025 /*
3026 * bonding note: skbs received on inactive slaves should only
3027 * be delivered to pkt handlers that are exact matches. Also
3028 * the deliver_no_wcard flag will be set. If packet handlers
3029	 * are sensitive to duplicate packets, these skbs will need to
Jesse Gross3701e512010-10-20 13:56:06 +00003030 * be dropped at the handler.
John Fastabend597a2642010-06-03 09:30:11 +00003031 */
Joe Eykholt0d7a3682008-07-02 18:22:01 -07003032 null_or_orig = NULL;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07003033 orig_dev = skb->dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07003034 master = ACCESS_ONCE(orig_dev->master);
John Fastabend597a2642010-06-03 09:30:11 +00003035 if (skb->deliver_no_wcard)
3036 null_or_orig = orig_dev;
3037 else if (master) {
3038 if (skb_bond_should_drop(skb, master)) {
3039 skb->deliver_no_wcard = 1;
Joe Eykholt0d7a3682008-07-02 18:22:01 -07003040 null_or_orig = orig_dev; /* deliver only exact match */
John Fastabend597a2642010-06-03 09:30:11 +00003041 } else
Eric Dumazet0641e4f2010-03-18 21:16:45 -07003042 skb->dev = master;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07003043 }
Jay Vosburgh8f903c72006-02-21 16:36:44 -08003044
Eric Dumazet27f39c73e2010-05-19 22:07:23 +00003045 __this_cpu_inc(softnet_data.processed);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003046 skb_reset_network_header(skb);
Arnaldo Carvalho de Melobadff6d2007-03-13 13:06:52 -03003047 skb_reset_transport_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07003048 skb->mac_len = skb->network_header - skb->mac_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003049
3050 pt_prev = NULL;
3051
3052 rcu_read_lock();
3053
3054#ifdef CONFIG_NET_CLS_ACT
3055 if (skb->tc_verd & TC_NCLS) {
3056 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3057 goto ncls;
3058 }
3059#endif
3060
3061 list_for_each_entry_rcu(ptype, &ptype_all, list) {
Joe Eykholtf9823072008-07-02 18:22:02 -07003062 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3063 ptype->dev == orig_dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003064 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003065 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003066 pt_prev = ptype;
3067 }
3068 }
3069
3070#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07003071 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3072 if (!skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003073 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003074ncls:
3075#endif
3076
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00003077 /* Handle special case of bridge or macvlan */
3078 rx_handler = rcu_dereference(skb->dev->rx_handler);
3079 if (rx_handler) {
3080 if (pt_prev) {
3081 ret = deliver_skb(skb, pt_prev, orig_dev);
3082 pt_prev = NULL;
3083 }
3084 skb = rx_handler(skb);
3085 if (!skb)
3086 goto out;
3087 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003088
Jesse Gross3701e512010-10-20 13:56:06 +00003089 if (vlan_tx_tag_present(skb)) {
3090 if (pt_prev) {
3091 ret = deliver_skb(skb, pt_prev, orig_dev);
3092 pt_prev = NULL;
3093 }
3094 if (vlan_hwaccel_do_receive(&skb)) {
3095 ret = __netif_receive_skb(skb);
3096 goto out;
3097 } else if (unlikely(!skb))
3098 goto out;
3099 }
3100
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003101 /*
3102 * Make sure frames received on VLAN interfaces stacked on
3103 * bonding interfaces still make their way to any base bonding
3104 * device that may have registered for a specific ptype. The
3105 * handler may have to adjust skb->dev and orig_dev.
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003106 */
John Fastabend2df4a0f2010-05-12 21:31:11 +00003107 orig_or_bond = orig_dev;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003108 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3109 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
John Fastabend2df4a0f2010-05-12 21:31:11 +00003110 orig_or_bond = vlan_dev_real_dev(skb->dev);
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003111 }
3112
Linus Torvalds1da177e2005-04-16 15:20:36 -07003113 type = skb->protocol;
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003114 list_for_each_entry_rcu(ptype,
3115 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00003116 if (ptype->type == type && (ptype->dev == null_or_orig ||
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00003117 ptype->dev == skb->dev || ptype->dev == orig_dev ||
John Fastabend2df4a0f2010-05-12 21:31:11 +00003118 ptype->dev == orig_or_bond)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003119 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003120 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003121 pt_prev = ptype;
3122 }
3123 }
3124
3125 if (pt_prev) {
David S. Millerf2ccd8f2005-08-09 19:34:12 -07003126 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003127 } else {
Eric Dumazetcaf586e2010-09-30 21:06:55 +00003128 atomic_long_inc(&skb->dev->rx_dropped);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003129 kfree_skb(skb);
3130		/* Jamal, now you will not be able to escape explaining
3131		 * to me how you were going to use this. :-)
3132 */
3133 ret = NET_RX_DROP;
3134 }
3135
3136out:
3137 rcu_read_unlock();
3138 return ret;
3139}
Tom Herbert0a9627f2010-03-16 08:03:29 +00003140
3141/**
3142 * netif_receive_skb - process receive buffer from network
3143 * @skb: buffer to process
3144 *
3145 * netif_receive_skb() is the main receive data processing function.
3146 * It always succeeds. The buffer may be dropped during processing
3147 * for congestion control or by the protocol layers.
3148 *
3149 * This function may only be called from softirq context and interrupts
3150 * should be enabled.
3151 *
3152 * Return values (usually ignored):
3153 * NET_RX_SUCCESS: no congestion
3154 * NET_RX_DROP: packet was dropped
3155 */
3156int netif_receive_skb(struct sk_buff *skb)
3157{
Eric Dumazet3b098e22010-05-15 23:57:10 -07003158 if (netdev_tstamp_prequeue)
3159 net_timestamp_check(skb);
3160
Richard Cochranc1f19b52010-07-17 08:49:36 +00003161 if (skb_defer_rx_timestamp(skb))
3162 return NET_RX_SUCCESS;
3163
Eric Dumazetdf334542010-03-24 19:13:54 +00003164#ifdef CONFIG_RPS
Eric Dumazet3b098e22010-05-15 23:57:10 -07003165 {
3166 struct rps_dev_flow voidflow, *rflow = &voidflow;
3167 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00003168
Eric Dumazet3b098e22010-05-15 23:57:10 -07003169 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00003170
Eric Dumazet3b098e22010-05-15 23:57:10 -07003171 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07003172
Eric Dumazet3b098e22010-05-15 23:57:10 -07003173 if (cpu >= 0) {
3174 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3175 rcu_read_unlock();
3176 } else {
3177 rcu_read_unlock();
3178 ret = __netif_receive_skb(skb);
3179 }
3180
3181 return ret;
Tom Herbertfec5e652010-04-16 16:01:27 -07003182 }
Tom Herbert1e94d722010-03-18 17:45:44 -07003183#else
3184 return __netif_receive_skb(skb);
3185#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00003186}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003187EXPORT_SYMBOL(netif_receive_skb);
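
/*
 * Illustrative (hypothetical) NAPI ->poll() routine showing where a
 * driver would call netif_receive_skb(); all names are invented for
 * this sketch:
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = 0;
 *
 *		while (work < budget && my_hw_rx_pending(napi)) {
 *			struct sk_buff *skb = my_hw_fetch_skb(napi);
 *
 *			skb->protocol = eth_type_trans(skb, skb->dev);
 *			netif_receive_skb(skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete(napi);
 *		return work;
 *	}
 */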
Linus Torvalds1da177e2005-04-16 15:20:36 -07003188
Eric Dumazet88751272010-04-19 05:07:33 +00003189/* Network device is going away; flush any packets still pending.
3190 * Called with irqs disabled.
3191 */
Changli Gao152102c2010-03-30 20:16:22 +00003192static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003193{
Changli Gao152102c2010-03-30 20:16:22 +00003194 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003195 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003196 struct sk_buff *skb, *tmp;
3197
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003198 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003199 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003200 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003201 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003202 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003203 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003204 }
Changli Gao6e7676c2010-04-27 15:07:33 -07003205 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003206 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003207
3208 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3209 if (skb->dev == dev) {
3210 __skb_unlink(skb, &sd->process_queue);
3211 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003212 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003213 }
3214 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003215}
3216
Herbert Xud565b0a2008-12-15 23:38:52 -08003217static int napi_gro_complete(struct sk_buff *skb)
3218{
3219 struct packet_type *ptype;
3220 __be16 type = skb->protocol;
3221 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3222 int err = -ENOENT;
3223
Herbert Xufc59f9a2009-04-14 15:11:06 -07003224 if (NAPI_GRO_CB(skb)->count == 1) {
3225 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003226 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07003227 }
Herbert Xud565b0a2008-12-15 23:38:52 -08003228
3229 rcu_read_lock();
3230 list_for_each_entry_rcu(ptype, head, list) {
3231 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3232 continue;
3233
3234 err = ptype->gro_complete(skb);
3235 break;
3236 }
3237 rcu_read_unlock();
3238
3239 if (err) {
3240 WARN_ON(&ptype->list == head);
3241 kfree_skb(skb);
3242 return NET_RX_SUCCESS;
3243 }
3244
3245out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003246 return netif_receive_skb(skb);
3247}
3248
Eric Dumazet86cac582010-08-31 18:25:32 +00003249inline void napi_gro_flush(struct napi_struct *napi)
Herbert Xud565b0a2008-12-15 23:38:52 -08003250{
3251 struct sk_buff *skb, *next;
3252
3253 for (skb = napi->gro_list; skb; skb = next) {
3254 next = skb->next;
3255 skb->next = NULL;
3256 napi_gro_complete(skb);
3257 }
3258
Herbert Xu4ae55442009-02-08 18:00:36 +00003259 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003260 napi->gro_list = NULL;
3261}
Eric Dumazet86cac582010-08-31 18:25:32 +00003262EXPORT_SYMBOL(napi_gro_flush);
Herbert Xud565b0a2008-12-15 23:38:52 -08003263
Ben Hutchings5b252f02009-10-29 07:17:09 +00003264enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003265{
3266 struct sk_buff **pp = NULL;
3267 struct packet_type *ptype;
3268 __be16 type = skb->protocol;
3269 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
Herbert Xu0da2afd52008-12-26 14:57:42 -08003270 int same_flow;
Herbert Xud565b0a2008-12-15 23:38:52 -08003271 int mac_len;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003272 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003273
Jarek Poplawskice9e76c2010-08-05 01:19:11 +00003274 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
Herbert Xud565b0a2008-12-15 23:38:52 -08003275 goto normal;
3276
David S. Miller21dc3302010-08-23 00:13:46 -07003277 if (skb_is_gso(skb) || skb_has_frag_list(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003278 goto normal;
3279
Herbert Xud565b0a2008-12-15 23:38:52 -08003280 rcu_read_lock();
3281 list_for_each_entry_rcu(ptype, head, list) {
Herbert Xud565b0a2008-12-15 23:38:52 -08003282 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3283 continue;
3284
Herbert Xu86911732009-01-29 14:19:50 +00003285 skb_set_network_header(skb, skb_gro_offset(skb));
Herbert Xud565b0a2008-12-15 23:38:52 -08003286 mac_len = skb->network_header - skb->mac_header;
3287 skb->mac_len = mac_len;
3288 NAPI_GRO_CB(skb)->same_flow = 0;
3289 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003290 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003291
Herbert Xud565b0a2008-12-15 23:38:52 -08003292 pp = ptype->gro_receive(&napi->gro_list, skb);
3293 break;
3294 }
3295 rcu_read_unlock();
3296
3297 if (&ptype->list == head)
3298 goto normal;
3299
Herbert Xu0da2afd52008-12-26 14:57:42 -08003300 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003301 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003302
Herbert Xud565b0a2008-12-15 23:38:52 -08003303 if (pp) {
3304 struct sk_buff *nskb = *pp;
3305
3306 *pp = nskb->next;
3307 nskb->next = NULL;
3308 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003309 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003310 }
3311
Herbert Xu0da2afd52008-12-26 14:57:42 -08003312 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003313 goto ok;
3314
Herbert Xu4ae55442009-02-08 18:00:36 +00003315 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003316 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003317
Herbert Xu4ae55442009-02-08 18:00:36 +00003318 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003319 NAPI_GRO_CB(skb)->count = 1;
Herbert Xu86911732009-01-29 14:19:50 +00003320 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003321 skb->next = napi->gro_list;
3322 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003323 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003324
Herbert Xuad0f9902009-02-01 01:24:55 -08003325pull:
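	/*
	 * If header bytes were parsed directly out of frag0 (see
	 * skb_gro_reset_offset()) but are not yet in the linear area,
	 * copy them there and shrink the first fragment so later layers
	 * see a conventional skb layout.
	 */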
Herbert Xucb189782009-05-26 18:50:31 +00003326 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3327 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3328
3329 BUG_ON(skb->end - skb->tail < grow);
3330
3331 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3332
3333 skb->tail += grow;
3334 skb->data_len -= grow;
3335
3336 skb_shinfo(skb)->frags[0].page_offset += grow;
3337 skb_shinfo(skb)->frags[0].size -= grow;
3338
3339 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3340 put_page(skb_shinfo(skb)->frags[0].page);
3341 memmove(skb_shinfo(skb)->frags,
3342 skb_shinfo(skb)->frags + 1,
Jarek Poplawskie5093ae2010-08-11 02:02:10 +00003343 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
Herbert Xucb189782009-05-26 18:50:31 +00003344 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003345 }
3346
Herbert Xud565b0a2008-12-15 23:38:52 -08003347ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003348 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003349
3350normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003351 ret = GRO_NORMAL;
3352 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003353}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003354EXPORT_SYMBOL(dev_gro_receive);
3355
Eric Dumazet40d08022010-08-26 22:03:08 -07003356static inline gro_result_t
Ben Hutchings5b252f02009-10-29 07:17:09 +00003357__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003358{
3359 struct sk_buff *p;
3360
3361 for (p = napi->gro_list; p; p = p->next) {
Eric Dumazet40d08022010-08-26 22:03:08 -07003362 unsigned long diffs;
3363
3364 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
Jesse Gross3701e512010-10-20 13:56:06 +00003365 diffs |= p->vlan_tci ^ skb->vlan_tci;
Eric Dumazet40d08022010-08-26 22:03:08 -07003366 diffs |= compare_ether_header(skb_mac_header(p),
Joe Perchesf64f9e72009-11-29 16:55:45 -08003367 skb_gro_mac_header(skb));
Eric Dumazet40d08022010-08-26 22:03:08 -07003368 NAPI_GRO_CB(p)->same_flow = !diffs;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003369 NAPI_GRO_CB(p)->flush = 0;
3370 }
3371
3372 return dev_gro_receive(napi, skb);
3373}
Herbert Xu5d38a072009-01-04 16:13:40 -08003374
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003375gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003376{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003377 switch (ret) {
3378 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003379 if (netif_receive_skb(skb))
3380 ret = GRO_DROP;
3381 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003382
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003383 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003384 case GRO_MERGED_FREE:
Herbert Xu5d38a072009-01-04 16:13:40 -08003385 kfree_skb(skb);
3386 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003387
3388 case GRO_HELD:
3389 case GRO_MERGED:
3390 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003391 }
3392
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003393 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003394}
3395EXPORT_SYMBOL(napi_skb_finish);
3396
Herbert Xu78a478d2009-05-26 18:50:21 +00003397void skb_gro_reset_offset(struct sk_buff *skb)
3398{
3399 NAPI_GRO_CB(skb)->data_offset = 0;
3400 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003401 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003402
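	/*
	 * If the skb has no bytes in its linear area (mac_header == tail)
	 * and the first fragment is in lowmem, the headers live in that
	 * fragment; record a direct pointer and length so GRO can inspect
	 * them without pulling data into the linear area.
	 */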
Herbert Xu78d3fd02009-05-26 18:50:23 +00003403 if (skb->mac_header == skb->tail &&
Herbert Xu74895942009-05-26 18:50:27 +00003404 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
Herbert Xu78a478d2009-05-26 18:50:21 +00003405 NAPI_GRO_CB(skb)->frag0 =
3406 page_address(skb_shinfo(skb)->frags[0].page) +
3407 skb_shinfo(skb)->frags[0].page_offset;
Herbert Xu74895942009-05-26 18:50:27 +00003408 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3409 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003410}
3411EXPORT_SYMBOL(skb_gro_reset_offset);
3412
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003413gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003414{
Herbert Xu86911732009-01-29 14:19:50 +00003415 skb_gro_reset_offset(skb);
3416
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003417 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003418}
3419EXPORT_SYMBOL(napi_gro_receive);
3420
stephen hemmingerd0c2b0d2010-10-19 07:12:10 +00003421static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003422{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003423 __skb_pull(skb, skb_headlen(skb));
3424 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
Jesse Gross3701e512010-10-20 13:56:06 +00003425 skb->vlan_tci = 0;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003426
3427 napi->skb = skb;
3428}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003429
Herbert Xu76620aa2009-04-16 02:02:07 -07003430struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003431{
Herbert Xu5d38a072009-01-04 16:13:40 -08003432 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003433
3434 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003435 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3436 if (skb)
3437 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003438 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003439 return skb;
3440}
Herbert Xu76620aa2009-04-16 02:02:07 -07003441EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003442
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003443gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3444 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003445{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003446 switch (ret) {
3447 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003448 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003449 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003450
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003451 if (ret == GRO_HELD)
3452 skb_gro_pull(skb, -ETH_HLEN);
3453 else if (netif_receive_skb(skb))
3454 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003455 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003456
3457 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003458 case GRO_MERGED_FREE:
3459 napi_reuse_skb(napi, skb);
3460 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003461
3462 case GRO_MERGED:
3463 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003464 }
3465
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003466 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003467}
3468EXPORT_SYMBOL(napi_frags_finish);
3469
Herbert Xu76620aa2009-04-16 02:02:07 -07003470struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003471{
Herbert Xu76620aa2009-04-16 02:02:07 -07003472 struct sk_buff *skb = napi->skb;
3473 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00003474 unsigned int hlen;
3475 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07003476
3477 napi->skb = NULL;
3478
3479 skb_reset_mac_header(skb);
3480 skb_gro_reset_offset(skb);
3481
Herbert Xua5b1cf22009-05-26 18:50:28 +00003482 off = skb_gro_offset(skb);
3483 hlen = off + sizeof(*eth);
3484 eth = skb_gro_header_fast(skb, off);
3485 if (skb_gro_header_hard(skb, hlen)) {
3486 eth = skb_gro_header_slow(skb, hlen, off);
3487 if (unlikely(!eth)) {
3488 napi_reuse_skb(napi, skb);
3489 skb = NULL;
3490 goto out;
3491 }
Herbert Xu76620aa2009-04-16 02:02:07 -07003492 }
3493
3494 skb_gro_pull(skb, sizeof(*eth));
3495
3496 /*
3497 * This works because the only protocols we care about don't require
3498 * special handling. We'll fix it up properly at the end.
3499 */
3500 skb->protocol = eth->h_proto;
3501
3502out:
3503 return skb;
3504}
3505EXPORT_SYMBOL(napi_frags_skb);
3506
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003507gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07003508{
3509 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003510
3511 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003512 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003513
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003514 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08003515}
3516EXPORT_SYMBOL(napi_gro_frags);
3517
Eric Dumazete326bed2010-04-22 00:22:45 -07003518/*
3519 * net_rps_action sends any pending IPI's for rps.
3520 * Note: called with local irq disabled, but exits with local irq enabled.
3521 */
3522static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3523{
3524#ifdef CONFIG_RPS
3525 struct softnet_data *remsd = sd->rps_ipi_list;
3526
3527 if (remsd) {
3528 sd->rps_ipi_list = NULL;
3529
3530 local_irq_enable();
3531
3532 /* Send pending IPI's to kick RPS processing on remote cpus. */
3533 while (remsd) {
3534 struct softnet_data *next = remsd->rps_ipi_next;
3535
3536 if (cpu_online(remsd->cpu))
3537 __smp_call_function_single(remsd->cpu,
3538 &remsd->csd, 0);
3539 remsd = next;
3540 }
3541 } else
3542#endif
3543 local_irq_enable();
3544}
3545
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003546static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003547{
3548 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003549 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003550
Eric Dumazete326bed2010-04-22 00:22:45 -07003551#ifdef CONFIG_RPS
3552	/* Check if we have pending IPIs; it's better to send them now
3553	 * rather than waiting for net_rx_action() to end.
3554 */
3555 if (sd->rps_ipi_list) {
3556 local_irq_disable();
3557 net_rps_action_and_irq_enable(sd);
3558 }
3559#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003560 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07003561 local_irq_disable();
3562 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003563 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07003564 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003565
Changli Gao6e7676c2010-04-27 15:07:33 -07003566 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07003567 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07003568 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07003569 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00003570 input_queue_head_incr(sd);
3571 if (++work >= quota) {
3572 local_irq_enable();
3573 return work;
3574 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003575 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003576
Changli Gao6e7676c2010-04-27 15:07:33 -07003577 rps_lock(sd);
3578 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003579 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07003580 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3581 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003582
Changli Gao6e7676c2010-04-27 15:07:33 -07003583 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003584 /*
3585 * Inline a custom version of __napi_complete().
3586			 * Only the current cpu owns and manipulates this napi,
3587			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3588			 * We can use a plain write instead of clear_bit(),
3589			 * and we don't need an smp_mb() memory barrier.
3590 */
3591 list_del(&napi->poll_list);
3592 napi->state = 0;
3593
Changli Gao6e7676c2010-04-27 15:07:33 -07003594 quota = work + qlen;
3595 }
3596 rps_unlock(sd);
3597 }
3598 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003599
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003600 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003601}
3602
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003603/**
3604 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07003605 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003606 *
3607 * The entry's receive function will be scheduled to run
3608 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08003609void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003610{
3611 unsigned long flags;
3612
3613 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003614 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003615 local_irq_restore(flags);
3616}
3617EXPORT_SYMBOL(__napi_schedule);
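
/*
 * Drivers normally reach this through the napi_schedule() wrapper; a
 * hypothetical RX interrupt handler (names invented for illustration)
 * might look like:
 *
 *	static irqreturn_t my_rx_irq(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		my_hw_mask_rx_irq(priv);
 *		if (napi_schedule_prep(&priv->napi))
 *			__napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 */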
3618
Herbert Xud565b0a2008-12-15 23:38:52 -08003619void __napi_complete(struct napi_struct *n)
3620{
3621 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3622 BUG_ON(n->gro_list);
3623
3624 list_del(&n->poll_list);
3625 smp_mb__before_clear_bit();
3626 clear_bit(NAPI_STATE_SCHED, &n->state);
3627}
3628EXPORT_SYMBOL(__napi_complete);
3629
3630void napi_complete(struct napi_struct *n)
3631{
3632 unsigned long flags;
3633
3634 /*
3635 * don't let napi dequeue from the cpu poll list
3636	 * just in case it's running on a different cpu
3637 */
3638 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3639 return;
3640
3641 napi_gro_flush(n);
3642 local_irq_save(flags);
3643 __napi_complete(n);
3644 local_irq_restore(flags);
3645}
3646EXPORT_SYMBOL(napi_complete);
3647
3648void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3649 int (*poll)(struct napi_struct *, int), int weight)
3650{
3651 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00003652 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003653 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08003654 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003655 napi->poll = poll;
3656 napi->weight = weight;
3657 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08003658 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08003659#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08003660 spin_lock_init(&napi->poll_lock);
3661 napi->poll_owner = -1;
3662#endif
3663 set_bit(NAPI_STATE_SCHED, &napi->state);
3664}
3665EXPORT_SYMBOL(netif_napi_add);
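
/*
 * Setup-time usage sketch (hypothetical driver, illustration only):
 *
 *	netif_napi_add(netdev, &priv->napi, my_poll, 64);
 *
 * with the matching teardown, before the net_device is freed:
 *
 *	netif_napi_del(&priv->napi);
 */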
3666
3667void netif_napi_del(struct napi_struct *napi)
3668{
3669 struct sk_buff *skb, *next;
3670
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08003671 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07003672 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08003673
3674 for (skb = napi->gro_list; skb; skb = next) {
3675 next = skb->next;
3676 skb->next = NULL;
3677 kfree_skb(skb);
3678 }
3679
3680 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00003681 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003682}
3683EXPORT_SYMBOL(netif_napi_del);
3684
Linus Torvalds1da177e2005-04-16 15:20:36 -07003685static void net_rx_action(struct softirq_action *h)
3686{
Eric Dumazete326bed2010-04-22 00:22:45 -07003687 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003688 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07003689 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07003690 void *have;
3691
Linus Torvalds1da177e2005-04-16 15:20:36 -07003692 local_irq_disable();
3693
Eric Dumazete326bed2010-04-22 00:22:45 -07003694 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003695 struct napi_struct *n;
3696 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003697
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003698		/* If the softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003699		 * Allow this to run for up to 2 jiffies, which allows
3700		 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003701 */
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003702 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003703 goto softnet_break;
3704
3705 local_irq_enable();
3706
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003707 /* Even though interrupts have been re-enabled, this
3708 * access is safe because interrupts can only add new
3709 * entries to the tail of this list, and only ->poll()
3710 * calls can remove this head entry from the list.
3711 */
Eric Dumazete326bed2010-04-22 00:22:45 -07003712 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003713
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003714 have = netpoll_poll_lock(n);
3715
3716 weight = n->weight;
3717
David S. Miller0a7606c2007-10-29 21:28:47 -07003718 /* This NAPI_STATE_SCHED test is for avoiding a race
3719 * with netpoll's poll_napi(). Only the entity which
3720 * obtains the lock and sees NAPI_STATE_SCHED set will
3721 * actually make the ->poll() call. Therefore we avoid
3722		 * accidentally calling ->poll() when NAPI is not scheduled.
3723 */
3724 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00003725 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07003726 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00003727 trace_napi_poll(n);
3728 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003729
3730 WARN_ON_ONCE(work > weight);
3731
3732 budget -= work;
3733
3734 local_irq_disable();
3735
3736 /* Drivers must not modify the NAPI state if they
3737 * consume the entire weight. In such cases this code
3738 * still "owns" the NAPI instance and therefore can
3739 * move the instance around on the list at-will.
3740 */
David S. Millerfed17f32008-01-07 21:00:40 -08003741 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07003742 if (unlikely(napi_disable_pending(n))) {
3743 local_irq_enable();
3744 napi_complete(n);
3745 local_irq_disable();
3746 } else
Eric Dumazete326bed2010-04-22 00:22:45 -07003747 list_move_tail(&n->poll_list, &sd->poll_list);
David S. Millerfed17f32008-01-07 21:00:40 -08003748 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003749
3750 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003751 }
3752out:
Eric Dumazete326bed2010-04-22 00:22:45 -07003753 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003754
Chris Leechdb217332006-06-17 21:24:58 -07003755#ifdef CONFIG_NET_DMA
3756 /*
3757 * There may not be any more sk_buffs coming right now, so push
3758 * any pending DMA copies to hardware
3759 */
Dan Williams2ba05622009-01-06 11:38:14 -07003760 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07003761#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003762
Linus Torvalds1da177e2005-04-16 15:20:36 -07003763 return;
3764
3765softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00003766 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003767 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3768 goto out;
3769}
3770
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003771static gifconf_func_t *gifconf_list[NPROTO];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003772
3773/**
3774 *	register_gifconf	-	register a SIOCGIFCONF handler
3775 * @family: Address family
3776 * @gifconf: Function handler
3777 *
3778 * Register protocol dependent address dumping routines. The handler
3779 * that is passed must not be freed or reused until it has been replaced
3780 * by another handler.
3781 */
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003782int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003783{
3784 if (family >= NPROTO)
3785 return -EINVAL;
3786 gifconf_list[family] = gifconf;
3787 return 0;
3788}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003789EXPORT_SYMBOL(register_gifconf);
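
/*
 * For reference, an address family typically hooks in once at init
 * time, e.g. (sketch; the IPv4 handler lives in net/ipv4/devinet.c):
 *
 *	register_gifconf(PF_INET, inet_gifconf);
 */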
Linus Torvalds1da177e2005-04-16 15:20:36 -07003790
3791
3792/*
3793 * Map an interface index to its name (SIOCGIFNAME)
3794 */
3795
3796/*
3797 * We need this ioctl for efficient implementation of the
3798 * if_indextoname() function required by the IPv6 API. Without
3799 * it, we would have to search all the interfaces to find a
3800 * match. --pb
3801 */
3802
Eric W. Biederman881d9662007-09-17 11:56:21 -07003803static int dev_ifname(struct net *net, struct ifreq __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003804{
3805 struct net_device *dev;
3806 struct ifreq ifr;
3807
3808 /*
3809 * Fetch the caller's info block.
3810 */
3811
3812 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3813 return -EFAULT;
3814
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003815 rcu_read_lock();
3816 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003817 if (!dev) {
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003818 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003819 return -ENODEV;
3820 }
3821
3822 strcpy(ifr.ifr_name, dev->name);
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003823 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003824
3825 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3826 return -EFAULT;
3827 return 0;
3828}
3829
3830/*
3831 * Perform a SIOCGIFCONF call. This structure will change
3832 * size eventually, and there is nothing I can do about it.
3833 * Thus we will need a 'compatibility mode'.
3834 */
3835
Eric W. Biederman881d9662007-09-17 11:56:21 -07003836static int dev_ifconf(struct net *net, char __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003837{
3838 struct ifconf ifc;
3839 struct net_device *dev;
3840 char __user *pos;
3841 int len;
3842 int total;
3843 int i;
3844
3845 /*
3846 * Fetch the caller's info block.
3847 */
3848
3849 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3850 return -EFAULT;
3851
3852 pos = ifc.ifc_buf;
3853 len = ifc.ifc_len;
3854
3855 /*
3856 * Loop over the interfaces, and write an info block for each.
3857 */
3858
3859 total = 0;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003860 for_each_netdev(net, dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003861 for (i = 0; i < NPROTO; i++) {
3862 if (gifconf_list[i]) {
3863 int done;
3864 if (!pos)
3865 done = gifconf_list[i](dev, NULL, 0);
3866 else
3867 done = gifconf_list[i](dev, pos + total,
3868 len - total);
3869 if (done < 0)
3870 return -EFAULT;
3871 total += done;
3872 }
3873 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003874 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003875
3876 /*
3877 * All done. Write the updated control block back to the caller.
3878 */
3879 ifc.ifc_len = total;
3880
3881 /*
3882 * Both BSD and Solaris return 0 here, so we do too.
3883 */
3884 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3885}
3886
3887#ifdef CONFIG_PROC_FS
3888/*
3889 * This is invoked by the /proc filesystem handler to display a device
3890 * in detail.
3891 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003892void *dev_seq_start(struct seq_file *seq, loff_t *pos)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003893 __acquires(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003894{
Denis V. Luneve372c412007-11-19 22:31:54 -08003895 struct net *net = seq_file_net(seq);
Pavel Emelianov7562f872007-05-03 15:13:45 -07003896 loff_t off;
3897 struct net_device *dev;
3898
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003899 rcu_read_lock();
Pavel Emelianov7562f872007-05-03 15:13:45 -07003900 if (!*pos)
3901 return SEQ_START_TOKEN;
3902
3903 off = 1;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003904 for_each_netdev_rcu(net, dev)
Pavel Emelianov7562f872007-05-03 15:13:45 -07003905 if (off++ == *pos)
3906 return dev;
3907
3908 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003909}
3910
3911void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3912{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003913 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3914 first_net_device(seq_file_net(seq)) :
3915 next_net_device((struct net_device *)v);
3916
Linus Torvalds1da177e2005-04-16 15:20:36 -07003917 ++*pos;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003918 return rcu_dereference(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003919}
3920
3921void dev_seq_stop(struct seq_file *seq, void *v)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003922 __releases(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003923{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003924 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003925}
3926
3927static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3928{
Eric Dumazet28172732010-07-07 14:58:56 -07003929 struct rtnl_link_stats64 temp;
3930 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003931
Ben Hutchingsbe1f3c22010-06-08 07:19:54 +00003932 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3933 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
Rusty Russell5a1b5892007-04-28 21:04:03 -07003934 dev->name, stats->rx_bytes, stats->rx_packets,
3935 stats->rx_errors,
3936 stats->rx_dropped + stats->rx_missed_errors,
3937 stats->rx_fifo_errors,
3938 stats->rx_length_errors + stats->rx_over_errors +
3939 stats->rx_crc_errors + stats->rx_frame_errors,
3940 stats->rx_compressed, stats->multicast,
3941 stats->tx_bytes, stats->tx_packets,
3942 stats->tx_errors, stats->tx_dropped,
3943 stats->tx_fifo_errors, stats->collisions,
3944 stats->tx_carrier_errors +
3945 stats->tx_aborted_errors +
3946 stats->tx_window_errors +
3947 stats->tx_heartbeat_errors,
3948 stats->tx_compressed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003949}
3950
3951/*
3952 * Called from the PROCfs module. This now uses the new arbitrary sized
3953 * /proc/net interface to create /proc/net/dev
3954 */
3955static int dev_seq_show(struct seq_file *seq, void *v)
3956{
3957 if (v == SEQ_START_TOKEN)
3958 seq_puts(seq, "Inter-| Receive "
3959 " | Transmit\n"
3960 " face |bytes packets errs drop fifo frame "
3961 "compressed multicast|bytes packets errs "
3962 "drop fifo colls carrier compressed\n");
3963 else
3964 dev_seq_printf_stats(seq, v);
3965 return 0;
3966}
3967
Changli Gaodee42872010-05-02 05:42:16 +00003968static struct softnet_data *softnet_get_online(loff_t *pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003969{
Changli Gaodee42872010-05-02 05:42:16 +00003970 struct softnet_data *sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003971
Mike Travis0c0b0ac2008-05-02 16:43:08 -07003972 while (*pos < nr_cpu_ids)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003973 if (cpu_online(*pos)) {
Changli Gaodee42872010-05-02 05:42:16 +00003974 sd = &per_cpu(softnet_data, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003975 break;
3976 } else
3977 ++*pos;
Changli Gaodee42872010-05-02 05:42:16 +00003978 return sd;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003979}
3980
3981static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3982{
3983 return softnet_get_online(pos);
3984}
3985
3986static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3987{
3988 ++*pos;
3989 return softnet_get_online(pos);
3990}
3991
3992static void softnet_seq_stop(struct seq_file *seq, void *v)
3993{
3994}
3995
3996static int softnet_seq_show(struct seq_file *seq, void *v)
3997{
Changli Gaodee42872010-05-02 05:42:16 +00003998 struct softnet_data *sd = v;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003999
Tom Herbert0a9627f2010-03-16 08:03:29 +00004000 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
Changli Gaodee42872010-05-02 05:42:16 +00004001 sd->processed, sd->dropped, sd->time_squeeze, 0,
Stephen Hemmingerc1ebcdb2005-06-23 20:08:59 -07004002 0, 0, 0, 0, /* was fastroute */
Changli Gaodee42872010-05-02 05:42:16 +00004003 sd->cpu_collision, sd->received_rps);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004004 return 0;
4005}
4006
Stephen Hemmingerf6908082007-03-12 14:34:29 -07004007static const struct seq_operations dev_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004008 .start = dev_seq_start,
4009 .next = dev_seq_next,
4010 .stop = dev_seq_stop,
4011 .show = dev_seq_show,
4012};
4013
4014static int dev_seq_open(struct inode *inode, struct file *file)
4015{
Denis V. Luneve372c412007-11-19 22:31:54 -08004016 return seq_open_net(inode, file, &dev_seq_ops,
4017 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07004018}
4019
Arjan van de Ven9a321442007-02-12 00:55:35 -08004020static const struct file_operations dev_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004021 .owner = THIS_MODULE,
4022 .open = dev_seq_open,
4023 .read = seq_read,
4024 .llseek = seq_lseek,
Denis V. Luneve372c412007-11-19 22:31:54 -08004025 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07004026};
4027
Stephen Hemmingerf6908082007-03-12 14:34:29 -07004028static const struct seq_operations softnet_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004029 .start = softnet_seq_start,
4030 .next = softnet_seq_next,
4031 .stop = softnet_seq_stop,
4032 .show = softnet_seq_show,
4033};
4034
4035static int softnet_seq_open(struct inode *inode, struct file *file)
4036{
4037 return seq_open(file, &softnet_seq_ops);
4038}
4039
Arjan van de Ven9a321442007-02-12 00:55:35 -08004040static const struct file_operations softnet_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004041 .owner = THIS_MODULE,
4042 .open = softnet_seq_open,
4043 .read = seq_read,
4044 .llseek = seq_lseek,
4045 .release = seq_release,
4046};
4047
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004048static void *ptype_get_idx(loff_t pos)
4049{
4050 struct packet_type *pt = NULL;
4051 loff_t i = 0;
4052 int t;
4053
4054 list_for_each_entry_rcu(pt, &ptype_all, list) {
4055 if (i == pos)
4056 return pt;
4057 ++i;
4058 }
4059
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08004060 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004061 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4062 if (i == pos)
4063 return pt;
4064 ++i;
4065 }
4066 }
4067 return NULL;
4068}
4069
4070static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
Stephen Hemminger72348a42008-01-21 02:27:29 -08004071 __acquires(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004072{
4073 rcu_read_lock();
4074 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4075}
4076
4077static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4078{
4079 struct packet_type *pt;
4080 struct list_head *nxt;
4081 int hash;
4082
4083 ++*pos;
4084 if (v == SEQ_START_TOKEN)
4085 return ptype_get_idx(0);
4086
4087 pt = v;
4088 nxt = pt->list.next;
4089 if (pt->type == htons(ETH_P_ALL)) {
4090 if (nxt != &ptype_all)
4091 goto found;
4092 hash = 0;
4093 nxt = ptype_base[0].next;
4094 } else
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08004095 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004096
4097 while (nxt == &ptype_base[hash]) {
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08004098 if (++hash >= PTYPE_HASH_SIZE)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004099 return NULL;
4100 nxt = ptype_base[hash].next;
4101 }
4102found:
4103 return list_entry(nxt, struct packet_type, list);
4104}
4105
4106static void ptype_seq_stop(struct seq_file *seq, void *v)
Stephen Hemminger72348a42008-01-21 02:27:29 -08004107 __releases(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004108{
4109 rcu_read_unlock();
4110}
4111
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004112static int ptype_seq_show(struct seq_file *seq, void *v)
4113{
4114 struct packet_type *pt = v;
4115
4116 if (v == SEQ_START_TOKEN)
4117 seq_puts(seq, "Type Device Function\n");
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09004118 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004119 if (pt->type == htons(ETH_P_ALL))
4120 seq_puts(seq, "ALL ");
4121 else
4122 seq_printf(seq, "%04x", ntohs(pt->type));
4123
Alexey Dobriyan908cd2d2008-11-16 19:50:35 -08004124 seq_printf(seq, " %-8s %pF\n",
4125 pt->dev ? pt->dev->name : "", pt->func);
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004126 }
4127
4128 return 0;
4129}
4130
4131static const struct seq_operations ptype_seq_ops = {
4132 .start = ptype_seq_start,
4133 .next = ptype_seq_next,
4134 .stop = ptype_seq_stop,
4135 .show = ptype_seq_show,
4136};
4137
4138static int ptype_seq_open(struct inode *inode, struct file *file)
4139{
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07004140 return seq_open_net(inode, file, &ptype_seq_ops,
4141 sizeof(struct seq_net_private));
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004142}
4143
4144static const struct file_operations ptype_seq_fops = {
4145 .owner = THIS_MODULE,
4146 .open = ptype_seq_open,
4147 .read = seq_read,
4148 .llseek = seq_lseek,
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07004149 .release = seq_release_net,
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004150};
4151
4152
Pavel Emelyanov46650792007-10-08 20:38:39 -07004153static int __net_init dev_proc_net_init(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004154{
4155 int rc = -ENOMEM;
4156
Eric W. Biederman881d9662007-09-17 11:56:21 -07004157 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004158 goto out;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004159 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004160 goto out_dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004161 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02004162 goto out_softnet;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07004163
Eric W. Biederman881d9662007-09-17 11:56:21 -07004164 if (wext_proc_init(net))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02004165 goto out_ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004166 rc = 0;
4167out:
4168 return rc;
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02004169out_ptype:
Eric W. Biederman881d9662007-09-17 11:56:21 -07004170 proc_net_remove(net, "ptype");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004171out_softnet:
Eric W. Biederman881d9662007-09-17 11:56:21 -07004172 proc_net_remove(net, "softnet_stat");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004173out_dev:
Eric W. Biederman881d9662007-09-17 11:56:21 -07004174 proc_net_remove(net, "dev");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004175 goto out;
4176}
Eric W. Biederman881d9662007-09-17 11:56:21 -07004177
Pavel Emelyanov46650792007-10-08 20:38:39 -07004178static void __net_exit dev_proc_net_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07004179{
4180 wext_proc_exit(net);
4181
4182 proc_net_remove(net, "ptype");
4183 proc_net_remove(net, "softnet_stat");
4184 proc_net_remove(net, "dev");
4185}
4186
Denis V. Lunev022cbae2007-11-13 03:23:50 -08004187static struct pernet_operations __net_initdata dev_proc_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07004188 .init = dev_proc_net_init,
4189 .exit = dev_proc_net_exit,
4190};
4191
4192static int __init dev_proc_init(void)
4193{
4194 return register_pernet_subsys(&dev_proc_ops);
4195}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004196#else
4197#define dev_proc_init() 0
4198#endif /* CONFIG_PROC_FS */
4199
4200
4201/**
4202 * netdev_set_master - set up master/slave pair
4203 * @slave: slave device
4204 * @master: new master device
4205 *
4206 * Changes the master device of the slave. Pass %NULL to break the
4207 * bonding. The caller must hold the RTNL semaphore. On a failure
4208 * a negative errno code is returned. On success the reference counts
4209 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4210 * function returns zero.
4211 */
4212int netdev_set_master(struct net_device *slave, struct net_device *master)
4213{
4214 struct net_device *old = slave->master;
4215
4216 ASSERT_RTNL();
4217
4218 if (master) {
4219 if (old)
4220 return -EBUSY;
4221 dev_hold(master);
4222 }
4223
4224 slave->master = master;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004225
Eric Dumazet283f2fe2010-03-18 13:37:40 +00004226 if (old) {
4227 synchronize_net();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004228 dev_put(old);
Eric Dumazet283f2fe2010-03-18 13:37:40 +00004229 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004230 if (master)
4231 slave->flags |= IFF_SLAVE;
4232 else
4233 slave->flags &= ~IFF_SLAVE;
4234
4235 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4236 return 0;
4237}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004238EXPORT_SYMBOL(netdev_set_master);
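
/*
 * Illustrative sketch, kept under #if 0 so it is never built: how a
 * hypothetical bonding-style driver might attach and detach a slave with
 * netdev_set_master(). The example_* names are invented for illustration.
 */
#if 0
static int example_enslave(struct net_device *bond, struct net_device *slave)
{
	int err;

	rtnl_lock();			/* netdev_set_master() requires RTNL */
	err = netdev_set_master(slave, bond);
	rtnl_unlock();
	return err;
}

static void example_release(struct net_device *slave)
{
	rtnl_lock();
	netdev_set_master(slave, NULL);	/* NULL breaks the bonding */
	rtnl_unlock();
}
#endif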
Linus Torvalds1da177e2005-04-16 15:20:36 -07004239
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004240static void dev_change_rx_flags(struct net_device *dev, int flags)
4241{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004242 const struct net_device_ops *ops = dev->netdev_ops;
4243
4244 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4245 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004246}
4247
Wang Chendad9b332008-06-18 01:48:28 -07004248static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07004249{
4250 unsigned short old_flags = dev->flags;
David Howells8192b0c2008-11-14 10:39:10 +11004251 uid_t uid;
4252 gid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07004253
Patrick McHardy24023452007-07-14 18:51:31 -07004254 ASSERT_RTNL();
4255
Wang Chendad9b332008-06-18 01:48:28 -07004256 dev->flags |= IFF_PROMISC;
4257 dev->promiscuity += inc;
4258 if (dev->promiscuity == 0) {
4259 /*
4260 * Avoid overflow.
4261 * If inc causes overflow, untouch promisc and return error.
4262 */
4263 if (inc < 0)
4264 dev->flags &= ~IFF_PROMISC;
4265 else {
4266 dev->promiscuity -= inc;
4267 printk(KERN_WARNING "%s: promiscuity touches roof, "
4268 "set promiscuity failed, promiscuity feature "
4269 "of device might be broken.\n", dev->name);
4270 return -EOVERFLOW;
4271 }
4272 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004273 if (dev->flags != old_flags) {
4274 printk(KERN_INFO "device %s %s promiscuous mode\n",
4275 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4276 "left");
David Howells8192b0c2008-11-14 10:39:10 +11004277 if (audit_enabled) {
4278 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004279 audit_log(current->audit_context, GFP_ATOMIC,
4280 AUDIT_ANOM_PROMISCUOUS,
4281 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4282 dev->name, (dev->flags & IFF_PROMISC),
4283 (old_flags & IFF_PROMISC),
4284 audit_get_loginuid(current),
David Howells8192b0c2008-11-14 10:39:10 +11004285 uid, gid,
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004286 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11004287 }
Patrick McHardy24023452007-07-14 18:51:31 -07004288
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004289 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07004290 }
Wang Chendad9b332008-06-18 01:48:28 -07004291 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004292}
4293
Linus Torvalds1da177e2005-04-16 15:20:36 -07004294/**
4295 * dev_set_promiscuity - update promiscuity count on a device
4296 * @dev: device
4297 * @inc: modifier
4298 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07004299 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004300 * remains above zero the interface remains promiscuous. Once it hits zero
4301 * the device reverts back to normal filtering operation. A negative inc
4302 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07004303 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004304 */
Wang Chendad9b332008-06-18 01:48:28 -07004305int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004306{
4307 unsigned short old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07004308 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004309
Wang Chendad9b332008-06-18 01:48:28 -07004310 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07004311 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07004312 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07004313 if (dev->flags != old_flags)
4314 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07004315 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004316}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004317EXPORT_SYMBOL(dev_set_promiscuity);
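
/*
 * Illustrative sketch, kept under #if 0 so it is never built: how a
 * hypothetical in-kernel packet tap might use dev_set_promiscuity().
 * Each +1 must eventually be balanced by a -1. The example_* names are
 * invented for illustration.
 */
#if 0
static int example_tap_attach(struct net_device *dev)
{
	int err;

	rtnl_lock();			/* __dev_set_promiscuity() asserts RTNL */
	err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();
	return err;
}

static void example_tap_detach(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);	/* drop our reference to promiscuous mode */
	rtnl_unlock();
}
#endif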
Linus Torvalds1da177e2005-04-16 15:20:36 -07004318
4319/**
4320 * dev_set_allmulti - update allmulti count on a device
4321 * @dev: device
4322 * @inc: modifier
4323 *
4324 * Add or remove reception of all multicast frames to a device. While the
4325 * count in the device remains above zero the interface stays in
4326 * all-multicast mode, receiving every multicast frame. Once it hits zero the device reverts to normal
4327 * filtering operation. A negative @inc value is used to drop the counter
4328 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07004329 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004330 */
4331
Wang Chendad9b332008-06-18 01:48:28 -07004332int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004333{
4334 unsigned short old_flags = dev->flags;
4335
Patrick McHardy24023452007-07-14 18:51:31 -07004336 ASSERT_RTNL();
4337
Linus Torvalds1da177e2005-04-16 15:20:36 -07004338 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07004339 dev->allmulti += inc;
4340 if (dev->allmulti == 0) {
4341 /*
4342 * Avoid overflow.
4343 * If inc causes overflow, untouch allmulti and return error.
4344 */
4345 if (inc < 0)
4346 dev->flags &= ~IFF_ALLMULTI;
4347 else {
4348 dev->allmulti -= inc;
4349 printk(KERN_WARNING "%s: allmulti touches roof, "
4350 "set allmulti failed, allmulti feature of "
4351 "device might be broken.\n", dev->name);
4352 return -EOVERFLOW;
4353 }
4354 }
Patrick McHardy24023452007-07-14 18:51:31 -07004355 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004356 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07004357 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07004358 }
Wang Chendad9b332008-06-18 01:48:28 -07004359 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004360}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004361EXPORT_SYMBOL(dev_set_allmulti);
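
/*
 * Illustrative sketch, kept under #if 0 so it is never built: taking and
 * releasing an all-multicast reference with dev_set_allmulti(), as a
 * hypothetical routing daemon helper might. The example_* name is invented.
 */
#if 0
static int example_allmulti(struct net_device *dev, bool on)
{
	int err;

	rtnl_lock();			/* dev_set_allmulti() asserts RTNL */
	err = dev_set_allmulti(dev, on ? 1 : -1);
	rtnl_unlock();
	return err;
}
#endif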
Patrick McHardy4417da62007-06-27 01:28:10 -07004362
4363/*
4364 * Upload unicast and multicast address lists to device and
4365 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08004366 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07004367 * are present.
4368 */
4369void __dev_set_rx_mode(struct net_device *dev)
4370{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004371 const struct net_device_ops *ops = dev->netdev_ops;
4372
Patrick McHardy4417da62007-06-27 01:28:10 -07004373 /* dev_open will call this function so the list will stay sane. */
4374 if (!(dev->flags&IFF_UP))
4375 return;
4376
4377 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09004378 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07004379
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004380 if (ops->ndo_set_rx_mode)
4381 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004382 else {
4383 /* Unicast addresses changes may only happen under the rtnl,
4384 * therefore calling __dev_set_promiscuity here is safe.
4385 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004386 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004387 __dev_set_promiscuity(dev, 1);
4388 dev->uc_promisc = 1;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004389 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004390 __dev_set_promiscuity(dev, -1);
4391 dev->uc_promisc = 0;
4392 }
4393
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004394 if (ops->ndo_set_multicast_list)
4395 ops->ndo_set_multicast_list(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004396 }
4397}
4398
4399void dev_set_rx_mode(struct net_device *dev)
4400{
David S. Millerb9e40852008-07-15 00:15:08 -07004401 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004402 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07004403 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004404}
4405
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004406/**
4407 * dev_get_flags - get flags reported to userspace
4408 * @dev: device
4409 *
4410 * Get the combination of flag bits exported through APIs to userspace.
4411 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004412unsigned dev_get_flags(const struct net_device *dev)
4413{
4414 unsigned flags;
4415
4416 flags = (dev->flags & ~(IFF_PROMISC |
4417 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08004418 IFF_RUNNING |
4419 IFF_LOWER_UP |
4420 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07004421 (dev->gflags & (IFF_PROMISC |
4422 IFF_ALLMULTI));
4423
Stefan Rompfb00055a2006-03-20 17:09:11 -08004424 if (netif_running(dev)) {
4425 if (netif_oper_up(dev))
4426 flags |= IFF_RUNNING;
4427 if (netif_carrier_ok(dev))
4428 flags |= IFF_LOWER_UP;
4429 if (netif_dormant(dev))
4430 flags |= IFF_DORMANT;
4431 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004432
4433 return flags;
4434}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004435EXPORT_SYMBOL(dev_get_flags);
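
/*
 * Illustrative sketch, kept under #if 0 so it is never built: testing the
 * userspace-visible flag combination returned by dev_get_flags(). The
 * example_* name is invented for illustration.
 */
#if 0
static bool example_iface_usable(const struct net_device *dev)
{
	unsigned flags = dev_get_flags(dev);

	/* IFF_RUNNING is synthesized from operstate, not stored in dev->flags */
	return (flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING);
}
#endif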
Linus Torvalds1da177e2005-04-16 15:20:36 -07004436
Patrick McHardybd380812010-02-26 06:34:53 +00004437int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004438{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004439 int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004440 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004441
Patrick McHardy24023452007-07-14 18:51:31 -07004442 ASSERT_RTNL();
4443
Linus Torvalds1da177e2005-04-16 15:20:36 -07004444 /*
4445 * Set the flags on our device.
4446 */
4447
4448 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4449 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4450 IFF_AUTOMEDIA)) |
4451 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4452 IFF_ALLMULTI));
4453
4454 /*
4455 * Load in the correct multicast list now the flags have changed.
4456 */
4457
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004458 if ((old_flags ^ flags) & IFF_MULTICAST)
4459 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07004460
Patrick McHardy4417da62007-06-27 01:28:10 -07004461 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004462
4463 /*
4464	 *	Have we downed the interface? We handle IFF_UP ourselves
4465 * according to user attempts to set it, rather than blindly
4466 * setting it.
4467 */
4468
4469 ret = 0;
4470 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00004471 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004472
4473 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07004474 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004475 }
4476
Linus Torvalds1da177e2005-04-16 15:20:36 -07004477 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004478 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4479
Linus Torvalds1da177e2005-04-16 15:20:36 -07004480 dev->gflags ^= IFF_PROMISC;
4481 dev_set_promiscuity(dev, inc);
4482 }
4483
4484 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4485	   is important. Some (broken) drivers set IFF_PROMISC when
4486	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4487 */
4488 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004489 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4490
Linus Torvalds1da177e2005-04-16 15:20:36 -07004491 dev->gflags ^= IFF_ALLMULTI;
4492 dev_set_allmulti(dev, inc);
4493 }
4494
Patrick McHardybd380812010-02-26 06:34:53 +00004495 return ret;
4496}
4497
4498void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4499{
4500 unsigned int changes = dev->flags ^ old_flags;
4501
4502 if (changes & IFF_UP) {
4503 if (dev->flags & IFF_UP)
4504 call_netdevice_notifiers(NETDEV_UP, dev);
4505 else
4506 call_netdevice_notifiers(NETDEV_DOWN, dev);
4507 }
4508
4509 if (dev->flags & IFF_UP &&
4510 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4511 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4512}
4513
4514/**
4515 * dev_change_flags - change device settings
4516 * @dev: device
4517 * @flags: device state flags
4518 *
4519 * Change settings on device based state flags. The flags are
4520 * in the userspace exported format.
4521 */
4522int dev_change_flags(struct net_device *dev, unsigned flags)
4523{
4524 int ret, changes;
4525 int old_flags = dev->flags;
4526
4527 ret = __dev_change_flags(dev, flags);
4528 if (ret < 0)
4529 return ret;
4530
4531 changes = old_flags ^ dev->flags;
Thomas Graf7c355f52007-06-05 16:03:03 -07004532 if (changes)
4533 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004534
Patrick McHardybd380812010-02-26 06:34:53 +00004535 __dev_notify_flags(dev, old_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004536 return ret;
4537}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004538EXPORT_SYMBOL(dev_change_flags);
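
/*
 * Illustrative sketch, kept under #if 0 so it is never built: bringing an
 * interface administratively up via dev_change_flags(), which expects the
 * userspace flag format that dev_get_flags() returns. The example_* name
 * and the use of init_net are assumptions for the example.
 */
#if 0
static int example_force_up(const char *name)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();				/* flag changes run under RTNL */
	dev = __dev_get_by_name(&init_net, name);
	if (dev)
		err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();
	return err;
}
#endif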
Linus Torvalds1da177e2005-04-16 15:20:36 -07004539
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004540/**
4541 * dev_set_mtu - Change maximum transfer unit
4542 * @dev: device
4543 * @new_mtu: new transfer unit
4544 *
4545 * Change the maximum transfer size of the network device.
4546 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004547int dev_set_mtu(struct net_device *dev, int new_mtu)
4548{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004549 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004550 int err;
4551
4552 if (new_mtu == dev->mtu)
4553 return 0;
4554
4555 /* MTU must be positive. */
4556 if (new_mtu < 0)
4557 return -EINVAL;
4558
4559 if (!netif_device_present(dev))
4560 return -ENODEV;
4561
4562 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004563 if (ops->ndo_change_mtu)
4564 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004565 else
4566 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004567
Linus Torvalds1da177e2005-04-16 15:20:36 -07004568 if (!err && dev->flags & IFF_UP)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004569 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004570 return err;
4571}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004572EXPORT_SYMBOL(dev_set_mtu);
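
/*
 * Illustrative sketch, kept under #if 0 so it is never built: changing the
 * MTU from kernel context with dev_set_mtu(). Callers normally hold RTNL,
 * as the ioctl path below does. The example_* name and the 9000-byte jumbo
 * MTU are assumptions for the example.
 */
#if 0
static int example_enable_jumbo(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);	/* driver may still reject via ndo_change_mtu */
	rtnl_unlock();
	return err;
}
#endif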
Linus Torvalds1da177e2005-04-16 15:20:36 -07004573
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004574/**
Vlad Dogarucbda10f2011-01-13 23:38:30 +00004575 * dev_set_group - Change group this device belongs to
4576 * @dev: device
4577 * @new_group: group this device should belong to
4578 */
4579void dev_set_group(struct net_device *dev, int new_group)
4580{
4581 dev->group = new_group;
4582}
4583EXPORT_SYMBOL(dev_set_group);
4584
4585/**
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004586 * dev_set_mac_address - Change Media Access Control Address
4587 * @dev: device
4588 * @sa: new address
4589 *
4590 * Change the hardware (MAC) address of the device
4591 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004592int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4593{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004594 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004595 int err;
4596
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004597 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004598 return -EOPNOTSUPP;
4599 if (sa->sa_family != dev->type)
4600 return -EINVAL;
4601 if (!netif_device_present(dev))
4602 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004603 err = ops->ndo_set_mac_address(dev, sa);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004604 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004605 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004606 return err;
4607}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004608EXPORT_SYMBOL(dev_set_mac_address);
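
/*
 * Illustrative sketch, kept under #if 0 so it is never built: programming a
 * new hardware address through dev_set_mac_address(). The example_* name is
 * invented; note that sa_family must match dev->type or -EINVAL is returned.
 */
#if 0
static int example_set_mac(struct net_device *dev, const u8 *new_mac)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, new_mac, dev->addr_len);

	rtnl_lock();				/* notifier chain runs under RTNL */
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}
#endif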
Linus Torvalds1da177e2005-04-16 15:20:36 -07004609
4610/*
Eric Dumazet3710bec2009-11-01 19:42:09 +00004611 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07004612 */
Jeff Garzik14e3e072007-10-08 00:06:32 -07004613static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004614{
4615 int err;
Eric Dumazet3710bec2009-11-01 19:42:09 +00004616 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004617
4618 if (!dev)
4619 return -ENODEV;
4620
4621 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004622 case SIOCGIFFLAGS: /* Get interface flags */
4623 ifr->ifr_flags = (short) dev_get_flags(dev);
4624 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004625
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004626 case SIOCGIFMETRIC: /* Get the metric on the interface
4627 (currently unused) */
4628 ifr->ifr_metric = 0;
4629 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004630
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004631 case SIOCGIFMTU: /* Get the MTU of a device */
4632 ifr->ifr_mtu = dev->mtu;
4633 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004634
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004635 case SIOCGIFHWADDR:
4636 if (!dev->addr_len)
4637 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4638 else
4639 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4640 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4641 ifr->ifr_hwaddr.sa_family = dev->type;
4642 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004643
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004644 case SIOCGIFSLAVE:
4645 err = -EINVAL;
4646 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004647
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004648 case SIOCGIFMAP:
4649 ifr->ifr_map.mem_start = dev->mem_start;
4650 ifr->ifr_map.mem_end = dev->mem_end;
4651 ifr->ifr_map.base_addr = dev->base_addr;
4652 ifr->ifr_map.irq = dev->irq;
4653 ifr->ifr_map.dma = dev->dma;
4654 ifr->ifr_map.port = dev->if_port;
4655 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004656
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004657 case SIOCGIFINDEX:
4658 ifr->ifr_ifindex = dev->ifindex;
4659 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004660
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004661 case SIOCGIFTXQLEN:
4662 ifr->ifr_qlen = dev->tx_queue_len;
4663 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004664
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004665 default:
4666 /* dev_ioctl() should ensure this case
4667 * is never reached
4668 */
4669 WARN_ON(1);
4670 err = -EINVAL;
4671 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004672
4673 }
4674 return err;
4675}
4676
4677/*
4678 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4679 */
4680static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4681{
4682 int err;
4683 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004684 const struct net_device_ops *ops;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004685
4686 if (!dev)
4687 return -ENODEV;
4688
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004689 ops = dev->netdev_ops;
4690
Jeff Garzik14e3e072007-10-08 00:06:32 -07004691 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004692 case SIOCSIFFLAGS: /* Set interface flags */
4693 return dev_change_flags(dev, ifr->ifr_flags);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004694
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004695 case SIOCSIFMETRIC: /* Set the metric on the interface
4696 (currently unused) */
4697 return -EOPNOTSUPP;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004698
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004699 case SIOCSIFMTU: /* Set the MTU of a device */
4700 return dev_set_mtu(dev, ifr->ifr_mtu);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004701
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004702 case SIOCSIFHWADDR:
4703 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004704
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004705 case SIOCSIFHWBROADCAST:
4706 if (ifr->ifr_hwaddr.sa_family != dev->type)
4707 return -EINVAL;
4708 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4709 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4710 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4711 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004712
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004713 case SIOCSIFMAP:
4714 if (ops->ndo_set_config) {
4715 if (!netif_device_present(dev))
4716 return -ENODEV;
4717 return ops->ndo_set_config(dev, &ifr->ifr_map);
4718 }
4719 return -EOPNOTSUPP;
4720
4721 case SIOCADDMULTI:
4722 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4723 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4724 return -EINVAL;
4725 if (!netif_device_present(dev))
4726 return -ENODEV;
Jiri Pirko22bedad32010-04-01 21:22:57 +00004727 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004728
4729 case SIOCDELMULTI:
4730 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4731 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4732 return -EINVAL;
4733 if (!netif_device_present(dev))
4734 return -ENODEV;
Jiri Pirko22bedad32010-04-01 21:22:57 +00004735 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004736
4737 case SIOCSIFTXQLEN:
4738 if (ifr->ifr_qlen < 0)
4739 return -EINVAL;
4740 dev->tx_queue_len = ifr->ifr_qlen;
4741 return 0;
4742
4743 case SIOCSIFNAME:
4744 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4745 return dev_change_name(dev, ifr->ifr_newname);
4746
4747 /*
4748 * Unknown or private ioctl
4749 */
4750 default:
4751 if ((cmd >= SIOCDEVPRIVATE &&
4752 cmd <= SIOCDEVPRIVATE + 15) ||
4753 cmd == SIOCBONDENSLAVE ||
4754 cmd == SIOCBONDRELEASE ||
4755 cmd == SIOCBONDSETHWADDR ||
4756 cmd == SIOCBONDSLAVEINFOQUERY ||
4757 cmd == SIOCBONDINFOQUERY ||
4758 cmd == SIOCBONDCHANGEACTIVE ||
4759 cmd == SIOCGMIIPHY ||
4760 cmd == SIOCGMIIREG ||
4761 cmd == SIOCSMIIREG ||
4762 cmd == SIOCBRADDIF ||
4763 cmd == SIOCBRDELIF ||
4764 cmd == SIOCSHWTSTAMP ||
4765 cmd == SIOCWANDEV) {
4766 err = -EOPNOTSUPP;
4767 if (ops->ndo_do_ioctl) {
4768 if (netif_device_present(dev))
4769 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4770 else
4771 err = -ENODEV;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004772 }
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004773 } else
4774 err = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004775
4776 }
4777 return err;
4778}
4779
4780/*
4781 * This function handles all "interface"-type I/O control requests. The actual
4782 * 'doing' part of this is dev_ifsioc above.
4783 */
4784
4785/**
4786 * dev_ioctl - network device ioctl
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004787 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004788 * @cmd: command to issue
4789 * @arg: pointer to a struct ifreq in user space
4790 *
4791 * Issue ioctl functions to devices. This is normally called by the
4792 * user space syscall interfaces but can sometimes be useful for
4793 * other purposes. The return value is the return from the syscall if
4794 * positive or a negative errno code on error.
4795 */
4796
Eric W. Biederman881d9662007-09-17 11:56:21 -07004797int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004798{
4799 struct ifreq ifr;
4800 int ret;
4801 char *colon;
4802
4803 /* One special case: SIOCGIFCONF takes ifconf argument
4804 and requires shared lock, because it sleeps writing
4805 to user space.
4806 */
4807
4808 if (cmd == SIOCGIFCONF) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004809 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07004810 ret = dev_ifconf(net, (char __user *) arg);
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004811 rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004812 return ret;
4813 }
4814 if (cmd == SIOCGIFNAME)
Eric W. Biederman881d9662007-09-17 11:56:21 -07004815 return dev_ifname(net, (struct ifreq __user *)arg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004816
4817 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4818 return -EFAULT;
4819
4820 ifr.ifr_name[IFNAMSIZ-1] = 0;
4821
4822 colon = strchr(ifr.ifr_name, ':');
4823 if (colon)
4824 *colon = 0;
4825
4826 /*
4827 * See which interface the caller is talking about.
4828 */
4829
4830 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004831 /*
4832 * These ioctl calls:
4833 * - can be done by all.
4834 * - atomic and do not require locking.
4835 * - return a value
4836 */
4837 case SIOCGIFFLAGS:
4838 case SIOCGIFMETRIC:
4839 case SIOCGIFMTU:
4840 case SIOCGIFHWADDR:
4841 case SIOCGIFSLAVE:
4842 case SIOCGIFMAP:
4843 case SIOCGIFINDEX:
4844 case SIOCGIFTXQLEN:
4845 dev_load(net, ifr.ifr_name);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004846 rcu_read_lock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004847 ret = dev_ifsioc_locked(net, &ifr, cmd);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004848 rcu_read_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004849 if (!ret) {
4850 if (colon)
4851 *colon = ':';
4852 if (copy_to_user(arg, &ifr,
4853 sizeof(struct ifreq)))
4854 ret = -EFAULT;
4855 }
4856 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004857
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004858 case SIOCETHTOOL:
4859 dev_load(net, ifr.ifr_name);
4860 rtnl_lock();
4861 ret = dev_ethtool(net, &ifr);
4862 rtnl_unlock();
4863 if (!ret) {
4864 if (colon)
4865 *colon = ':';
4866 if (copy_to_user(arg, &ifr,
4867 sizeof(struct ifreq)))
4868 ret = -EFAULT;
4869 }
4870 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004871
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004872 /*
4873 * These ioctl calls:
4874 * - require superuser power.
4875 * - require strict serialization.
4876 * - return a value
4877 */
4878 case SIOCGMIIPHY:
4879 case SIOCGMIIREG:
4880 case SIOCSIFNAME:
4881 if (!capable(CAP_NET_ADMIN))
4882 return -EPERM;
4883 dev_load(net, ifr.ifr_name);
4884 rtnl_lock();
4885 ret = dev_ifsioc(net, &ifr, cmd);
4886 rtnl_unlock();
4887 if (!ret) {
4888 if (colon)
4889 *colon = ':';
4890 if (copy_to_user(arg, &ifr,
4891 sizeof(struct ifreq)))
4892 ret = -EFAULT;
4893 }
4894 return ret;
4895
4896 /*
4897 * These ioctl calls:
4898 * - require superuser power.
4899 * - require strict serialization.
4900 * - do not return a value
4901 */
4902 case SIOCSIFFLAGS:
4903 case SIOCSIFMETRIC:
4904 case SIOCSIFMTU:
4905 case SIOCSIFMAP:
4906 case SIOCSIFHWADDR:
4907 case SIOCSIFSLAVE:
4908 case SIOCADDMULTI:
4909 case SIOCDELMULTI:
4910 case SIOCSIFHWBROADCAST:
4911 case SIOCSIFTXQLEN:
4912 case SIOCSMIIREG:
4913 case SIOCBONDENSLAVE:
4914 case SIOCBONDRELEASE:
4915 case SIOCBONDSETHWADDR:
4916 case SIOCBONDCHANGEACTIVE:
4917 case SIOCBRADDIF:
4918 case SIOCBRDELIF:
4919 case SIOCSHWTSTAMP:
4920 if (!capable(CAP_NET_ADMIN))
4921 return -EPERM;
4922 /* fall through */
4923 case SIOCBONDSLAVEINFOQUERY:
4924 case SIOCBONDINFOQUERY:
4925 dev_load(net, ifr.ifr_name);
4926 rtnl_lock();
4927 ret = dev_ifsioc(net, &ifr, cmd);
4928 rtnl_unlock();
4929 return ret;
4930
4931 case SIOCGIFMEM:
4932 /* Get the per device memory space. We can add this but
4933 * currently do not support it */
4934 case SIOCSIFMEM:
4935 /* Set the per device memory buffer space.
4936 * Not applicable in our case */
4937 case SIOCSIFLINK:
4938 return -EINVAL;
4939
4940 /*
4941 * Unknown or private ioctl.
4942 */
4943 default:
4944 if (cmd == SIOCWANDEV ||
4945 (cmd >= SIOCDEVPRIVATE &&
4946 cmd <= SIOCDEVPRIVATE + 15)) {
Eric W. Biederman881d9662007-09-17 11:56:21 -07004947 dev_load(net, ifr.ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004948 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07004949 ret = dev_ifsioc(net, &ifr, cmd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004950 rtnl_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004951 if (!ret && copy_to_user(arg, &ifr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07004952 sizeof(struct ifreq)))
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004953 ret = -EFAULT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004954 return ret;
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004955 }
4956 /* Take care of Wireless Extensions */
4957 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4958 return wext_handle_ioctl(net, &ifr, cmd, arg);
4959 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004960 }
4961}
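
/*
 * Illustrative userspace sketch, kept under #if 0 so it is never built as
 * part of this file: a SIOCGIFMTU request that reaches dev_ifsioc_locked()
 * via dev_ioctl() above. The interface name "eth0" is an assumption.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
		printf("%s mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);

	close(fd);
	return 0;
}
#endif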
4962
4963
4964/**
4965 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004966 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004967 *
4968 * Returns a suitable unique value for a new device interface
4969 * number. The caller must hold the rtnl semaphore or the
4970 * dev_base_lock to be sure it remains unique.
4971 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07004972static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004973{
4974 static int ifindex;
4975 for (;;) {
4976 if (++ifindex <= 0)
4977 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004978 if (!__dev_get_by_index(net, ifindex))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004979 return ifindex;
4980 }
4981}
4982
Linus Torvalds1da177e2005-04-16 15:20:36 -07004983/* Delayed registration/unregistration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08004984static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004985
Stephen Hemminger6f05f622007-03-08 20:46:03 -08004986static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004987{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004988 list_add_tail(&dev->todo_list, &net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004989}
4990
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004991static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004992{
Krishna Kumare93737b2009-12-08 22:26:02 +00004993 struct net_device *dev, *tmp;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004994
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004995 BUG_ON(dev_boot_phase);
4996 ASSERT_RTNL();
4997
Krishna Kumare93737b2009-12-08 22:26:02 +00004998 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004999		/* Some devices call unregister without ever having been
Krishna Kumare93737b2009-12-08 22:26:02 +00005000		 * registered, to unwind a failed initialization. Remove
5001		 * those devices and proceed with the remaining ones.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005002 */
5003 if (dev->reg_state == NETREG_UNINITIALIZED) {
5004 pr_debug("unregister_netdevice: device %s/%p never "
5005 "was registered\n", dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005006
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005007 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00005008 list_del(&dev->unreg_list);
5009 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005010 }
5011
5012 BUG_ON(dev->reg_state != NETREG_REGISTERED);
Octavian Purdila44345722010-12-13 12:44:07 +00005013 }
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005014
Octavian Purdila44345722010-12-13 12:44:07 +00005015 /* If device is running, close it first. */
5016 dev_close_many(head);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005017
Octavian Purdila44345722010-12-13 12:44:07 +00005018 list_for_each_entry(dev, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005019 /* And unlink it from device chain. */
5020 unlist_netdevice(dev);
5021
5022 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005023 }
5024
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005025 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005026
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005027 list_for_each_entry(dev, head, unreg_list) {
5028 /* Shutdown queueing discipline. */
5029 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005030
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005031
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005032		/* Notify protocols that we are about to destroy
5033		   this device. They should clean up all of their state.
5034 */
5035 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5036
Patrick McHardya2835762010-02-26 06:34:51 +00005037 if (!dev->rtnl_link_ops ||
5038 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5039 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5040
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005041 /*
5042 * Flush the unicast and multicast chains
5043 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005044 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005045 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005046
5047 if (dev->netdev_ops->ndo_uninit)
5048 dev->netdev_ops->ndo_uninit(dev);
5049
5050 /* Notifier chain MUST detach us from master device. */
5051 WARN_ON(dev->master);
5052
5053 /* Remove entries from kobject tree */
5054 netdev_unregister_kobject(dev);
5055 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005056
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005057 /* Process any work delayed until the end of the batch */
stephen hemmingere5e26d72010-02-24 14:01:38 +00005058 dev = list_first_entry(head, struct net_device, unreg_list);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005059 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5060
Eric Dumazetef885af2010-09-13 12:24:54 +00005061 rcu_barrier();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005062
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005063 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005064 dev_put(dev);
5065}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005066
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005067static void rollback_registered(struct net_device *dev)
5068{
5069 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005070
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005071 list_add(&dev->unreg_list, &single);
5072 rollback_registered_many(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005073}
5074
Herbert Xub63365a2008-10-23 01:11:29 -07005075unsigned long netdev_fix_features(unsigned long features, const char *name)
5076{
5077 /* Fix illegal SG+CSUM combinations. */
5078 if ((features & NETIF_F_SG) &&
5079 !(features & NETIF_F_ALL_CSUM)) {
5080 if (name)
5081 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5082 "checksum feature.\n", name);
5083 features &= ~NETIF_F_SG;
5084 }
5085
5086 /* TSO requires that SG is present as well. */
5087 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5088 if (name)
5089 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5090 "SG feature.\n", name);
5091 features &= ~NETIF_F_TSO;
5092 }
5093
5094 if (features & NETIF_F_UFO) {
Michał Mirosław79032642010-11-30 06:38:00 +00005095 /* maybe split UFO into V4 and V6? */
5096 if (!((features & NETIF_F_GEN_CSUM) ||
5097 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5098 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
Herbert Xub63365a2008-10-23 01:11:29 -07005099 if (name)
5100 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
Michał Mirosław79032642010-11-30 06:38:00 +00005101 "since no checksum offload features.\n",
Herbert Xub63365a2008-10-23 01:11:29 -07005102 name);
5103 features &= ~NETIF_F_UFO;
5104 }
5105
5106 if (!(features & NETIF_F_SG)) {
5107 if (name)
5108 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5109 "since no NETIF_F_SG feature.\n", name);
5110 features &= ~NETIF_F_UFO;
5111 }
5112 }
5113
5114 return features;
5115}
5116EXPORT_SYMBOL(netdev_fix_features);
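
/*
 * Illustrative sketch, kept under #if 0 so it is never built: what
 * netdev_fix_features() does with an inconsistent feature request. With no
 * checksum offload advertised, SG is dropped, and TSO/UFO fall with it.
 * The example_* name is invented for illustration.
 */
#if 0
static void example_sanitize_features(struct net_device *dev)
{
	unsigned long wanted = NETIF_F_SG | NETIF_F_TSO | NETIF_F_UFO;

	/* Result is 0 here: no NETIF_F_*_CSUM bit, so SG, TSO and UFO are all removed */
	dev->features = netdev_fix_features(wanted, dev->name);
}
#endif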
5117
Linus Torvalds1da177e2005-04-16 15:20:36 -07005118/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005119 * netif_stacked_transfer_operstate - transfer operstate
5120 * @rootdev: the root or lower level device to transfer state from
5121 * @dev: the device to transfer operstate to
5122 *
5123 * Transfer operational state from root to device. This is normally
5124 * called when a stacking relationship exists between the root
5125 * device and the device (a leaf device).
5126 */
5127void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5128 struct net_device *dev)
5129{
5130 if (rootdev->operstate == IF_OPER_DORMANT)
5131 netif_dormant_on(dev);
5132 else
5133 netif_dormant_off(dev);
5134
5135 if (netif_carrier_ok(rootdev)) {
5136 if (!netif_carrier_ok(dev))
5137 netif_carrier_on(dev);
5138 } else {
5139 if (netif_carrier_ok(dev))
5140 netif_carrier_off(dev);
5141 }
5142}
5143EXPORT_SYMBOL(netif_stacked_transfer_operstate);
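
/*
 * Illustrative sketch, kept under #if 0 so it is never built: a stacked
 * (VLAN-like) driver mirroring its lower device's operstate from a netdevice
 * notifier. example_lookup_upper() is a hypothetical helper, not a real API.
 */
#if 0
static int example_stacked_event(struct notifier_block *nb,
				 unsigned long event, void *ptr)
{
	struct net_device *lower = ptr;
	struct net_device *upper = example_lookup_upper(lower);

	if (upper && event == NETDEV_CHANGE)
		netif_stacked_transfer_operstate(lower, upper);
	return NOTIFY_DONE;
}
#endif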
5144
Tom Herbertbf264142010-11-26 08:36:09 +00005145#ifdef CONFIG_RPS
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005146static int netif_alloc_rx_queues(struct net_device *dev)
5147{
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005148 unsigned int i, count = dev->num_rx_queues;
Tom Herbertbd25fa72010-10-18 18:00:16 +00005149 struct netdev_rx_queue *rx;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005150
Tom Herbertbd25fa72010-10-18 18:00:16 +00005151 BUG_ON(count < 1);
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005152
Tom Herbertbd25fa72010-10-18 18:00:16 +00005153 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5154 if (!rx) {
5155 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5156 return -ENOMEM;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005157 }
Tom Herbertbd25fa72010-10-18 18:00:16 +00005158 dev->_rx = rx;
5159
Tom Herbertbd25fa72010-10-18 18:00:16 +00005160 for (i = 0; i < count; i++)
Tom Herbertfe822242010-11-09 10:47:38 +00005161 rx[i].dev = dev;
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005162 return 0;
5163}
Tom Herbertbf264142010-11-26 08:36:09 +00005164#endif
Eric Dumazet1b4bf462010-09-23 17:26:35 +00005165
Changli Gaoaa942102010-12-04 02:31:41 +00005166static void netdev_init_one_queue(struct net_device *dev,
5167 struct netdev_queue *queue, void *_unused)
5168{
5169 /* Initialize queue lock */
5170 spin_lock_init(&queue->_xmit_lock);
5171 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5172 queue->xmit_lock_owner = -1;
Changli Gaob236da62010-12-14 03:09:15 +00005173 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
Changli Gaoaa942102010-12-04 02:31:41 +00005174 queue->dev = dev;
5175}
5176
Tom Herberte6484932010-10-18 18:04:39 +00005177static int netif_alloc_netdev_queues(struct net_device *dev)
5178{
5179 unsigned int count = dev->num_tx_queues;
5180 struct netdev_queue *tx;
5181
5182 BUG_ON(count < 1);
5183
5184 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5185 if (!tx) {
5186 pr_err("netdev: Unable to allocate %u tx queues.\n",
5187 count);
5188 return -ENOMEM;
5189 }
5190 dev->_tx = tx;
Tom Herbert1d24eb42010-11-21 13:17:27 +00005191
Tom Herberte6484932010-10-18 18:04:39 +00005192 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5193 spin_lock_init(&dev->tx_global_lock);
Changli Gaoaa942102010-12-04 02:31:41 +00005194
5195 return 0;
Tom Herberte6484932010-10-18 18:04:39 +00005196}
5197
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08005198/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005199 * register_netdevice - register a network device
5200 * @dev: device to register
5201 *
5202 * Take a completed network device structure and add it to the kernel
5203 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5204 * chain. 0 is returned on success. A negative errno code is returned
5205 * on a failure to set up the device, or if the name is a duplicate.
5206 *
5207 * Callers must hold the rtnl semaphore. You may want
5208 * register_netdev() instead of this.
5209 *
5210 * BUGS:
5211 * The locking appears insufficient to guarantee two parallel registers
5212 * will not get the same name.
5213 */
5214
5215int register_netdevice(struct net_device *dev)
5216{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005217 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005218 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005219
5220 BUG_ON(dev_boot_phase);
5221 ASSERT_RTNL();
5222
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005223 might_sleep();
5224
Linus Torvalds1da177e2005-04-16 15:20:36 -07005225 /* When net_device's are persistent, this will be fatal. */
5226 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005227 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005228
David S. Millerf1f28aa2008-07-15 00:08:33 -07005229 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07005230 netdev_set_addr_lockdep_class(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005231
Linus Torvalds1da177e2005-04-16 15:20:36 -07005232 dev->iflink = -1;
5233
5234 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005235 if (dev->netdev_ops->ndo_init) {
5236 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005237 if (ret) {
5238 if (ret > 0)
5239 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08005240 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005241 }
5242 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005243
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00005244 ret = dev_get_valid_name(dev, dev->name, 0);
Octavian Purdilad9031022009-11-18 02:36:59 +00005245 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005246 goto err_uninit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005247
Eric W. Biederman881d9662007-09-17 11:56:21 -07005248 dev->ifindex = dev_new_index(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005249 if (dev->iflink == -1)
5250 dev->iflink = dev->ifindex;
5251
Stephen Hemmingerd212f872007-06-27 00:47:37 -07005252 /* Fix illegal checksum combinations */
5253 if ((dev->features & NETIF_F_HW_CSUM) &&
5254 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5255 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5256 dev->name);
5257 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5258 }
5259
5260 if ((dev->features & NETIF_F_NO_CSUM) &&
5261 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5262 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5263 dev->name);
5264 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5265 }
5266
Herbert Xub63365a2008-10-23 01:11:29 -07005267 dev->features = netdev_fix_features(dev->features, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005268
Lennert Buytenheke5a4a722008-08-03 01:23:10 -07005269 /* Enable software GSO if SG is supported. */
5270 if (dev->features & NETIF_F_SG)
5271 dev->features |= NETIF_F_GSO;
5272
Eric Dumazetc5256c52010-09-23 00:46:11 +00005273 /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5274 * vlan_dev_init() will do the dev->features check, so these features
5275 * are enabled only if supported by underlying device.
Brandon Philips16c3ea72010-09-15 09:24:24 +00005276 */
Eric Dumazetc5256c52010-09-23 00:46:11 +00005277 dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
Brandon Philips16c3ea72010-09-15 09:24:24 +00005278
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005279 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5280 ret = notifier_to_errno(ret);
5281 if (ret)
5282 goto err_uninit;
5283
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005284 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005285 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005286 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005287 dev->reg_state = NETREG_REGISTERED;
5288
Linus Torvalds1da177e2005-04-16 15:20:36 -07005289 /*
5290 * Default initial state at registry is that the
5291 * device is present.
5292 */
5293
5294 set_bit(__LINK_STATE_PRESENT, &dev->state);
5295
Linus Torvalds1da177e2005-04-16 15:20:36 -07005296 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005297 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005298 list_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005299
5300	/* Notify protocols that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005301 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005302 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005303 if (ret) {
5304 rollback_registered(dev);
5305 dev->reg_state = NETREG_UNREGISTERED;
5306 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005307 /*
5308 * Prevent userspace races by waiting until the network
5309	 * device is fully set up before sending notifications.
5310 */
Patrick McHardya2835762010-02-26 06:34:51 +00005311 if (!dev->rtnl_link_ops ||
5312 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5313 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005314
5315out:
5316 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005317
5318err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005319 if (dev->netdev_ops->ndo_uninit)
5320 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005321 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005322}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005323EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005324
5325/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005326 * init_dummy_netdev - init a dummy network device for NAPI
5327 * @dev: device to init
5328 *
5329	 * This takes a network device structure and initializes the minimum
5330	 * number of fields so it can be used to schedule NAPI polls without
5331 * registering a full blown interface. This is to be used by drivers
5332 * that need to tie several hardware interfaces to a single NAPI
5333 * poll scheduler due to HW limitations.
5334 */
5335int init_dummy_netdev(struct net_device *dev)
5336{
5337 /* Clear everything. Note we don't initialize spinlocks
5338	 * as they aren't supposed to be taken by any of the
5339 * NAPI code and this dummy netdev is supposed to be
5340 * only ever used for NAPI polls
5341 */
5342 memset(dev, 0, sizeof(struct net_device));
5343
5344 /* make sure we BUG if trying to hit standard
5345 * register/unregister code path
5346 */
5347 dev->reg_state = NETREG_DUMMY;
5348
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005349 /* NAPI wants this */
5350 INIT_LIST_HEAD(&dev->napi_list);
5351
5352 /* a dummy interface is started by default */
5353 set_bit(__LINK_STATE_PRESENT, &dev->state);
5354 set_bit(__LINK_STATE_START, &dev->state);
5355
Eric Dumazet29b44332010-10-11 10:22:12 +00005356	/* Note: We don't allocate pcpu_refcnt for dummy devices,
5357	 * because users of this 'device' don't need to change
5358 * its refcount.
5359 */
5360
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005361 return 0;
5362}
5363EXPORT_SYMBOL_GPL(init_dummy_netdev);
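/*
 * Example (illustrative only): a driver with several hardware ports but a
 * single interrupt source might hang its NAPI context off a dummy netdev
 * roughly as below.  "struct foo_adapter", foo_poll() and FOO_NAPI_WEIGHT
 * are hypothetical names, not part of any in-tree driver.
 *
 *	struct foo_adapter {
 *		struct net_device napi_dev;	/* dummy, never registered */
 *		struct napi_struct napi;
 *	};
 *
 *	static void foo_setup_napi(struct foo_adapter *adap)
 *	{
 *		init_dummy_netdev(&adap->napi_dev);
 *		netif_napi_add(&adap->napi_dev, &adap->napi,
 *			       foo_poll, FOO_NAPI_WEIGHT);
 *		napi_enable(&adap->napi);
 *	}
 */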
5364
5365
5366/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005367 * register_netdev - register a network device
5368 * @dev: device to register
5369 *
5370 * Take a completed network device structure and add it to the kernel
5371 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5372 * chain. 0 is returned on success. A negative errno code is returned
5373 * on a failure to set up the device, or if the name is a duplicate.
5374 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005375 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005376 * and expands the device name if you passed a format string to
5377 * alloc_netdev.
5378 */
5379int register_netdev(struct net_device *dev)
5380{
5381 int err;
5382
5383 rtnl_lock();
5384
5385 /*
5386 * If the name is a format string the caller wants us to do a
5387 * name allocation.
5388 */
5389 if (strchr(dev->name, '%')) {
5390 err = dev_alloc_name(dev, dev->name);
5391 if (err < 0)
5392 goto out;
5393 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005394
Linus Torvalds1da177e2005-04-16 15:20:36 -07005395 err = register_netdevice(dev);
5396out:
5397 rtnl_unlock();
5398 return err;
5399}
5400EXPORT_SYMBOL(register_netdev);
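/*
 * Example (illustrative only): typical probe-time use of this rtnl-taking
 * wrapper for an Ethernet-class device.  "struct foo_priv" and
 * foo_netdev_ops are hypothetical driver names.
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &foo_netdev_ops;
 *	err = register_netdev(dev);	/* takes and drops the rtnl lock */
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */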
5401
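/*
 * netdev_refcnt_read - return the current reference count of a device.
 *
 * dev_hold() and dev_put() only touch the local CPU's per-cpu counter,
 * so the true count is the sum over all possible CPUs.  The result is
 * only stable once no new references can be taken, e.g. while waiting
 * for the count to drop in netdev_wait_allrefs().
 */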
Eric Dumazet29b44332010-10-11 10:22:12 +00005402int netdev_refcnt_read(const struct net_device *dev)
5403{
5404 int i, refcnt = 0;
5405
5406 for_each_possible_cpu(i)
5407 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5408 return refcnt;
5409}
5410EXPORT_SYMBOL(netdev_refcnt_read);
5411
Linus Torvalds1da177e2005-04-16 15:20:36 -07005412/*
5413 * netdev_wait_allrefs - wait until all references are gone.
5414 *
5415 * This is called when unregistering network devices.
5416 *
5417 * Any protocol or device that holds a reference should register
5418 * for netdevice notification, and cleanup and put back the
5419	 * for netdevice notification, and clean up and put back the
5420 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005421 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005422 */
5423static void netdev_wait_allrefs(struct net_device *dev)
5424{
5425 unsigned long rebroadcast_time, warning_time;
Eric Dumazet29b44332010-10-11 10:22:12 +00005426 int refcnt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005427
Eric Dumazete014deb2009-11-17 05:59:21 +00005428 linkwatch_forget_dev(dev);
5429
Linus Torvalds1da177e2005-04-16 15:20:36 -07005430 rebroadcast_time = warning_time = jiffies;
Eric Dumazet29b44332010-10-11 10:22:12 +00005431 refcnt = netdev_refcnt_read(dev);
5432
5433 while (refcnt != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005434 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005435 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005436
5437 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005438 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005439 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
Octavian Purdila395264d2009-11-16 13:49:35 +00005440			 * should have already handled it the first time */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005441
5442 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5443 &dev->state)) {
5444 /* We must not have linkwatch events
5445 * pending on unregister. If this
5446 * happens, we simply run the queue
5447 * unscheduled, resulting in a noop
5448 * for this device.
5449 */
5450 linkwatch_run_queue();
5451 }
5452
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005453 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005454
5455 rebroadcast_time = jiffies;
5456 }
5457
5458 msleep(250);
5459
Eric Dumazet29b44332010-10-11 10:22:12 +00005460 refcnt = netdev_refcnt_read(dev);
5461
Linus Torvalds1da177e2005-04-16 15:20:36 -07005462 if (time_after(jiffies, warning_time + 10 * HZ)) {
5463 printk(KERN_EMERG "unregister_netdevice: "
5464 "waiting for %s to become free. Usage "
5465 "count = %d\n",
Eric Dumazet29b44332010-10-11 10:22:12 +00005466 dev->name, refcnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005467 warning_time = jiffies;
5468 }
5469 }
5470}
5471
5472/* The sequence is:
5473 *
5474 * rtnl_lock();
5475 * ...
5476 * register_netdevice(x1);
5477 * register_netdevice(x2);
5478 * ...
5479 * unregister_netdevice(y1);
5480 * unregister_netdevice(y2);
5481 * ...
5482 * rtnl_unlock();
5483 * free_netdev(y1);
5484 * free_netdev(y2);
5485 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07005486 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07005487 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005488 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07005489 * without deadlocking with linkwatch via keventd.
5490 * 2) Since we run with the RTNL semaphore not held, we can sleep
5491 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07005492 *
5493 * We must not return until all unregister events added during
5494 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005495 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005496void netdev_run_todo(void)
5497{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005498 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005499
Linus Torvalds1da177e2005-04-16 15:20:36 -07005500 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005501 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07005502
5503 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005504
Linus Torvalds1da177e2005-04-16 15:20:36 -07005505 while (!list_empty(&list)) {
5506 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00005507 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005508 list_del(&dev->todo_list);
5509
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005510 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005511 printk(KERN_ERR "network todo '%s' but state %d\n",
5512 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005513 dump_stack();
5514 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005515 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005516
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005517 dev->reg_state = NETREG_UNREGISTERED;
5518
Changli Gao152102c2010-03-30 20:16:22 +00005519 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07005520
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005521 netdev_wait_allrefs(dev);
5522
5523 /* paranoia */
Eric Dumazet29b44332010-10-11 10:22:12 +00005524 BUG_ON(netdev_refcnt_read(dev));
Eric Dumazet95ae6b22010-09-15 04:04:31 +00005525 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
Eric Dumazet198caec2010-10-24 21:32:05 +00005526 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07005527 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005528
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005529 if (dev->destructor)
5530 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07005531
5532 /* Free network device */
5533 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005534 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005535}
5536
Ben Hutchings3cfde792010-07-09 09:11:52 +00005537/* Convert net_device_stats to rtnl_link_stats64. They have the same
5538 * fields in the same order, with only the type differing.
5539 */
5540static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5541 const struct net_device_stats *netdev_stats)
5542{
5543#if BITS_PER_LONG == 64
5544 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5545 memcpy(stats64, netdev_stats, sizeof(*stats64));
5546#else
5547 size_t i, n = sizeof(*stats64) / sizeof(u64);
5548 const unsigned long *src = (const unsigned long *)netdev_stats;
5549 u64 *dst = (u64 *)stats64;
5550
5551 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5552 sizeof(*stats64) / sizeof(u64));
5553 for (i = 0; i < n; i++)
5554 dst[i] = src[i];
5555#endif
5556}
5557
Eric Dumazetd83345a2009-11-16 03:36:51 +00005558/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005559 * dev_get_stats - get network device statistics
5560 * @dev: device to get statistics from
Eric Dumazet28172732010-07-07 14:58:56 -07005561 * @storage: place to store stats
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005562 *
Ben Hutchingsd7753512010-07-09 09:12:41 +00005563 * Get network statistics from device. Return @storage.
5564 * The device driver may provide its own method by setting
5565	 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5566 * otherwise the internal statistics structure is used.
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005567 */
Ben Hutchingsd7753512010-07-09 09:12:41 +00005568struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5569 struct rtnl_link_stats64 *storage)
Eric Dumazet7004bf22009-05-18 00:34:33 +00005570{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005571 const struct net_device_ops *ops = dev->netdev_ops;
5572
Eric Dumazet28172732010-07-07 14:58:56 -07005573 if (ops->ndo_get_stats64) {
5574 memset(storage, 0, sizeof(*storage));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005575 ops->ndo_get_stats64(dev, storage);
5576 } else if (ops->ndo_get_stats) {
Ben Hutchings3cfde792010-07-09 09:11:52 +00005577 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005578 } else {
5579 netdev_stats_to_stats64(storage, &dev->stats);
Eric Dumazet28172732010-07-07 14:58:56 -07005580 }
Eric Dumazetcaf586e2010-09-30 21:06:55 +00005581 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
Eric Dumazet28172732010-07-07 14:58:56 -07005582 return storage;
Rusty Russellc45d2862007-03-28 14:29:08 -07005583}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005584EXPORT_SYMBOL(dev_get_stats);
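/*
 * Example (illustrative only): callers provide the storage themselves, so
 * concurrent readers need no shared state:
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: %llu rx packets, %llu rx dropped\n",
 *		dev->name, stats.rx_packets, stats.rx_dropped);
 */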
Rusty Russellc45d2862007-03-28 14:29:08 -07005585
Eric Dumazet24824a02010-10-02 06:11:55 +00005586struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
David S. Millerdc2b4842008-07-08 17:18:23 -07005587{
Eric Dumazet24824a02010-10-02 06:11:55 +00005588 struct netdev_queue *queue = dev_ingress_queue(dev);
David S. Millerdc2b4842008-07-08 17:18:23 -07005589
Eric Dumazet24824a02010-10-02 06:11:55 +00005590#ifdef CONFIG_NET_CLS_ACT
5591 if (queue)
5592 return queue;
5593 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5594 if (!queue)
5595 return NULL;
5596 netdev_init_one_queue(dev, queue, NULL);
Eric Dumazet24824a02010-10-02 06:11:55 +00005597 queue->qdisc = &noop_qdisc;
5598 queue->qdisc_sleeping = &noop_qdisc;
5599 rcu_assign_pointer(dev->ingress_queue, queue);
5600#endif
5601 return queue;
David S. Millerbb949fb2008-07-08 16:55:56 -07005602}
5603
Linus Torvalds1da177e2005-04-16 15:20:36 -07005604/**
Tom Herbert36909ea2011-01-09 19:36:31 +00005605 * alloc_netdev_mqs - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005606 * @sizeof_priv: size of private data to allocate space for
5607 * @name: device name format string
5608 * @setup: callback to initialize device
Tom Herbert36909ea2011-01-09 19:36:31 +00005609 * @txqs: the number of TX subqueues to allocate
5610 * @rxqs: the number of RX subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005611 *
5612 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005613 *	and performs basic initialization.  Also allocates subqueue structs
Tom Herbert36909ea2011-01-09 19:36:31 +00005614 * for each queue on the device.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005615 */
Tom Herbert36909ea2011-01-09 19:36:31 +00005616struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5617 void (*setup)(struct net_device *),
5618 unsigned int txqs, unsigned int rxqs)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005619{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005620 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07005621 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005622 struct net_device *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005623
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005624 BUG_ON(strlen(name) >= sizeof(dev->name));
5625
Tom Herbert36909ea2011-01-09 19:36:31 +00005626 if (txqs < 1) {
Tom Herbert55513fb2010-10-18 17:55:58 +00005627 pr_err("alloc_netdev: Unable to allocate device "
5628 "with zero queues.\n");
5629 return NULL;
5630 }
5631
Tom Herbert36909ea2011-01-09 19:36:31 +00005632#ifdef CONFIG_RPS
5633 if (rxqs < 1) {
5634 pr_err("alloc_netdev: Unable to allocate device "
5635 "with zero RX queues.\n");
5636 return NULL;
5637 }
5638#endif
5639
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005640 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005641 if (sizeof_priv) {
5642 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005643 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005644 alloc_size += sizeof_priv;
5645 }
5646 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005647 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005648
Paolo 'Blaisorblade' Giarrusso31380de2006-04-06 22:38:28 -07005649 p = kzalloc(alloc_size, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005650 if (!p) {
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005651 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005652 return NULL;
5653 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005654
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005655 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005656 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005657
Eric Dumazet29b44332010-10-11 10:22:12 +00005658 dev->pcpu_refcnt = alloc_percpu(int);
5659 if (!dev->pcpu_refcnt)
Tom Herberte6484932010-10-18 18:04:39 +00005660 goto free_p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005661
Linus Torvalds1da177e2005-04-16 15:20:36 -07005662 if (dev_addr_init(dev))
Eric Dumazet29b44332010-10-11 10:22:12 +00005663 goto free_pcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005664
Jiri Pirko22bedad32010-04-01 21:22:57 +00005665 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005666 dev_uc_init(dev);
Jiri Pirkoccffad252009-05-22 23:22:17 +00005667
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005668 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005669
Tom Herbert36909ea2011-01-09 19:36:31 +00005670 dev->num_tx_queues = txqs;
5671 dev->real_num_tx_queues = txqs;
Tom Herberted9af2e2010-11-09 10:47:30 +00005672 if (netif_alloc_netdev_queues(dev))
5673 goto free_pcpu;
David S. Millere8a04642008-07-17 00:34:19 -07005674
Eric Dumazetdf334542010-03-24 19:13:54 +00005675#ifdef CONFIG_RPS
Tom Herbert36909ea2011-01-09 19:36:31 +00005676 dev->num_rx_queues = rxqs;
5677 dev->real_num_rx_queues = rxqs;
Tom Herbertfe822242010-11-09 10:47:38 +00005678 if (netif_alloc_rx_queues(dev))
5679 goto free_pcpu;
Eric Dumazetdf334542010-03-24 19:13:54 +00005680#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00005681
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07005682 dev->gso_max_size = GSO_MAX_SIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005683
Peter P Waskiewicz Jr15682bc2010-02-10 20:03:05 -08005684 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5685 dev->ethtool_ntuple_list.count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08005686 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005687 INIT_LIST_HEAD(&dev->unreg_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00005688 INIT_LIST_HEAD(&dev->link_watch_list);
Eric Dumazet93f154b2009-05-18 22:19:19 -07005689 dev->priv_flags = IFF_XMIT_DST_RELEASE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005690 setup(dev);
5691 strcpy(dev->name, name);
Vlad Dogarucbda10f2011-01-13 23:38:30 +00005692 dev->group = INIT_NETDEV_GROUP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005693 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005694
Eric Dumazet29b44332010-10-11 10:22:12 +00005695free_pcpu:
5696 free_percpu(dev->pcpu_refcnt);
Tom Herberted9af2e2010-11-09 10:47:30 +00005697 kfree(dev->_tx);
Tom Herbertfe822242010-11-09 10:47:38 +00005698#ifdef CONFIG_RPS
5699 kfree(dev->_rx);
5700#endif
5701
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005702free_p:
5703 kfree(p);
5704 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005705}
Tom Herbert36909ea2011-01-09 19:36:31 +00005706EXPORT_SYMBOL(alloc_netdev_mqs);
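/*
 * Example (illustrative only): allocating a multiqueue device with eight
 * TX and eight RX queues.  foo_setup() stands in for a setup callback in
 * the style of ether_setup(); "struct foo_priv" is hypothetical.
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       foo_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */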
Linus Torvalds1da177e2005-04-16 15:20:36 -07005707
5708/**
5709 * free_netdev - free network device
5710 * @dev: device
5711 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005712 * This function does the last stage of destroying an allocated device
5713 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005714 * If this is the last reference then it will be freed.
5715 */
5716void free_netdev(struct net_device *dev)
5717{
Herbert Xud565b0a2008-12-15 23:38:52 -08005718 struct napi_struct *p, *n;
5719
Denis V. Lunevf3005d72008-04-16 02:02:18 -07005720 release_net(dev_net(dev));
5721
David S. Millere8a04642008-07-17 00:34:19 -07005722 kfree(dev->_tx);
Tom Herbertfe822242010-11-09 10:47:38 +00005723#ifdef CONFIG_RPS
5724 kfree(dev->_rx);
5725#endif
David S. Millere8a04642008-07-17 00:34:19 -07005726
Eric Dumazet24824a02010-10-02 06:11:55 +00005727 kfree(rcu_dereference_raw(dev->ingress_queue));
5728
Jiri Pirkof001fde2009-05-05 02:48:28 +00005729 /* Flush device addresses */
5730 dev_addr_flush(dev);
5731
Peter P Waskiewicz Jr15682bc2010-02-10 20:03:05 -08005732 /* Clear ethtool n-tuple list */
5733 ethtool_ntuple_flush(dev);
5734
Herbert Xud565b0a2008-12-15 23:38:52 -08005735 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5736 netif_napi_del(p);
5737
Eric Dumazet29b44332010-10-11 10:22:12 +00005738 free_percpu(dev->pcpu_refcnt);
5739 dev->pcpu_refcnt = NULL;
5740
Stephen Hemminger3041a062006-05-26 13:25:24 -07005741 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005742 if (dev->reg_state == NETREG_UNINITIALIZED) {
5743 kfree((char *)dev - dev->padded);
5744 return;
5745 }
5746
5747 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5748 dev->reg_state = NETREG_RELEASED;
5749
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07005750 /* will free via device release */
5751 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005752}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005753EXPORT_SYMBOL(free_netdev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005754
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005755/**
5756 * synchronize_net - Synchronize with packet receive processing
5757 *
5758 * Wait for packets currently being received to be done.
5759 * Does not block later packets from starting.
5760 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005761void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005762{
5763 might_sleep();
Paul E. McKenneyfbd568a3e2005-05-01 08:59:04 -07005764 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005765}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005766EXPORT_SYMBOL(synchronize_net);
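/*
 * Example (illustrative only): a writer replacing an RCU-protected pointer
 * used by the receive path waits for in-flight readers before freeing the
 * old object.  "some_ptr" is a hypothetical field, and the update is
 * assumed to happen under RTNL or another write-side lock.
 *
 *	old = dev->some_ptr;
 *	rcu_assign_pointer(dev->some_ptr, NULL);
 *	synchronize_net();
 *	kfree(old);
 */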
Linus Torvalds1da177e2005-04-16 15:20:36 -07005767
5768/**
Eric Dumazet44a08732009-10-27 07:03:04 +00005769 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07005770 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00005771 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08005772 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07005773 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005774 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00005775 *	If head is not NULL, the device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005776 *
5777 * Callers must hold the rtnl semaphore. You may want
5778 * unregister_netdev() instead of this.
5779 */
5780
Eric Dumazet44a08732009-10-27 07:03:04 +00005781void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005782{
Herbert Xua6620712007-12-12 19:21:56 -08005783 ASSERT_RTNL();
5784
Eric Dumazet44a08732009-10-27 07:03:04 +00005785 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005786 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00005787 } else {
5788 rollback_registered(dev);
5789 /* Finish processing unregister after unlock */
5790 net_set_todo(dev);
5791 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005792}
Eric Dumazet44a08732009-10-27 07:03:04 +00005793EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005794
5795/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005796 * unregister_netdevice_many - unregister many devices
5797 * @head: list of devices
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005798 */
5799void unregister_netdevice_many(struct list_head *head)
5800{
5801 struct net_device *dev;
5802
5803 if (!list_empty(head)) {
5804 rollback_registered_many(head);
5805 list_for_each_entry(dev, head, unreg_list)
5806 net_set_todo(dev);
5807 }
5808}
Eric Dumazet63c80992009-10-27 07:06:49 +00005809EXPORT_SYMBOL(unregister_netdevice_many);
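/*
 * Example (illustrative only): batching several unregisters under a single
 * rtnl hold so the expensive synchronization in rollback_registered_many()
 * is paid only once.  dev1/dev2 stand for already-registered devices.
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();	/* netdev_run_todo() finishes the teardown */
 */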
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005810
5811/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005812 * unregister_netdev - remove device from the kernel
5813 * @dev: device
5814 *
5815 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005816 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005817 *
5818 * This is just a wrapper for unregister_netdevice that takes
5819 * the rtnl semaphore. In general you want to use this and not
5820 * unregister_netdevice.
5821 */
5822void unregister_netdev(struct net_device *dev)
5823{
5824 rtnl_lock();
5825 unregister_netdevice(dev);
5826 rtnl_unlock();
5827}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005828EXPORT_SYMBOL(unregister_netdev);
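/*
 * Example (illustrative only): the usual driver remove path pairs
 * unregister_netdev() with free_netdev() once unregistration (including
 * the refcount wait performed from rtnl_unlock()) has completed:
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */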
5829
Eric W. Biedermance286d32007-09-12 13:53:49 +02005830/**
5831 *	dev_change_net_namespace - move device to a different network namespace
5832 * @dev: device
5833 * @net: network namespace
5834 * @pat: If not NULL name pattern to try if the current device name
5835 * is already taken in the destination network namespace.
5836 *
5837 * This function shuts down a device interface and moves it
5838 * to a new network namespace. On success 0 is returned, on
5839 *	failure a negative errno code is returned.
5840 *
5841 * Callers must hold the rtnl semaphore.
5842 */
5843
5844int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5845{
Eric W. Biedermance286d32007-09-12 13:53:49 +02005846 int err;
5847
5848 ASSERT_RTNL();
5849
5850 /* Don't allow namespace local devices to be moved. */
5851 err = -EINVAL;
5852 if (dev->features & NETIF_F_NETNS_LOCAL)
5853 goto out;
5854
5855	/* Ensure the device has been registered */
5856 err = -EINVAL;
5857 if (dev->reg_state != NETREG_REGISTERED)
5858 goto out;
5859
5860	/* Get out if there is nothing to do */
5861 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09005862 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005863 goto out;
5864
5865 /* Pick the destination device name, and ensure
5866 * we can use it in the destination network namespace.
5867 */
5868 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00005869 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005870 /* We get here if we can't use the current device name */
5871 if (!pat)
5872 goto out;
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00005873 if (dev_get_valid_name(dev, pat, 1))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005874 goto out;
5875 }
5876
5877 /*
5878 * And now a mini version of register_netdevice unregister_netdevice.
5879 */
5880
5881 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07005882 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005883
5884 /* And unlink it from device chain */
5885 err = -ENODEV;
5886 unlist_netdevice(dev);
5887
5888 synchronize_net();
5889
5890 /* Shutdown queueing discipline. */
5891 dev_shutdown(dev);
5892
5893	/* Notify protocols that we are about to destroy
5894	   this device. They should clean up all of their state.
David Lamparter3b27e102010-09-17 03:22:19 +00005895
5896 Note that dev->reg_state stays at NETREG_REGISTERED.
5897 This is wanted because this way 8021q and macvlan know
5898 the device is just moving and can keep their slaves up.
Eric W. Biedermance286d32007-09-12 13:53:49 +02005899 */
5900 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005901 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005902
5903 /*
5904 * Flush the unicast and multicast chains
5905 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005906 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005907 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005908
5909 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005910 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005911
Eric W. Biedermance286d32007-09-12 13:53:49 +02005912 /* If there is an ifindex conflict assign a new one */
5913 if (__dev_get_by_index(net, dev->ifindex)) {
5914 int iflink = (dev->iflink == dev->ifindex);
5915 dev->ifindex = dev_new_index(net);
5916 if (iflink)
5917 dev->iflink = dev->ifindex;
5918 }
5919
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005920 /* Fixup kobjects */
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07005921 err = device_rename(&dev->dev, dev->name);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005922 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005923
5924 /* Add the device back in the hashes */
5925 list_netdevice(dev);
5926
5927	/* Notify protocols that a new device appeared. */
5928 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5929
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005930 /*
5931 * Prevent userspace races by waiting until the network
5932	 * device is fully set up before sending notifications.
5933 */
5934 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5935
Eric W. Biedermance286d32007-09-12 13:53:49 +02005936 synchronize_net();
5937 err = 0;
5938out:
5939 return err;
5940}
Johannes Berg463d0182009-07-14 00:33:35 +02005941EXPORT_SYMBOL_GPL(dev_change_net_namespace);
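/*
 * Example (illustrative only): moving a device into another namespace and
 * falling back to a "dev%d" pattern if its current name is already taken
 * there.  "net" stands for a struct net the caller already holds.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, net, "dev%d");
 *	rtnl_unlock();
 */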
Eric W. Biedermance286d32007-09-12 13:53:49 +02005942
Linus Torvalds1da177e2005-04-16 15:20:36 -07005943static int dev_cpu_callback(struct notifier_block *nfb,
5944 unsigned long action,
5945 void *ocpu)
5946{
5947 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005948 struct sk_buff *skb;
5949 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5950 struct softnet_data *sd, *oldsd;
5951
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005952 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005953 return NOTIFY_OK;
5954
5955 local_irq_disable();
5956 cpu = smp_processor_id();
5957 sd = &per_cpu(softnet_data, cpu);
5958 oldsd = &per_cpu(softnet_data, oldcpu);
5959
5960 /* Find end of our completion_queue. */
5961 list_skb = &sd->completion_queue;
5962 while (*list_skb)
5963 list_skb = &(*list_skb)->next;
5964 /* Append completion queue from offline CPU. */
5965 *list_skb = oldsd->completion_queue;
5966 oldsd->completion_queue = NULL;
5967
Linus Torvalds1da177e2005-04-16 15:20:36 -07005968 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00005969 if (oldsd->output_queue) {
5970 *sd->output_queue_tailp = oldsd->output_queue;
5971 sd->output_queue_tailp = oldsd->output_queue_tailp;
5972 oldsd->output_queue = NULL;
5973 oldsd->output_queue_tailp = &oldsd->output_queue;
5974 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005975
5976 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5977 local_irq_enable();
5978
5979 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00005980 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5981 netif_rx(skb);
5982 input_queue_head_incr(oldsd);
5983 }
Tom Herbertfec5e652010-04-16 16:01:27 -07005984 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005985 netif_rx(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00005986 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07005987 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005988
5989 return NOTIFY_OK;
5990}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005991
5992
Herbert Xu7f353bf2007-08-10 15:47:58 -07005993/**
Herbert Xub63365a2008-10-23 01:11:29 -07005994 * netdev_increment_features - increment feature set by one
5995 * @all: current feature set
5996 * @one: new feature set
5997 * @mask: mask feature set
Herbert Xu7f353bf2007-08-10 15:47:58 -07005998 *
5999 * Computes a new feature set after adding a device with feature set
Herbert Xub63365a2008-10-23 01:11:29 -07006000 * @one to the master device with current feature set @all. Will not
6001 * enable anything that is off in @mask. Returns the new feature set.
Herbert Xu7f353bf2007-08-10 15:47:58 -07006002 */
Herbert Xub63365a2008-10-23 01:11:29 -07006003unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6004 unsigned long mask)
Herbert Xu7f353bf2007-08-10 15:47:58 -07006005{
Herbert Xub63365a2008-10-23 01:11:29 -07006006 /* If device needs checksumming, downgrade to it. */
Eric Dumazetd1b19df2009-09-03 01:29:39 -07006007 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
Herbert Xub63365a2008-10-23 01:11:29 -07006008 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6009 else if (mask & NETIF_F_ALL_CSUM) {
6010 /* If one device supports v4/v6 checksumming, set for all. */
6011 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6012 !(all & NETIF_F_GEN_CSUM)) {
6013 all &= ~NETIF_F_ALL_CSUM;
6014 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6015 }
Herbert Xu7f353bf2007-08-10 15:47:58 -07006016
Herbert Xub63365a2008-10-23 01:11:29 -07006017 /* If one device supports hw checksumming, set for all. */
6018 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6019 all &= ~NETIF_F_ALL_CSUM;
6020 all |= NETIF_F_HW_CSUM;
6021 }
6022 }
Herbert Xu7f353bf2007-08-10 15:47:58 -07006023
Herbert Xub63365a2008-10-23 01:11:29 -07006024 one |= NETIF_F_ALL_CSUM;
Herbert Xu7f353bf2007-08-10 15:47:58 -07006025
Herbert Xub63365a2008-10-23 01:11:29 -07006026 one |= all & NETIF_F_ONE_FOR_ALL;
Sridhar Samudralad9f59502009-10-07 12:24:25 +00006027 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
Herbert Xub63365a2008-10-23 01:11:29 -07006028 all |= one & mask & NETIF_F_ONE_FOR_ALL;
Herbert Xu7f353bf2007-08-10 15:47:58 -07006029
6030 return all;
6031}
Herbert Xub63365a2008-10-23 01:11:29 -07006032EXPORT_SYMBOL(netdev_increment_features);
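/*
 * Example (illustrative only): a bridge- or bonding-style master folding
 * its ports' feature sets together.  "master", "feature_mask" and
 * "port_list" are hypothetical fields of the master's private structure.
 *
 *	unsigned long features = master->feature_mask & ~NETIF_F_ALL_CSUM;
 *
 *	list_for_each_entry(p, &master->port_list, list)
 *		features = netdev_increment_features(features,
 *						     p->dev->features,
 *						     NETIF_F_ALL_CSUM);
 *	master->dev->features = features;
 */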
Herbert Xu7f353bf2007-08-10 15:47:58 -07006033
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006034static struct hlist_head *netdev_create_hash(void)
6035{
6036 int i;
6037 struct hlist_head *hash;
6038
6039 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6040 if (hash != NULL)
6041 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6042 INIT_HLIST_HEAD(&hash[i]);
6043
6044 return hash;
6045}
6046
Eric W. Biederman881d9662007-09-17 11:56:21 -07006047/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07006048static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006049{
Eric W. Biederman881d9662007-09-17 11:56:21 -07006050 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07006051
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006052 net->dev_name_head = netdev_create_hash();
6053 if (net->dev_name_head == NULL)
6054 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006055
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006056 net->dev_index_head = netdev_create_hash();
6057 if (net->dev_index_head == NULL)
6058 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006059
6060 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07006061
6062err_idx:
6063 kfree(net->dev_name_head);
6064err_name:
6065 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07006066}
6067
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07006068/**
6069 * netdev_drivername - network driver for the device
6070 * @dev: network device
6071 * @buffer: buffer for resulting name
6072 * @len: size of buffer
6073 *
6074 * Determine network driver for device.
6075 */
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07006076char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
Arjan van de Ven6579e572008-07-21 13:31:48 -07006077{
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07006078 const struct device_driver *driver;
6079 const struct device *parent;
Arjan van de Ven6579e572008-07-21 13:31:48 -07006080
6081 if (len <= 0 || !buffer)
6082 return buffer;
6083 buffer[0] = 0;
6084
6085 parent = dev->dev.parent;
6086
6087 if (!parent)
6088 return buffer;
6089
6090 driver = parent->driver;
6091 if (driver && driver->name)
6092 strlcpy(buffer, driver->name, len);
6093 return buffer;
6094}
6095
Joe Perches256df2f2010-06-27 01:02:35 +00006096static int __netdev_printk(const char *level, const struct net_device *dev,
6097 struct va_format *vaf)
6098{
6099 int r;
6100
6101 if (dev && dev->dev.parent)
6102 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6103 netdev_name(dev), vaf);
6104 else if (dev)
6105 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6106 else
6107 r = printk("%s(NULL net_device): %pV", level, vaf);
6108
6109 return r;
6110}
6111
6112int netdev_printk(const char *level, const struct net_device *dev,
6113 const char *format, ...)
6114{
6115 struct va_format vaf;
6116 va_list args;
6117 int r;
6118
6119 va_start(args, format);
6120
6121 vaf.fmt = format;
6122 vaf.va = &args;
6123
6124 r = __netdev_printk(level, dev, &vaf);
6125 va_end(args);
6126
6127 return r;
6128}
6129EXPORT_SYMBOL(netdev_printk);
6130
6131#define define_netdev_printk_level(func, level) \
6132int func(const struct net_device *dev, const char *fmt, ...) \
6133{ \
6134 int r; \
6135 struct va_format vaf; \
6136 va_list args; \
6137 \
6138 va_start(args, fmt); \
6139 \
6140 vaf.fmt = fmt; \
6141 vaf.va = &args; \
6142 \
6143 r = __netdev_printk(level, dev, &vaf); \
6144 va_end(args); \
6145 \
6146 return r; \
6147} \
6148EXPORT_SYMBOL(func);
6149
6150define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6151define_netdev_printk_level(netdev_alert, KERN_ALERT);
6152define_netdev_printk_level(netdev_crit, KERN_CRIT);
6153define_netdev_printk_level(netdev_err, KERN_ERR);
6154define_netdev_printk_level(netdev_warn, KERN_WARNING);
6155define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6156define_netdev_printk_level(netdev_info, KERN_INFO);
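/*
 * Example (illustrative only): these helpers prefix the message with the
 * driver and device name, so a driver can simply write
 *
 *	netdev_warn(dev, "link is down\n");
 *	netdev_info(dev, "MTU changed to %d\n", new_mtu);
 *
 * (new_mtu being a local variable in this sketch) and get output roughly
 * of the form "<driver> <bus id>: <ifname>: link is down".
 */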
6157
Pavel Emelyanov46650792007-10-08 20:38:39 -07006158static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07006159{
6160 kfree(net->dev_name_head);
6161 kfree(net->dev_index_head);
6162}
6163
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006164static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07006165 .init = netdev_init,
6166 .exit = netdev_exit,
6167};
6168
Pavel Emelyanov46650792007-10-08 20:38:39 -07006169static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02006170{
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006171 struct net_device *dev, *aux;
Eric W. Biedermance286d32007-09-12 13:53:49 +02006172 /*
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006173 * Push all migratable network devices back to the
Eric W. Biedermance286d32007-09-12 13:53:49 +02006174 * initial network namespace
6175 */
6176 rtnl_lock();
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006177 for_each_netdev_safe(net, dev, aux) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006178 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006179 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02006180
6181		/* Ignore unmovable devices (e.g. loopback) */
6182 if (dev->features & NETIF_F_NETNS_LOCAL)
6183 continue;
6184
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00006185 /* Leave virtual devices for the generic cleanup */
6186 if (dev->rtnl_link_ops)
6187 continue;
Eric W. Biedermand0c082c2008-11-05 15:59:38 -08006188
Eric W. Biedermance286d32007-09-12 13:53:49 +02006189		/* Push remaining network devices to init_net */
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006190 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6191 err = dev_change_net_namespace(dev, &init_net, fb_name);
Eric W. Biedermance286d32007-09-12 13:53:49 +02006192 if (err) {
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006193 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
Eric W. Biedermance286d32007-09-12 13:53:49 +02006194 __func__, dev->name, err);
Pavel Emelyanovaca51392008-05-08 01:24:25 -07006195 BUG();
Eric W. Biedermance286d32007-09-12 13:53:49 +02006196 }
6197 }
6198 rtnl_unlock();
6199}
6200
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006201static void __net_exit default_device_exit_batch(struct list_head *net_list)
6202{
6203	/* At exit all network devices must be removed from a network
Uwe Kleine-Königb5950762010-11-01 15:38:34 -04006204 * namespace. Do this in the reverse order of registration.
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006205 * Do this across as many network namespaces as possible to
6206 * improve batching efficiency.
6207 */
6208 struct net_device *dev;
6209 struct net *net;
6210 LIST_HEAD(dev_kill_list);
6211
6212 rtnl_lock();
6213 list_for_each_entry(net, net_list, exit_list) {
6214 for_each_netdev_reverse(net, dev) {
6215 if (dev->rtnl_link_ops)
6216 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6217 else
6218 unregister_netdevice_queue(dev, &dev_kill_list);
6219 }
6220 }
6221 unregister_netdevice_many(&dev_kill_list);
6222 rtnl_unlock();
6223}
6224
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006225static struct pernet_operations __net_initdata default_device_ops = {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006226 .exit = default_device_exit,
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006227 .exit_batch = default_device_exit_batch,
Eric W. Biedermance286d32007-09-12 13:53:49 +02006228};
6229
Linus Torvalds1da177e2005-04-16 15:20:36 -07006230/*
6231 * Initialize the DEV module. At boot time this walks the device list and
6232 * unhooks any devices that fail to initialise (normally hardware not
6233 * present) and leaves us with a valid list of present and active devices.
6234 *
6235 */
6236
6237/*
6238 * This is called single threaded during boot, so no need
6239 * to take the rtnl semaphore.
6240 */
6241static int __init net_dev_init(void)
6242{
6243 int i, rc = -ENOMEM;
6244
6245 BUG_ON(!dev_boot_phase);
6246
Linus Torvalds1da177e2005-04-16 15:20:36 -07006247 if (dev_proc_init())
6248 goto out;
6249
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006250 if (netdev_kobject_init())
Linus Torvalds1da177e2005-04-16 15:20:36 -07006251 goto out;
6252
6253 INIT_LIST_HEAD(&ptype_all);
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08006254 for (i = 0; i < PTYPE_HASH_SIZE; i++)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006255 INIT_LIST_HEAD(&ptype_base[i]);
6256
Eric W. Biederman881d9662007-09-17 11:56:21 -07006257 if (register_pernet_subsys(&netdev_net_ops))
6258 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006259
6260 /*
6261 * Initialise the packet receive queues.
6262 */
6263
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07006264 for_each_possible_cpu(i) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006265 struct softnet_data *sd = &per_cpu(softnet_data, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006266
Changli Gaodee42872010-05-02 05:42:16 +00006267 memset(sd, 0, sizeof(*sd));
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006268 skb_queue_head_init(&sd->input_pkt_queue);
Changli Gao6e7676c2010-04-27 15:07:33 -07006269 skb_queue_head_init(&sd->process_queue);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006270 sd->completion_queue = NULL;
6271 INIT_LIST_HEAD(&sd->poll_list);
Changli Gaoa9cbd582010-04-26 23:06:24 +00006272 sd->output_queue = NULL;
6273 sd->output_queue_tailp = &sd->output_queue;
Eric Dumazetdf334542010-03-24 19:13:54 +00006274#ifdef CONFIG_RPS
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006275 sd->csd.func = rps_trigger_softirq;
6276 sd->csd.info = sd;
6277 sd->csd.flags = 0;
6278 sd->cpu = i;
Tom Herbert1e94d722010-03-18 17:45:44 -07006279#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00006280
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006281 sd->backlog.poll = process_backlog;
6282 sd->backlog.weight = weight_p;
6283 sd->backlog.gro_list = NULL;
6284 sd->backlog.gro_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006285 }
6286
Linus Torvalds1da177e2005-04-16 15:20:36 -07006287 dev_boot_phase = 0;
6288
Eric W. Biederman505d4f72008-11-07 22:54:20 -08006289	/* The loopback device is special: if any other network device
6290	 * is present in a network namespace, the loopback device must be
6291	 * present too. Since we now dynamically allocate and free the
6292	 * loopback device, ensure this invariant is maintained by
6293	 * keeping the loopback device as the first device on the
6294	 * list of network devices, so that it is the first device
6295	 * that appears and the last network device
6296	 * that disappears.
6297 */
6298 if (register_pernet_device(&loopback_net_ops))
6299 goto out;
6300
6301 if (register_pernet_device(&default_device_ops))
6302 goto out;
6303
Carlos R. Mafra962cf362008-05-15 11:15:37 -03006304 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6305 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006306
6307 hotcpu_notifier(dev_cpu_callback, 0);
6308 dst_init();
6309 dev_mcast_init();
6310 rc = 0;
6311out:
6312 return rc;
6313}
6314
6315subsys_initcall(net_dev_init);
6316
Krishna Kumare88721f2009-02-18 17:55:02 -08006317static int __init initialize_hashrnd(void)
6318{
Tom Herbert0a9627f2010-03-16 08:03:29 +00006319 get_random_bytes(&hashrnd, sizeof(hashrnd));
Krishna Kumare88721f2009-02-18 17:55:02 -08006320 return 0;
6321}
6322
6323late_initcall_sync(initialize_hashrnd);
6324