/*
 *      NET3    Protocol independent device support routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 *      Derived from the non IP parts of dev.c 1.0.19
 *              Authors:        Ross Biro
 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *      Additional Authors:
 *              Florian la Roche <rzsfl@rz.uni-sb.de>
 *              Alan Cox <gw4pts@gw4pts.ampr.org>
 *              David Hinds <dahinds@users.sourceforge.net>
 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *              Adam Sulmicki <adam@cfar.umd.edu>
 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *      Changes:
 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 *                                      to 2 if register_netdev gets called
 *                                      before net_dev_init & also removed a
 *                                      few lines of code in the process.
 *              Alan Cox        :       device private ioctl copies fields back.
 *              Alan Cox        :       Transmit queue code does relevant
 *                                      stunts to keep the queue safe.
 *              Alan Cox        :       Fixed double lock.
 *              Alan Cox        :       Fixed promisc NULL pointer trap
 *              ????????        :       Support the full private ioctl range
 *              Alan Cox        :       Moved ioctl permission check into
 *                                      drivers
 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
 *              Alan Cox        :       100 backlog just doesn't cut it when
 *                                      you start doing multicast video 8)
 *              Alan Cox        :       Rewrote net_bh and list manager.
 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 *              Alan Cox        :       Took out transmit every packet pass
 *                                      Saved a few bytes in the ioctl handler
 *              Alan Cox        :       Network driver sets packet type before
 *                                      calling netif_rx. Saves a function
 *                                      call a packet.
 *              Alan Cox        :       Hashed net_bh()
 *              Richard Kooijman:       Timestamp fixes.
 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
 *              Alan Cox        :       Device lock protection.
 *              Alan Cox        :       Fixed nasty side effect of device close
 *                                      changes.
 *              Rudi Cilibrasi  :       Pass the right thing to
 *                                      set_mac_address()
 *              Dave Miller     :       32bit quantity for the device lock to
 *                                      make it work out on a Sparc.
 *              Bjorn Ekwall    :       Added KERNELD hack.
 *              Alan Cox        :       Cleaned up the backlog initialise.
 *              Craig Metz      :       SIOCGIFCONF fix if space for under
 *                                      1 device.
 *              Thomas Bogendoerfer :   Return ENODEV for dev_open, if there
 *                                      is no device open function.
 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
 *              Michael Chastain :      Fix signed/unsigned for SIOCGIFCONF
 *              Cyrus Durgin    :       Cleaned for KMOD
 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
 *                                      A network device unload needs to purge
 *                                      the backlog queue.
 *              Paul Rusty Russell :    SIOCSIFNAME
 *              Pekka Riikonen  :       Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *                                      indefinitely on dev->refcnt
 *              J Hadi Salim    :       - Backlog queue sampling
 *                                      - netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/if_bridge.h>
#include <linux/if_macvlan.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <linux/pci.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *      The list of packet types we will receive (as opposed to discard)
 *      and the routines to invoke.
 *
 *      Why 16. Because with 16 the only overlap we get on a hash of the
 *      low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *             sure which should go first, but I bet it won't make much
 *             difference if we are running VLANs.  The good news is that
 *             this protocol won't be in the list unless compiled in, so
 *             the average user (w/out VLANs) will not be adversely affected.
 *             --BLG
 *
 *              0800    IP
 *              8100    802.1Q VLAN
 *              0001    802.3
 *              0002    AX.25
 *              0004    802.2
 *              8035    RARP
 *              0005    SNAP
 *              0805    X.25
 *              0806    ARP
 *              8137    IPX
 *              0009    Localtalk
 *              86DD    IPv6
 */

#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;        /* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
        unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
        struct net *net = dev_net(dev);

        ASSERT_RTNL();

        write_lock_bh(&dev_base_lock);
        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        hlist_add_head_rcu(&dev->index_hlist,
                           dev_index_hash(net, dev->ifindex));
        write_unlock_bh(&dev_base_lock);
        return 0;
}

/* Device list removal
 * caller must respect an RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
        ASSERT_RTNL();

        /* Unlink dev from the device chain */
        write_lock_bh(&dev_base_lock);
        list_del_rcu(&dev->dev_list);
        hlist_del_rcu(&dev->name_hlist);
        hlist_del_rcu(&dev->index_hlist);
        write_unlock_bh(&dev_base_lock);
}

/*
 *      Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
         ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
         ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
         ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
         "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
         "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
         "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
                if (netdev_lock_type[i] == dev_type)
                        return i;
        /* the last key is used by default */
        return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
        int i;

        i = netdev_lock_pos(dev_type);
        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
                                   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
        int i;

        i = netdev_lock_pos(dev->type);
        lockdep_set_class_and_name(&dev->addr_list_lock,
                                   &netdev_addr_lock_key[i],
                                   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

                Protocol management and registration routines

*******************************************************************************/

/*
 *      Add a protocol ID to the list. Now that the input handler is
 *      smarter we can dispense with all the messy stuff that used to be
 *      here.
 *
 *      BEWARE!!! Protocol handlers that mangle input packets MUST BE
 *      last in the hash buckets, and checking protocol handlers MUST
 *      start from the promiscuous ptype_all chain in net_bh.
 *      It is true now, do not change it.
 *      Explanation follows: if a packet-mangling protocol handler were
 *      first on the list, it could not sense that the packet is cloned
 *      and should be copied-on-write; it would change the packet and
 *      subsequent readers would see a broken packet.
 *                                                      --ANK (980803)
 */

/**
 *      dev_add_pack - add packet handler
 *      @pt: packet type declaration
 *
 *      Add a protocol handler to the networking stack. The passed &packet_type
 *      is linked into kernel lists and may not be freed until it has been
 *      removed from the kernel lists.
 *
 *      This call does not sleep, therefore it cannot guarantee that all
 *      CPUs that are in the middle of receiving packets will see the new
 *      packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
        int hash;

        spin_lock_bh(&ptype_lock);
        if (pt->type == htons(ETH_P_ALL))
                list_add_rcu(&pt->list, &ptype_all);
        else {
                hash = ntohs(pt->type) & PTYPE_HASH_MASK;
                list_add_rcu(&pt->list, &ptype_base[hash]);
        }
        spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
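
/*
 * Illustrative sketch (not part of the original file): a module could
 * register a handler this way; ETH_P_FOO and my_rcv are assumed names,
 * not definitions from this file.
 *
 *      static struct packet_type my_ptype __read_mostly = {
 *              .type = cpu_to_be16(ETH_P_FOO),
 *              .func = my_rcv,
 *      };
 *
 *      At module init:   dev_add_pack(&my_ptype);
 *      At module exit:   dev_remove_pack(&my_ptype);   (this one sleeps)
 */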

/**
 *      __dev_remove_pack - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      The packet type might still be in use by receivers
 *      and must not be freed until after all the CPUs have gone
 *      through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
        struct list_head *head;
        struct packet_type *pt1;

        spin_lock_bh(&ptype_lock);

        if (pt->type == htons(ETH_P_ALL))
                head = &ptype_all;
        else
                head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

        list_for_each_entry(pt1, head, list) {
                if (pt == pt1) {
                        list_del_rcu(&pt->list);
                        goto out;
                }
        }

        printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
        spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *      dev_remove_pack - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the packet
 *      type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        __dev_remove_pack(pt);

        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

/******************************************************************************

                      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *      netdev_boot_setup_add - add new setup entry
 *      @name: name of the device
 *      @map: configured settings for the device
 *
 *      Adds new setup entry to the dev_boot_setup list.  The function
 *      returns 0 on error and 1 on success.  This is a generic routine
 *      for all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
        struct netdev_boot_setup *s;
        int i;

        s = dev_boot_setup;
        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
                        memset(s[i].name, 0, sizeof(s[i].name));
                        strlcpy(s[i].name, name, IFNAMSIZ);
                        memcpy(&s[i].map, map, sizeof(s[i].map));
                        break;
                }
        }

        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *      netdev_boot_setup_check - check boot time settings
 *      @dev: the netdevice
 *
 *      Check boot time settings for the device.
 *      The found settings are set for the device to be used
 *      later in the device probing.
 *      Returns 0 if no settings are found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
        struct netdev_boot_setup *s = dev_boot_setup;
        int i;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
                    !strcmp(dev->name, s[i].name)) {
                        dev->irq = s[i].map.irq;
                        dev->base_addr = s[i].map.base_addr;
                        dev->mem_start = s[i].map.mem_start;
                        dev->mem_end = s[i].map.mem_end;
                        return 1;
                }
        }
        return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *      netdev_boot_base - get address from boot time settings
 *      @prefix: prefix for network device
 *      @unit: id for network device
 *
 *      Check boot time settings for the base address of device.
 *      The found settings are set for the device to be used
 *      later in the device probing.
 *      Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
        const struct netdev_boot_setup *s = dev_boot_setup;
        char name[IFNAMSIZ];
        int i;

        sprintf(name, "%s%d", prefix, unit);

        /*
         * If device already registered then return base of 1
         * to indicate not to probe for this interface
         */
        if (__dev_get_by_name(&init_net, name))
                return 1;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
                if (!strcmp(name, s[i].name))
                        return s[i].map.base_addr;
        return 0;
}

/*
 * Saves the settings configured at boot time for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
        int ints[5];
        struct ifmap map;

        str = get_options(str, ARRAY_SIZE(ints), ints);
        if (!str || !*str)
                return 0;

        /* Save settings */
        memset(&map, 0, sizeof(map));
        if (ints[0] > 0)
                map.irq = ints[1];
        if (ints[0] > 1)
                map.base_addr = ints[2];
        if (ints[0] > 2)
                map.mem_start = ints[3];
        if (ints[0] > 3)
                map.mem_end = ints[4];

        /* Add new entry to the list */
        return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
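
/*
 * Example (illustrative, values assumed): booting with
 *
 *      netdev=5,0x240,0,0,eth0
 *
 * makes get_options() above parse irq=5 and base_addr=0x240, leaving
 * "eth0" as the name, so the settings are later picked up for that
 * device by netdev_boot_setup_check().
 */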

/*******************************************************************************

                Device Interface Subroutines

*******************************************************************************/

/**
 *      __dev_get_by_name - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name. Must be called under RTNL semaphore
 *      or @dev_base_lock. If the name is found a pointer to the device
 *      is returned. If the name is not found then %NULL is returned. The
 *      reference counters are not incremented so the caller must be
 *      careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
        struct hlist_node *p;
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry(dev, p, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *      dev_get_by_name_rcu - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name.
 *      If the name is found a pointer to the device is returned.
 *      If the name is not found then %NULL is returned.
 *      The reference counters are not incremented so the caller must be
 *      careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
        struct hlist_node *p;
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry_rcu(dev, p, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *      dev_get_by_name - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name. This can be called from any
 *      context and does its own locking. The returned handle has
 *      the usage count incremented and the caller must use dev_put() to
 *      release it when it is no longer needed. %NULL is returned if no
 *      matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
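
/*
 * Illustrative sketch (assumed device name): a refcounted lookup from
 * process context; the caller owns a reference and must drop it with
 * dev_put() when done.
 *
 *      struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *      if (dev) {
 *              ... use dev ...
 *              dev_put(dev);
 *      }
 */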

/**
 *      __dev_get_by_index - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns %NULL if the device
 *      is not found or a pointer to the device. The device has not
 *      had its reference counter increased so the caller must be careful
 *      about locking. The caller must hold either the RTNL semaphore
 *      or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
        struct hlist_node *p;
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry(dev, p, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *      dev_get_by_index_rcu - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns %NULL if the device
 *      is not found or a pointer to the device. The device has not
 *      had its reference counter increased so the caller must be careful
 *      about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
        struct hlist_node *p;
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry_rcu(dev, p, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
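
/*
 * Illustrative sketch: a lockless lookup; the result is only valid
 * inside the RCU read-side critical section unless a reference is
 * taken with dev_hold().
 *
 *      rcu_read_lock();
 *      dev = dev_get_by_index_rcu(net, ifindex);
 *      if (dev)
 *              ... use dev here, or dev_hold(dev) to keep it ...
 *      rcu_read_unlock();
 */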


/**
 *      dev_get_by_index - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns NULL if the device
 *      is not found or a pointer to the device. The device returned has
 *      had a reference added and the pointer is safe until the user calls
 *      dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *      dev_getbyhwaddr - find a device by its hardware address
 *      @net: the applicable net namespace
 *      @type: media type of device
 *      @ha: hardware address
 *
 *      Search for an interface by MAC address. Returns NULL if the device
 *      is not found or a pointer to the device. The caller must hold the
 *      rtnl semaphore. The returned device has not had its ref count
 *      increased and the caller must therefore be careful about locking.
 *
 *      BUGS:
 *      If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
        struct net_device *dev;

        ASSERT_RTNL();

        for_each_netdev(net, dev)
                if (dev->type == type &&
                    !memcmp(dev->dev_addr, ha, dev->addr_len))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev;

        ASSERT_RTNL();
        for_each_netdev(net, dev)
                if (dev->type == type)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev, *ret = NULL;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                if (dev->type == type) {
                        dev_hold(dev);
                        ret = dev;
                        break;
                }
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *      dev_get_by_flags - find any device with given flags
 *      @net: the applicable net namespace
 *      @if_flags: IFF_* values
 *      @mask: bitmask of bits in if_flags to check
 *
 *      Search for any interface with the given flags. Returns NULL if a device
 *      is not found or a pointer to the device. The device returned has
 *      had a reference added and the pointer is safe until the user calls
 *      dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
                                    unsigned short mask)
{
        struct net_device *dev, *ret;

        ret = NULL;
        rcu_read_lock();
        for_each_netdev_rcu(net, dev) {
                if (((dev->flags ^ if_flags) & mask) == 0) {
                        dev_hold(dev);
                        ret = dev;
                        break;
                }
        }
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(dev_get_by_flags);
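
/*
 * Illustrative sketch: find any interface that is administratively up,
 * by matching only the IFF_UP bit; the reference taken for the caller
 * must be dropped afterwards.
 *
 *      dev = dev_get_by_flags(net, IFF_UP, IFF_UP);
 *      if (dev)
 *              dev_put(dev);
 */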

/**
 *      dev_valid_name - check if name is okay for network device
 *      @name: name string
 *
 *      Network device names need to be valid file names to
 *      allow sysfs to work.  We also disallow any kind of
 *      whitespace.
 */
int dev_valid_name(const char *name)
{
        if (*name == '\0')
                return 0;
        if (strlen(name) >= IFNAMSIZ)
                return 0;
        if (!strcmp(name, ".") || !strcmp(name, ".."))
                return 0;

        while (*name) {
                if (*name == '/' || isspace(*name))
                        return 0;
                name++;
        }
        return 1;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *      __dev_alloc_name - allocate a name for a device
 *      @net: network namespace to allocate the device name in
 *      @name: name format string
 *      @buf:  scratch buffer and result name string
 *
 *      Passed a format string - eg "lt%d" - it will try and find a suitable
 *      id.  It scans list of devices to build up a free map, then chooses
 *      the first empty slot.  The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be exactly one "%d" and no other "%"
                 * characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                for_each_netdev(net, d) {
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /* avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        if (buf != name)
                snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}

/**
 *      dev_alloc_name - allocate a name for a device
 *      @dev: device
 *      @name: name format string
 *
 *      Passed a format string - eg "lt%d" - it will try and find a suitable
 *      id.  It scans list of devices to build up a free map, then chooses
 *      the first empty slot.  The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
        char buf[IFNAMSIZ];
        struct net *net;
        int ret;

        BUG_ON(!dev_net(dev));
        net = dev_net(dev);
        ret = __dev_alloc_name(net, name, buf);
        if (ret >= 0)
                strlcpy(dev->name, buf, IFNAMSIZ);
        return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
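
/*
 * Illustrative sketch: a driver asking for the next free unit of a name
 * pattern; on success dev->name holds e.g. "eth0", "eth1", ...
 *
 *      err = dev_alloc_name(dev, "eth%d");
 *      if (err < 0)
 *              return err;     (a negative errno, e.g. -ENFILE)
 */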

static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
{
        struct net *net;

        BUG_ON(!dev_net(dev));
        net = dev_net(dev);

        if (!dev_valid_name(name))
                return -EINVAL;

        if (fmt && strchr(name, '%'))
                return dev_alloc_name(dev, name);
        else if (__dev_get_by_name(net, name))
                return -EEXIST;
        else if (dev->name != name)
                strlcpy(dev->name, name, IFNAMSIZ);

        return 0;
}

/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string) must be at least IFNAMSIZ
 *
 *      Change name of a device; can pass format strings "eth%d"
 *      for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
                return 0;

        memcpy(oldname, dev->name, IFNAMSIZ);

        err = dev_get_valid_name(dev, newname, 1);
        if (err < 0)
                return err;

rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                return ret;
        }

        write_lock_bh(&dev_base_lock);
        hlist_del(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        err = ret;
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        goto rollback;
                } else {
                        printk(KERN_ERR
                               "%s: name change rollback failed: %d.\n",
                               dev->name, ret);
                }
        }

        return err;
}

/**
 *      dev_set_alias - change ifalias of a device
 *      @dev: device
 *      @alias: name up to IFALIASZ
 *      @len: limit of bytes to copy from info
 *
 *      Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
        ASSERT_RTNL();

        if (len >= IFALIASZ)
                return -EINVAL;

        if (!len) {
                if (dev->ifalias) {
                        kfree(dev->ifalias);
                        dev->ifalias = NULL;
                }
                return 0;
        }

        dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
        if (!dev->ifalias)
                return -ENOMEM;

        strlcpy(dev->ifalias, alias, len+1);
        return len;
}


/**
 *      netdev_features_change - device changes features
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *      netdev_state_change - device changes state
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed state. This function calls
 *      the notifier chains for netdev_chain and sends a NEWLINK message
 *      to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
        if (dev->flags & IFF_UP) {
                call_netdevice_notifiers(NETDEV_CHANGE, dev);
                rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
        }
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *      dev_load - load a network module
 *      @net: the applicable net namespace
 *      @name: name of interface
 *
 *      If a network interface is not present and the process has suitable
 *      privileges this function loads the module. If module loading is not
 *      available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);
        rcu_read_unlock();

        if (!dev && capable(CAP_NET_ADMIN))
                request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int ret;

        ASSERT_RTNL();

        /*
         *      Is it even present?
         */
        if (!netif_device_present(dev))
                return -ENODEV;

        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
        ret = notifier_to_errno(ret);
        if (ret)
                return ret;

        /*
         *      Call device private open method
         */
        set_bit(__LINK_STATE_START, &dev->state);

        if (ops->ndo_validate_addr)
                ret = ops->ndo_validate_addr(dev);

        if (!ret && ops->ndo_open)
                ret = ops->ndo_open(dev);

        /*
         *      If it went open OK then:
         */

        if (ret)
                clear_bit(__LINK_STATE_START, &dev->state);
        else {
                /*
                 *      Set the flags.
                 */
                dev->flags |= IFF_UP;

                /*
                 *      Enable NET_DMA
                 */
                net_dmaengine_get();

                /*
                 *      Initialize multicasting status
                 */
                dev_set_rx_mode(dev);

                /*
                 *      Wakeup transmit queue engine
                 */
                dev_activate(dev);
        }

        return ret;
}

/**
 *      dev_open - prepare an interface for use.
 *      @dev: device to open
 *
 *      Takes a device from down to up state. The device's private open
 *      function is invoked and then the multicast lists are loaded. Finally
 *      the device is moved into the up state and a %NETDEV_UP message is
 *      sent to the netdev notifier chain.
 *
 *      Calling this function on an active interface is a nop. On a failure
 *      a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
        int ret;

        /*
         *      Is it already up?
         */
        if (dev->flags & IFF_UP)
                return 0;

        /*
         *      Open device
         */
        ret = __dev_open(dev);
        if (ret < 0)
                return ret;

        /*
         *      ... and announce new interface.
         */
        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
        call_netdevice_notifiers(NETDEV_UP, dev);

        return ret;
}
EXPORT_SYMBOL(dev_open);
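
/*
 * Illustrative sketch: bringing an interface up from process context;
 * dev_open() must be called with the RTNL lock held (see ASSERT_RTNL()
 * in __dev_open() above).
 *
 *      rtnl_lock();
 *      err = dev_open(dev);
 *      rtnl_unlock();
 */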

static int __dev_close(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        ASSERT_RTNL();
        might_sleep();

        /*
         *      Tell people we are going down, so that they can
         *      prepare for death while the device is still operating.
         */
        call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

        clear_bit(__LINK_STATE_START, &dev->state);

        /* Synchronize to scheduled poll. We cannot touch poll list,
         * it can be even on different cpu. So just clear netif_running().
         *
         * dev->stop() will invoke napi_disable() on all of its
         * napi_struct instances on this device.
         */
        smp_mb__after_clear_bit(); /* Commit netif_running(). */

        dev_deactivate(dev);

        /*
         *      Call the device specific close. This cannot fail.
         *      Only if device is UP
         *
         *      We allow it to be called even after a DETACH hot-plug
         *      event.
         */
        if (ops->ndo_stop)
                ops->ndo_stop(dev);

        /*
         *      Device is now down.
         */

        dev->flags &= ~IFF_UP;

        /*
         *      Shutdown NET_DMA
         */
        net_dmaengine_put();

        return 0;
}

/**
 *      dev_close - shutdown an interface.
 *      @dev: device to shutdown
 *
 *      This function moves an active device into down state. A
 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *      chain.
 */
int dev_close(struct net_device *dev)
{
        if (!(dev->flags & IFF_UP))
                return 0;

        __dev_close(dev);

        /*
         *      Tell people we are down
         */
        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
        call_netdevice_notifiers(NETDEV_DOWN, dev);

        return 0;
}
EXPORT_SYMBOL(dev_close);


Ben Hutchings0187bdf2008-06-19 16:15:47 -07001309/**
1310 * dev_disable_lro - disable Large Receive Offload on a device
1311 * @dev: device
1312 *
1313 * Disable Large Receive Offload (LRO) on a net device. Must be
1314 * called under RTNL. This is needed if received packets may be
1315 * forwarded to another interface.
1316 */
1317void dev_disable_lro(struct net_device *dev)
1318{
1319 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1320 dev->ethtool_ops->set_flags) {
1321 u32 flags = dev->ethtool_ops->get_flags(dev);
1322 if (flags & ETH_FLAG_LRO) {
1323 flags &= ~ETH_FLAG_LRO;
1324 dev->ethtool_ops->set_flags(dev, flags);
1325 }
1326 }
1327 WARN_ON(dev->features & NETIF_F_LRO);
1328}
1329EXPORT_SYMBOL(dev_disable_lro);
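/*
 * Example (illustrative sketch): a configuration path that starts
 * forwarding packets from a device should make sure LRO is off
 * first, with the RTNL already held:
 *
 *	ASSERT_RTNL();
 *	dev_disable_lro(dev);
 */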
1330
1331
Eric W. Biederman881d9662007-09-17 11:56:21 -07001332static int dev_boot_phase = 1;
1333
Linus Torvalds1da177e2005-04-16 15:20:36 -07001334/*
1335 * Device change register/unregister. These are not inline or static
1336 * as we export them to the world.
1337 */
1338
1339/**
1340 * register_netdevice_notifier - register a network notifier block
1341 * @nb: notifier
1342 *
1343 * Register a notifier to be called when network device events occur.
1344 * The notifier passed is linked into the kernel structures and must
1345 * not be reused until it has been unregistered. A negative errno code
1346 * is returned on a failure.
1347 *
1348	 * When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001349	 * to the new notifier to give it a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001350	 * view of the network device list.
1351 */
1352
1353int register_netdevice_notifier(struct notifier_block *nb)
1354{
1355 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001356 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001357 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001358 int err;
1359
1360 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001361 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001362 if (err)
1363 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001364 if (dev_boot_phase)
1365 goto unlock;
1366 for_each_net(net) {
1367 for_each_netdev(net, dev) {
1368 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1369 err = notifier_to_errno(err);
1370 if (err)
1371 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001372
Eric W. Biederman881d9662007-09-17 11:56:21 -07001373 if (!(dev->flags & IFF_UP))
1374 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001375
Eric W. Biederman881d9662007-09-17 11:56:21 -07001376 nb->notifier_call(nb, NETDEV_UP, dev);
1377 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001378 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001379
1380unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381 rtnl_unlock();
1382 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001383
1384rollback:
1385 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001386 for_each_net(net) {
1387 for_each_netdev(net, dev) {
1388 if (dev == last)
1389 break;
Herbert Xufcc5a032007-07-30 17:03:38 -07001390
Eric W. Biederman881d9662007-09-17 11:56:21 -07001391 if (dev->flags & IFF_UP) {
1392 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1393 nb->notifier_call(nb, NETDEV_DOWN, dev);
1394 }
1395 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00001396 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001397 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001398 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001399
1400 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001401 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001402}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001403EXPORT_SYMBOL(register_netdevice_notifier);
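/*
 * Example (illustrative sketch; my_netdev_event and my_nb are
 * hypothetical names): a module watching devices come up. Because
 * registration and up events are replayed at registration time,
 * devices that already exist are not missed:
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	err = register_netdevice_notifier(&my_nb);
 */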
Linus Torvalds1da177e2005-04-16 15:20:36 -07001404
1405/**
1406 * unregister_netdevice_notifier - unregister a network notifier block
1407 * @nb: notifier
1408 *
1409 * Unregister a notifier previously registered by
1410	 * register_netdevice_notifier(). The notifier is unlinked from the
1411 * kernel structures and may then be reused. A negative errno code
1412 * is returned on a failure.
1413 */
1414
1415int unregister_netdevice_notifier(struct notifier_block *nb)
1416{
Herbert Xu9f514952006-03-25 01:24:25 -08001417 int err;
1418
1419 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001420 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xu9f514952006-03-25 01:24:25 -08001421 rtnl_unlock();
1422 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001423}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001424EXPORT_SYMBOL(unregister_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001425
1426/**
1427 * call_netdevice_notifiers - call all network notifier blocks
1428 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001429 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001430 *
1431 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001432 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001433 */
1434
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001435int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001436{
Jiri Pirkoab930472010-04-20 01:45:37 -07001437 ASSERT_RTNL();
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001438 return raw_notifier_call_chain(&netdev_chain, val, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001439}
1440
1441/* When > 0 there are consumers of rx skb time stamps */
1442static atomic_t netstamp_needed = ATOMIC_INIT(0);
1443
1444void net_enable_timestamp(void)
1445{
1446 atomic_inc(&netstamp_needed);
1447}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001448EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001449
1450void net_disable_timestamp(void)
1451{
1452 atomic_dec(&netstamp_needed);
1453}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001454EXPORT_SYMBOL(net_disable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001455
Eric Dumazet3b098e22010-05-15 23:57:10 -07001456static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001457{
1458 if (atomic_read(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001459 __net_timestamp(skb);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001460 else
1461 skb->tstamp.tv64 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001462}
1463
Eric Dumazet3b098e22010-05-15 23:57:10 -07001464static inline void net_timestamp_check(struct sk_buff *skb)
1465{
1466 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1467 __net_timestamp(skb);
1468}
1469
Arnd Bergmann44540962009-11-26 06:07:08 +00001470/**
1471 * dev_forward_skb - loopback an skb to another netif
1472 *
1473 * @dev: destination network device
1474 * @skb: buffer to forward
1475 *
1476 * return values:
1477 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001478 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001479 *
1480 * dev_forward_skb can be used for injecting an skb from the
1481 * start_xmit function of one device into the receive queue
1482 * of another device.
1483 *
1484 * The receiving device may be in another namespace, so
1485 * we have to clear all information in the skb that could
1486 * impact namespace isolation.
1487 */
1488int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1489{
1490 skb_orphan(skb);
1491
Eric Dumazet6ec82562010-05-06 00:53:53 -07001492 if (!(dev->flags & IFF_UP) ||
1493 (skb->len > (dev->mtu + dev->hard_header_len))) {
1494 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001495 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001496 }
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001497 skb_set_dev(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001498 skb->tstamp.tv64 = 0;
1499 skb->pkt_type = PACKET_HOST;
1500 skb->protocol = eth_type_trans(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001501 return netif_rx(skb);
1502}
1503EXPORT_SYMBOL_GPL(dev_forward_skb);
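/*
 * Example (illustrative sketch; pair_priv and its peer field are
 * hypothetical): the start_xmit routine of a veth-like pair device
 * handing every frame to its peer's receive path:
 *
 *	static netdev_tx_t pair_xmit(struct sk_buff *skb,
 *				     struct net_device *dev)
 *	{
 *		struct pair_priv *priv = netdev_priv(dev);
 *
 *		dev_forward_skb(priv->peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 */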
1504
Linus Torvalds1da177e2005-04-16 15:20:36 -07001505/*
1506 * Support routine. Sends outgoing frames to any network
1507 * taps currently in use.
1508 */
1509
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001510static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511{
1512 struct packet_type *ptype;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001513
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001514#ifdef CONFIG_NET_CLS_ACT
1515 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
Eric Dumazet3b098e22010-05-15 23:57:10 -07001516 net_timestamp_set(skb);
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001517#else
Eric Dumazet3b098e22010-05-15 23:57:10 -07001518 net_timestamp_set(skb);
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001519#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001520
1521 rcu_read_lock();
1522 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1523 /* Never send packets back to the socket
1524 * they originated from - MvS (miquels@drinkel.ow.org)
1525 */
1526 if ((ptype->dev == dev || !ptype->dev) &&
1527 (ptype->af_packet_priv == NULL ||
1528 (struct sock *)ptype->af_packet_priv != skb->sk)) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001529 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001530 if (!skb2)
1531 break;
1532
1533			/* The network header should be correctly
1534			   set by the sender, so that the check below is
1535			   just protection against buggy protocols.
1536 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001537 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001538
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001539 if (skb_network_header(skb2) < skb2->data ||
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001540 skb2->network_header > skb2->tail) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541 if (net_ratelimit())
1542 printk(KERN_CRIT "protocol %04x is "
1543 "buggy, dev %s\n",
1544					       ntohs(skb2->protocol), dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001545 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001546 }
1547
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001548 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549 skb2->pkt_type = PACKET_OUTGOING;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07001550 ptype->func(skb2, skb->dev, ptype, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001551 }
1552 }
1553 rcu_read_unlock();
1554}
1555
Denis Vlasenko56079432006-03-29 15:57:29 -08001556
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001557static inline void __netif_reschedule(struct Qdisc *q)
1558{
1559 struct softnet_data *sd;
1560 unsigned long flags;
1561
1562 local_irq_save(flags);
1563 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00001564 q->next_sched = NULL;
1565 *sd->output_queue_tailp = q;
1566 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001567 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1568 local_irq_restore(flags);
1569}
1570
David S. Miller37437bb2008-07-16 02:15:04 -07001571void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08001572{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001573 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1574 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08001575}
1576EXPORT_SYMBOL(__netif_schedule);
1577
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001578void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08001579{
Eric Dumazet15e83ed2010-05-19 23:16:03 +00001580 if (!skb->destructor)
1581 dev_kfree_skb(skb);
1582 else if (atomic_dec_and_test(&skb->users)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001583 struct softnet_data *sd;
1584 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08001585
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001586 local_irq_save(flags);
1587 sd = &__get_cpu_var(softnet_data);
1588 skb->next = sd->completion_queue;
1589 sd->completion_queue = skb;
1590 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1591 local_irq_restore(flags);
1592 }
Denis Vlasenko56079432006-03-29 15:57:29 -08001593}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001594EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08001595
1596void dev_kfree_skb_any(struct sk_buff *skb)
1597{
1598 if (in_irq() || irqs_disabled())
1599 dev_kfree_skb_irq(skb);
1600 else
1601 dev_kfree_skb(skb);
1602}
1603EXPORT_SYMBOL(dev_kfree_skb_any);
1604
1605
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001606/**
1607 * netif_device_detach - mark device as removed
1608 * @dev: network device
1609 *
1610 * Mark device as removed from system and therefore no longer available.
1611 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001612void netif_device_detach(struct net_device *dev)
1613{
1614 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1615 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001616 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001617 }
1618}
1619EXPORT_SYMBOL(netif_device_detach);
1620
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001621/**
1622 * netif_device_attach - mark device as attached
1623 * @dev: network device
1624 *
1625 * Mark device as attached to the system and restart it if needed.
1626 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001627void netif_device_attach(struct net_device *dev)
1628{
1629 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1630 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001631 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001632 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001633 }
1634}
1635EXPORT_SYMBOL(netif_device_attach);
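/*
 * Example (illustrative sketch; foo_* names are hypothetical): a PCI
 * driver typically brackets its power transitions with these calls:
 *
 *	static int foo_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		... power the hardware down ...
 *		return 0;
 *	}
 *
 *	static int foo_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		... power the hardware back up ...
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */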
1636
Ben Hutchings6de329e2008-06-16 17:02:28 -07001637static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1638{
1639 return ((features & NETIF_F_GEN_CSUM) ||
1640 ((features & NETIF_F_IP_CSUM) &&
1641 protocol == htons(ETH_P_IP)) ||
1642 ((features & NETIF_F_IPV6_CSUM) &&
Yi Zou1c8dbcf2009-02-27 14:06:54 -08001643 protocol == htons(ETH_P_IPV6)) ||
1644 ((features & NETIF_F_FCOE_CRC) &&
1645 protocol == htons(ETH_P_FCOE)));
Ben Hutchings6de329e2008-06-16 17:02:28 -07001646}
1647
1648static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1649{
1650 if (can_checksum_protocol(dev->features, skb->protocol))
1651 return true;
1652
1653 if (skb->protocol == htons(ETH_P_8021Q)) {
1654 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1655 if (can_checksum_protocol(dev->features & dev->vlan_features,
1656 veh->h_vlan_encapsulated_proto))
1657 return true;
1658 }
1659
1660 return false;
1661}
Denis Vlasenko56079432006-03-29 15:57:29 -08001662
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001663/**
1664 * skb_set_dev - assign a new device to a buffer
1665 * @skb: buffer for the new device
1666 * @dev: network device
1667 *
1668 * If an skb is owned by a device already, we have to reset
1669 * all data private to the namespace a device belongs to
1670 * before assigning it a new device.
1671 */
1672#ifdef CONFIG_NET_NS
1673void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1674{
1675 skb_dst_drop(skb);
1676 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1677 secpath_reset(skb);
1678 nf_reset(skb);
1679 skb_init_secmark(skb);
1680 skb->mark = 0;
1681 skb->priority = 0;
1682 skb->nf_trace = 0;
1683 skb->ipvs_property = 0;
1684#ifdef CONFIG_NET_SCHED
1685 skb->tc_index = 0;
1686#endif
1687 }
1688 skb->dev = dev;
1689}
1690EXPORT_SYMBOL(skb_set_dev);
1691#endif /* CONFIG_NET_NS */
1692
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693/*
1694 * Invalidate hardware checksum when packet is to be mangled, and
1695 * complete checksum manually on outgoing path.
1696 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07001697int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698{
Al Virod3bc23e2006-11-14 21:24:49 -08001699 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07001700 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001701
Patrick McHardy84fa7932006-08-29 16:44:56 -07001702 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07001703 goto out_set_summed;
1704
1705 if (unlikely(skb_shinfo(skb)->gso_size)) {
Herbert Xua430a432006-07-08 13:34:56 -07001706 /* Let GSO fix up the checksum. */
1707 goto out_set_summed;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708 }
1709
Herbert Xua0308472007-10-15 01:47:15 -07001710 offset = skb->csum_start - skb_headroom(skb);
1711 BUG_ON(offset >= skb_headlen(skb));
1712 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1713
1714 offset += skb->csum_offset;
1715 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1716
1717 if (skb_cloned(skb) &&
1718 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001719 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1720 if (ret)
1721 goto out;
1722 }
1723
Herbert Xua0308472007-10-15 01:47:15 -07001724 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07001725out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001726 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001727out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001728 return ret;
1729}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001730EXPORT_SYMBOL(skb_checksum_help);
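/*
 * Example (illustrative sketch; hw_can_csum() stands in for a
 * device-specific capability test): a transmit path falling back to
 * software checksumming when the hardware cannot offload the
 * protocol at hand:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 */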
Linus Torvalds1da177e2005-04-16 15:20:36 -07001731
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001732/**
1733 * skb_gso_segment - Perform segmentation on skb.
1734 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07001735 * @features: features for the output path (see dev->features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001736 *
1737 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07001738 *
1739 * It may return NULL if the skb requires no segmentation. This is
1740 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001741 */
Herbert Xu576a30e2006-06-27 13:22:38 -07001742struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001743{
1744 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1745 struct packet_type *ptype;
Al Viro252e3342006-11-14 20:48:11 -08001746 __be16 type = skb->protocol;
Herbert Xua430a432006-07-08 13:34:56 -07001747 int err;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001748
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001749 skb_reset_mac_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001750 skb->mac_len = skb->network_header - skb->mac_header;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001751 __skb_pull(skb, skb->mac_len);
1752
Herbert Xu67fd1a72009-01-19 16:26:44 -08001753 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1754 struct net_device *dev = skb->dev;
1755 struct ethtool_drvinfo info = {};
1756
1757 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1758 dev->ethtool_ops->get_drvinfo(dev, &info);
1759
1760 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1761 "ip_summed=%d",
1762 info.driver, dev ? dev->features : 0L,
1763 skb->sk ? skb->sk->sk_route_caps : 0L,
1764 skb->len, skb->data_len, skb->ip_summed);
1765
Herbert Xua430a432006-07-08 13:34:56 -07001766 if (skb_header_cloned(skb) &&
1767 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1768 return ERR_PTR(err);
1769 }
1770
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001771 rcu_read_lock();
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08001772 list_for_each_entry_rcu(ptype,
1773 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001774 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
Patrick McHardy84fa7932006-08-29 16:44:56 -07001775 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
Herbert Xua430a432006-07-08 13:34:56 -07001776 err = ptype->gso_send_check(skb);
1777 segs = ERR_PTR(err);
1778 if (err || skb_gso_ok(skb, features))
1779 break;
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001780 __skb_push(skb, (skb->data -
1781 skb_network_header(skb)));
Herbert Xua430a432006-07-08 13:34:56 -07001782 }
Herbert Xu576a30e2006-06-27 13:22:38 -07001783 segs = ptype->gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001784 break;
1785 }
1786 }
1787 rcu_read_unlock();
1788
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001789 __skb_push(skb, skb->data - skb_mac_header(skb));
Herbert Xu576a30e2006-06-27 13:22:38 -07001790
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001791 return segs;
1792}
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001793EXPORT_SYMBOL(skb_gso_segment);
1794
Herbert Xufb286bb2005-11-10 13:01:24 -08001795/* Take action when hardware reception checksum errors are detected. */
1796#ifdef CONFIG_BUG
1797void netdev_rx_csum_fault(struct net_device *dev)
1798{
1799 if (net_ratelimit()) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001800 printk(KERN_ERR "%s: hw csum failure.\n",
Stephen Hemminger246a4212005-12-08 15:21:39 -08001801 dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08001802 dump_stack();
1803 }
1804}
1805EXPORT_SYMBOL(netdev_rx_csum_fault);
1806#endif
1807
Linus Torvalds1da177e2005-04-16 15:20:36 -07001808/* Actually, we should eliminate this check as soon as we know that:
1809 * 1. An IOMMU is present and allows mapping all of memory.
1810 * 2. No high memory really exists on this machine.
1811 */
1812
Eric Dumazet9092c652010-04-02 13:34:49 -07001813static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001814{
Herbert Xu3d3a8532006-06-27 13:33:10 -07001815#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07001816 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001817 if (!(dev->features & NETIF_F_HIGHDMA)) {
1818 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1819 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1820 return 1;
1821 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001823 if (PCI_DMA_BUS_IS_PHYS) {
1824 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001825
Eric Dumazet9092c652010-04-02 13:34:49 -07001826 if (!pdev)
1827 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001828 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1829 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1830 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1831 return 1;
1832 }
1833 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07001834#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001835 return 0;
1836}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001837
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001838struct dev_gso_cb {
1839 void (*destructor)(struct sk_buff *skb);
1840};
1841
1842#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1843
1844static void dev_gso_skb_destructor(struct sk_buff *skb)
1845{
1846 struct dev_gso_cb *cb;
1847
1848 do {
1849 struct sk_buff *nskb = skb->next;
1850
1851 skb->next = nskb->next;
1852 nskb->next = NULL;
1853 kfree_skb(nskb);
1854 } while (skb->next);
1855
1856 cb = DEV_GSO_CB(skb);
1857 if (cb->destructor)
1858 cb->destructor(skb);
1859}
1860
1861/**
1862 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1863 * @skb: buffer to segment
1864 *
1865 * This function segments the given skb and stores the list of segments
1866 * in skb->next.
1867 */
1868static int dev_gso_segment(struct sk_buff *skb)
1869{
1870 struct net_device *dev = skb->dev;
1871 struct sk_buff *segs;
Herbert Xu576a30e2006-06-27 13:22:38 -07001872 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1873 NETIF_F_SG : 0);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001874
Herbert Xu576a30e2006-06-27 13:22:38 -07001875 segs = skb_gso_segment(skb, features);
1876
1877 /* Verifying header integrity only. */
1878 if (!segs)
1879 return 0;
1880
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07001881 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001882 return PTR_ERR(segs);
1883
1884 skb->next = segs;
1885 DEV_GSO_CB(skb)->destructor = skb->destructor;
1886 skb->destructor = dev_gso_skb_destructor;
1887
1888 return 0;
1889}
1890
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001891/*
1892 * Try to orphan skb early, right before transmission by the device.
1893 * We cannot orphan skb if tx timestamp is requested, since
1894 * drivers need to call skb_tstamp_tx() to send the timestamp.
1895 */
1896static inline void skb_orphan_try(struct sk_buff *skb)
1897{
1898 if (!skb_tx(skb)->flags)
1899 skb_orphan(skb);
1900}
1901
David S. Millerfd2ea0a2008-07-17 01:56:23 -07001902int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1903 struct netdev_queue *txq)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001904{
Stephen Hemminger00829822008-11-20 20:14:53 -08001905 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00001906 int rc = NETDEV_TX_OK;
Stephen Hemminger00829822008-11-20 20:14:53 -08001907
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001908 if (likely(!skb->next)) {
Stephen Hemminger9be9a6b2007-04-20 17:02:45 -07001909 if (!list_empty(&ptype_all))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001910 dev_queue_xmit_nit(skb, dev);
1911
Eric Dumazet93f154b2009-05-18 22:19:19 -07001912 /*
1913		 * If the device doesn't need skb->dst, release it right now while
1914		 * it's hot in this CPU's cache.
1915 */
Eric Dumazetadf30902009-06-02 05:19:30 +00001916 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1917 skb_dst_drop(skb);
1918
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001919 skb_orphan_try(skb);
David S. Miller9ccb8972010-04-22 01:02:07 -07001920
1921 if (netif_needs_gso(dev, skb)) {
1922 if (unlikely(dev_gso_segment(skb)))
1923 goto out_kfree_skb;
1924 if (skb->next)
1925 goto gso;
1926 }
1927
Patrick Ohlyac45f602009-02-12 05:03:37 +00001928 rc = ops->ndo_start_xmit(skb, dev);
Patrick McHardyec634fe2009-07-05 19:23:38 -07001929 if (rc == NETDEV_TX_OK)
Eric Dumazet08baf562009-05-25 22:58:01 -07001930 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00001931 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001932 }
1933
Herbert Xu576a30e2006-06-27 13:22:38 -07001934gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001935 do {
1936 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001937
1938 skb->next = nskb->next;
1939 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00001940
1941 /*
1942		 * If the device doesn't need nskb->dst, release it right now while
1943		 * it's hot in this CPU's cache.
1944 */
1945 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1946 skb_dst_drop(nskb);
1947
Stephen Hemminger00829822008-11-20 20:14:53 -08001948 rc = ops->ndo_start_xmit(nskb, dev);
Patrick McHardyec634fe2009-07-05 19:23:38 -07001949 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00001950 if (rc & ~NETDEV_TX_MASK)
1951 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07001952 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001953 skb->next = nskb;
1954 return rc;
1955 }
Eric Dumazet08baf562009-05-25 22:58:01 -07001956 txq_trans_update(txq);
David S. Millerfd2ea0a2008-07-17 01:56:23 -07001957 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07001958 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001959 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001960
Patrick McHardy572a9d72009-11-10 06:14:14 +00001961out_kfree_gso_skb:
1962 if (likely(skb->next == NULL))
1963 skb->destructor = DEV_GSO_CB(skb)->destructor;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001964out_kfree_skb:
1965 kfree_skb(skb);
Patrick McHardy572a9d72009-11-10 06:14:14 +00001966 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001967}
1968
Tom Herbert0a9627f2010-03-16 08:03:29 +00001969static u32 hashrnd __read_mostly;
David S. Millerb6b2fed2008-07-21 09:48:06 -07001970
Stephen Hemminger92477442009-03-21 13:39:26 -07001971u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
David S. Miller8f0f2222008-07-15 03:47:03 -07001972{
David S. Miller70192982009-01-27 16:34:47 -08001973 u32 hash;
David S. Millerb6b2fed2008-07-21 09:48:06 -07001974
David S. Miller513de112009-05-03 14:43:10 -07001975 if (skb_rx_queue_recorded(skb)) {
1976 hash = skb_get_rx_queue(skb);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001977 while (unlikely(hash >= dev->real_num_tx_queues))
David S. Miller513de112009-05-03 14:43:10 -07001978 hash -= dev->real_num_tx_queues;
1979 return hash;
1980 }
Eric Dumazetec581f62009-05-01 09:05:06 -07001981
1982 if (skb->sk && skb->sk->sk_hash)
David S. Miller70192982009-01-27 16:34:47 -08001983 hash = skb->sk->sk_hash;
Eric Dumazetec581f62009-05-01 09:05:06 -07001984 else
Eric Dumazetb249dcb2010-04-19 21:56:38 +00001985 hash = (__force u16) skb->protocol;
David S. Millerd5a9e242009-01-27 16:22:11 -08001986
Tom Herbert0a9627f2010-03-16 08:03:29 +00001987 hash = jhash_1word(hash, hashrnd);
David S. Millerd5a9e242009-01-27 16:22:11 -08001988
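	/* Map the 32-bit hash uniformly onto [0, real_num_tx_queues)
	 * without a modulus: (hash * n) >> 32 is a fixed-point
	 * multiplication of hash/2^32 by n.
	 */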
David S. Millerb6b2fed2008-07-21 09:48:06 -07001989 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
David S. Miller8f0f2222008-07-15 03:47:03 -07001990}
Stephen Hemminger92477442009-03-21 13:39:26 -07001991EXPORT_SYMBOL(skb_tx_hash);
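/*
 * Example (illustrative sketch; foo_* names are hypothetical): a
 * multiqueue driver with no policy of its own can use skb_tx_hash()
 * in its queue selector:
 *
 *	static u16 foo_select_queue(struct net_device *dev,
 *				    struct sk_buff *skb)
 *	{
 *		return skb_tx_hash(dev, skb);
 *	}
 */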
David S. Miller8f0f2222008-07-15 03:47:03 -07001992
Eric Dumazeted046422009-11-13 21:54:04 +00001993static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1994{
1995 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1996 if (net_ratelimit()) {
Eric Dumazet7a161ea2010-04-08 21:26:13 +00001997 pr_warning("%s selects TX queue %d, but "
1998 "real number of TX queues is %d\n",
1999 dev->name, queue_index, dev->real_num_tx_queues);
Eric Dumazeted046422009-11-13 21:54:04 +00002000 }
2001 return 0;
2002 }
2003 return queue_index;
2004}
2005
David S. Millere8a04642008-07-17 00:34:19 -07002006static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2007 struct sk_buff *skb)
2008{
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002009 u16 queue_index;
2010 struct sock *sk = skb->sk;
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002011
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002012 if (sk_tx_queue_recorded(sk)) {
2013 queue_index = sk_tx_queue_get(sk);
2014 } else {
2015 const struct net_device_ops *ops = dev->netdev_ops;
2016
2017 if (ops->ndo_select_queue) {
2018 queue_index = ops->ndo_select_queue(dev, skb);
Eric Dumazeted046422009-11-13 21:54:04 +00002019 queue_index = dev_cap_txqueue(dev, queue_index);
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002020 } else {
2021 queue_index = 0;
2022 if (dev->real_num_tx_queues > 1)
2023 queue_index = skb_tx_hash(dev, skb);
2024
Eric Dumazet8728c542010-04-11 21:18:17 +00002025 if (sk) {
David S. Miller87eb3672010-04-21 01:14:25 -07002026 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
Eric Dumazet8728c542010-04-11 21:18:17 +00002027
2028 if (dst && skb_dst(skb) == dst)
2029 sk_tx_queue_set(sk, queue_index);
2030 }
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002031 }
2032 }
David S. Millereae792b2008-07-15 03:03:33 -07002033
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002034 skb_set_queue_mapping(skb, queue_index);
2035 return netdev_get_tx_queue(dev, queue_index);
David S. Millere8a04642008-07-17 00:34:19 -07002036}
2037
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002038static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2039 struct net_device *dev,
2040 struct netdev_queue *txq)
2041{
2042 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002043 bool contended = qdisc_is_running(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002044 int rc;
2045
Eric Dumazet79640a42010-06-02 05:09:29 -07002046 /*
2047 * Heuristic to force contended enqueues to serialize on a
2048 * separate lock before trying to get qdisc main lock.
2049	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2050 * and dequeue packets faster.
2051 */
2052 if (unlikely(contended))
2053 spin_lock(&q->busylock);
2054
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002055 spin_lock(root_lock);
2056 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2057 kfree_skb(skb);
2058 rc = NET_XMIT_DROP;
2059 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07002060 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002061 /*
2062 * This is a work-conserving queue; there are no old skbs
2063 * waiting to be sent out; and the qdisc is not running -
2064 * xmit the skb directly.
2065 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002066 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2067 skb_dst_force(skb);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002068 __qdisc_update_bstats(q, skb->len);
Eric Dumazet79640a42010-06-02 05:09:29 -07002069 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2070 if (unlikely(contended)) {
2071 spin_unlock(&q->busylock);
2072 contended = false;
2073 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002074 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002075 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07002076 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002077
2078 rc = NET_XMIT_SUCCESS;
2079 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002080 skb_dst_force(skb);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002081 rc = qdisc_enqueue_root(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002082 if (qdisc_run_begin(q)) {
2083 if (unlikely(contended)) {
2084 spin_unlock(&q->busylock);
2085 contended = false;
2086 }
2087 __qdisc_run(q);
2088 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002089 }
2090 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07002091 if (unlikely(contended))
2092 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002093 return rc;
2094}
2095
Krishna Kumar4b258462010-01-21 01:26:29 -08002096/*
2097 * Returns true if either:
2098 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2099 * 2. skb is fragmented and the device does not support SG, or if
2100 *	at least one of the fragments is in highmem and the device does not
2101 * support DMA from it.
2102 */
2103static inline int skb_needs_linearize(struct sk_buff *skb,
2104 struct net_device *dev)
2105{
2106 return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2107 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2108 illegal_highdma(dev, skb)));
2109}
2110
Dave Jonesd29f7492008-07-22 14:09:06 -07002111/**
2112 * dev_queue_xmit - transmit a buffer
2113 * @skb: buffer to transmit
2114 *
2115 * Queue a buffer for transmission to a network device. The caller must
2116 * have set the device and priority and built the buffer before calling
2117 * this function. The function can be called from an interrupt.
2118 *
2119 * A negative errno code is returned on a failure. A success does not
2120 * guarantee the frame will be transmitted as it may be dropped due
2121 * to congestion or traffic shaping.
2122 *
2123 * -----------------------------------------------------------------------------------
2124 * I notice this method can also return errors from the queue disciplines,
2125 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2126 * be positive.
2127 *
2128 * Regardless of the return value, the skb is consumed, so it is currently
2129 * difficult to retry a send to this method. (You can bump the ref count
2130 * before sending to hold a reference for retry if you are careful.)
2131 *
2132 * When calling this method, interrupts MUST be enabled. This is because
2133 * the BH enable code must have IRQs enabled so that it will not deadlock.
2134 * --BLG
2135 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002136int dev_queue_xmit(struct sk_buff *skb)
2137{
2138 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002139 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002140 struct Qdisc *q;
2141 int rc = -ENOMEM;
2142
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002143 /* GSO will handle the following emulations directly. */
2144 if (netif_needs_gso(dev, skb))
2145 goto gso;
2146
Krishna Kumar4b258462010-01-21 01:26:29 -08002147 /* Convert a paged skb to linear, if required */
2148 if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002149 goto out_kfree_skb;
2150
2151 /* If packet is not checksummed and device does not support
2152 * checksumming for this protocol, complete checksumming here.
2153 */
Herbert Xu663ead32007-04-09 11:59:07 -07002154 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2155 skb_set_transport_header(skb, skb->csum_start -
2156 skb_headroom(skb));
Ben Hutchings6de329e2008-06-16 17:02:28 -07002157 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2158 goto out_kfree_skb;
Herbert Xu663ead32007-04-09 11:59:07 -07002159 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002160
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002161gso:
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002162 /* Disable soft irqs for various locks below. Also
2163 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002164 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002165 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002166
David S. Millereae792b2008-07-15 03:03:33 -07002167 txq = dev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002168 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002169
Linus Torvalds1da177e2005-04-16 15:20:36 -07002170#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002171 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002172#endif
2173 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002174 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002175 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002176 }
2177
2178 /* The device has no queue. Common case for software devices:
2179	   loopback and all sorts of tunnels...
2180
Herbert Xu932ff272006-06-09 12:20:56 -07002181 Really, it is unlikely that netif_tx_lock protection is necessary
2182	   here. (e.g. loopback and IP tunnels are clean, ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002183	   counters.)
2184	   However, it is possible that they rely on the protection
2185	   we provide here.
2186
2187	   Check this and take the lock. It is not prone to deadlocks.
2188	   Either way, the noqueue qdisc case is even simpler 8)
2189 */
2190 if (dev->flags & IFF_UP) {
2191 int cpu = smp_processor_id(); /* ok because BHs are off */
2192
David S. Millerc773e842008-07-08 23:13:53 -07002193 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002194
David S. Millerc773e842008-07-08 23:13:53 -07002195 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002196
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002197 if (!netif_tx_queue_stopped(txq)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002198 rc = dev_hard_start_xmit(skb, dev, txq);
2199 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002200 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002201 goto out;
2202 }
2203 }
David S. Millerc773e842008-07-08 23:13:53 -07002204 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205 if (net_ratelimit())
2206 printk(KERN_CRIT "Virtual device %s asks to "
2207 "queue packet!\n", dev->name);
2208 } else {
2209 /* Recursion is detected! It is possible,
2210 * unfortunately */
2211 if (net_ratelimit())
2212 printk(KERN_CRIT "Dead loop on virtual device "
2213 "%s, fix it urgently!\n", dev->name);
2214 }
2215 }
2216
2217 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002218 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002219
2220out_kfree_skb:
2221 kfree_skb(skb);
2222 return rc;
2223out:
Herbert Xud4828d82006-06-22 02:28:18 -07002224 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002225 return rc;
2226}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002227EXPORT_SYMBOL(dev_queue_xmit);
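/*
 * Example (illustrative sketch; dest is a hypothetical destination
 * address): a protocol sending a fully built frame. The skb is
 * consumed whatever the return value, so it must not be freed or
 * retried afterwards:
 *
 *	skb->dev = dev;
 *	if (dev_hard_header(skb, dev, ETH_P_IP, dest, NULL, skb->len) < 0) {
 *		kfree_skb(skb);
 *		return;
 *	}
 *	dev_queue_xmit(skb);
 */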
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228
2229
2230/*=======================================================================
2231 Receiver routines
2232 =======================================================================*/
2233
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002234int netdev_max_backlog __read_mostly = 1000;
Eric Dumazet3b098e22010-05-15 23:57:10 -07002235int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002236int netdev_budget __read_mostly = 300;
2237int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002238
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002239/* Called with irq disabled */
2240static inline void ____napi_schedule(struct softnet_data *sd,
2241 struct napi_struct *napi)
2242{
2243 list_add_tail(&napi->poll_list, &sd->poll_list);
2244 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2245}
2246
Eric Dumazetdf334542010-03-24 19:13:54 +00002247#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002248
2249/* One global table that all flow-based protocols share. */
Eric Dumazet8770acf2010-04-17 00:54:36 -07002250struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002251EXPORT_SYMBOL(rps_sock_flow_table);
2252
Tom Herbert0a9627f2010-03-16 08:03:29 +00002253/*
2254 * get_rps_cpu is called from netif_receive_skb and returns the target
2255 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002256 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002257 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002258static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2259 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002260{
2261 struct ipv6hdr *ip6;
2262 struct iphdr *ip;
2263 struct netdev_rx_queue *rxqueue;
2264 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002265 struct rps_dev_flow_table *flow_table;
2266 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002267 int cpu = -1;
2268 u8 ip_proto;
Tom Herbertfec5e652010-04-16 16:01:27 -07002269 u16 tcpu;
Changli Gao8c52d502010-04-24 22:50:10 -07002270 u32 addr1, addr2, ihl;
2271 union {
2272 u32 v32;
2273 u16 v16[2];
2274 } ports;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002275
Tom Herbert0a9627f2010-03-16 08:03:29 +00002276 if (skb_rx_queue_recorded(skb)) {
2277 u16 index = skb_get_rx_queue(skb);
2278 if (unlikely(index >= dev->num_rx_queues)) {
2279 if (net_ratelimit()) {
Eric Dumazet7a161ea2010-04-08 21:26:13 +00002280 pr_warning("%s received packet on queue "
2281 "%u, but number of RX queues is %u\n",
2282 dev->name, index, dev->num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002283 }
2284 goto done;
2285 }
2286 rxqueue = dev->_rx + index;
2287 } else
2288 rxqueue = dev->_rx;
2289
Tom Herbertfec5e652010-04-16 16:01:27 -07002290 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002291 goto done;
2292
2293 if (skb->rxhash)
2294 goto got_hash; /* Skip hash computation on packet header */
2295
2296 switch (skb->protocol) {
2297 case __constant_htons(ETH_P_IP):
2298 if (!pskb_may_pull(skb, sizeof(*ip)))
2299 goto done;
2300
2301 ip = (struct iphdr *) skb->data;
2302 ip_proto = ip->protocol;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002303 addr1 = (__force u32) ip->saddr;
2304 addr2 = (__force u32) ip->daddr;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002305 ihl = ip->ihl;
2306 break;
2307 case __constant_htons(ETH_P_IPV6):
2308 if (!pskb_may_pull(skb, sizeof(*ip6)))
2309 goto done;
2310
2311 ip6 = (struct ipv6hdr *) skb->data;
2312 ip_proto = ip6->nexthdr;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002313 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2314 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
Tom Herbert0a9627f2010-03-16 08:03:29 +00002315 ihl = (40 >> 2);
2316 break;
2317 default:
2318 goto done;
2319 }
Tom Herbert0a9627f2010-03-16 08:03:29 +00002320 switch (ip_proto) {
2321 case IPPROTO_TCP:
2322 case IPPROTO_UDP:
2323 case IPPROTO_DCCP:
2324 case IPPROTO_ESP:
2325 case IPPROTO_AH:
2326 case IPPROTO_SCTP:
2327 case IPPROTO_UDPLITE:
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002328 if (pskb_may_pull(skb, (ihl * 4) + 4)) {
Changli Gao8c52d502010-04-24 22:50:10 -07002329 ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2330 if (ports.v16[1] < ports.v16[0])
2331 swap(ports.v16[0], ports.v16[1]);
2332 break;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002333 }
Tom Herbert0a9627f2010-03-16 08:03:29 +00002334 default:
Changli Gao8c52d502010-04-24 22:50:10 -07002335 ports.v32 = 0;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002336 break;
2337 }
2338
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002339 /* get a consistent hash (same value on both flow directions) */
2340 if (addr2 < addr1)
2341 swap(addr1, addr2);
Changli Gao8c52d502010-04-24 22:50:10 -07002342 skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002343 if (!skb->rxhash)
2344 skb->rxhash = 1;
2345
2346got_hash:
Tom Herbertfec5e652010-04-16 16:01:27 -07002347 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2348 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2349 if (flow_table && sock_flow_table) {
2350 u16 next_cpu;
2351 struct rps_dev_flow *rflow;
2352
2353 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2354 tcpu = rflow->cpu;
2355
2356 next_cpu = sock_flow_table->ents[skb->rxhash &
2357 sock_flow_table->mask];
2358
2359 /*
2360 * If the desired CPU (where last recvmsg was done) is
2361 * different from current CPU (one in the rx-queue flow
2362 * table entry), switch if one of the following holds:
2363 * - Current CPU is unset (equal to RPS_NO_CPU).
2364 * - Current CPU is offline.
2365 * - The current CPU's queue tail has advanced beyond the
2366 * last packet that was enqueued using this table entry.
2367 * This guarantees that all previous packets for the flow
2368 * have been dequeued, thus preserving in order delivery.
2369 */
2370 if (unlikely(tcpu != next_cpu) &&
2371 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2372 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2373 rflow->last_qtail)) >= 0)) {
2374 tcpu = rflow->cpu = next_cpu;
2375 if (tcpu != RPS_NO_CPU)
2376 rflow->last_qtail = per_cpu(softnet_data,
2377 tcpu).input_queue_head;
2378 }
2379 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2380 *rflowp = rflow;
2381 cpu = tcpu;
2382 goto done;
2383 }
2384 }
2385
Tom Herbert0a9627f2010-03-16 08:03:29 +00002386 map = rcu_dereference(rxqueue->rps_map);
2387 if (map) {
Tom Herbertfec5e652010-04-16 16:01:27 -07002388 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00002389
2390 if (cpu_online(tcpu)) {
2391 cpu = tcpu;
2392 goto done;
2393 }
2394 }
2395
2396done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00002397 return cpu;
2398}
2399
Tom Herbert0a9627f2010-03-16 08:03:29 +00002400/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002401static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002402{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002403 struct softnet_data *sd = data;
2404
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002405 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00002406 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002407}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002408
Tom Herbertfec5e652010-04-16 16:01:27 -07002409#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00002410
2411/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002412 * Check if this softnet_data structure belongs to another CPU.
2413 * If yes, queue it to our IPI list and return 1.
2414 * If no, return 0.
2415 */
2416static int rps_ipi_queued(struct softnet_data *sd)
2417{
2418#ifdef CONFIG_RPS
2419 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2420
2421 if (sd != mysd) {
2422 sd->rps_ipi_next = mysd->rps_ipi_list;
2423 mysd->rps_ipi_list = sd;
2424
2425 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2426 return 1;
2427 }
2428#endif /* CONFIG_RPS */
2429 return 0;
2430}
2431
2432/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00002433 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
2434 * queue (may be a remote CPU queue).
2435 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002436static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2437 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002438{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002439 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002440 unsigned long flags;
2441
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002442 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002443
2444 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002445
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002446 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002447 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2448 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002449enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002450 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002451 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002452 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00002453 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002454 return NET_RX_SUCCESS;
2455 }
2456
Eric Dumazetebda37c22010-05-06 23:51:21 +00002457 /* Schedule NAPI for backlog device
2458		 * We can use a non-atomic operation since we own the queue lock.
2459 */
2460 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002461 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002462 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002463 }
2464 goto enqueue;
2465 }
2466
Changli Gaodee42872010-05-02 05:42:16 +00002467 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002468 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002469
Tom Herbert0a9627f2010-03-16 08:03:29 +00002470 local_irq_restore(flags);
2471
2472 kfree_skb(skb);
2473 return NET_RX_DROP;
2474}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002475
Linus Torvalds1da177e2005-04-16 15:20:36 -07002476/**
2477 * netif_rx - post buffer to the network code
2478 * @skb: buffer to post
2479 *
2480 * This function receives a packet from a device driver and queues it for
2481 * the upper (protocol) levels to process. It always succeeds. The buffer
2482 * may be dropped during processing for congestion control or by the
2483 * protocol layers.
2484 *
2485 * return values:
2486 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002487 * NET_RX_DROP (packet was dropped)
2488 *
2489 */
2490
2491int netif_rx(struct sk_buff *skb)
2492{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002493 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002494
2495 /* if netpoll wants it, pretend we never saw it */
2496 if (netpoll_rx(skb))
2497 return NET_RX_DROP;
2498
Eric Dumazet3b098e22010-05-15 23:57:10 -07002499 if (netdev_tstamp_prequeue)
2500 net_timestamp_check(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002501
Eric Dumazetdf334542010-03-24 19:13:54 +00002502#ifdef CONFIG_RPS
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002503 {
Tom Herbertfec5e652010-04-16 16:01:27 -07002504 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002505 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002506
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002507 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07002508
2509 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002510 if (cpu < 0)
2511 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07002512
2513 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2514
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002515 rcu_read_unlock();
2516 }
2517#else
Tom Herbertfec5e652010-04-16 16:01:27 -07002518 {
2519 unsigned int qtail;
2520 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2521 put_cpu();
2522 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002523#endif
2524 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002525}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002526EXPORT_SYMBOL(netif_rx);
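/*
 * Example (illustrative sketch; rx_buf and len stand in for
 * device-specific receive state): a non-NAPI driver's receive
 * interrupt handing one frame to the stack:
 *
 *	skb = dev_alloc_skb(len + NET_IP_ALIGN);
 *	if (!skb)
 *		goto drop;
 *	skb_reserve(skb, NET_IP_ALIGN);
 *	memcpy(skb_put(skb, len), rx_buf, len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */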
Linus Torvalds1da177e2005-04-16 15:20:36 -07002527
2528int netif_rx_ni(struct sk_buff *skb)
2529{
2530 int err;
2531
2532 preempt_disable();
2533 err = netif_rx(skb);
2534 if (local_softirq_pending())
2535 do_softirq();
2536 preempt_enable();
2537
2538 return err;
2539}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002540EXPORT_SYMBOL(netif_rx_ni);
2541
Linus Torvalds1da177e2005-04-16 15:20:36 -07002542static void net_tx_action(struct softirq_action *h)
2543{
2544 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2545
2546 if (sd->completion_queue) {
2547 struct sk_buff *clist;
2548
2549 local_irq_disable();
2550 clist = sd->completion_queue;
2551 sd->completion_queue = NULL;
2552 local_irq_enable();
2553
2554 while (clist) {
2555 struct sk_buff *skb = clist;
2556 clist = clist->next;
2557
Ilpo Järvinen547b7922008-07-25 21:43:18 -07002558 WARN_ON(atomic_read(&skb->users));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002559 __kfree_skb(skb);
2560 }
2561 }
2562
2563 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07002564 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002565
2566 local_irq_disable();
2567 head = sd->output_queue;
2568 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00002569 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002570 local_irq_enable();
2571
2572 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07002573 struct Qdisc *q = head;
2574 spinlock_t *root_lock;
2575
Linus Torvalds1da177e2005-04-16 15:20:36 -07002576 head = head->next_sched;
2577
David S. Miller5fb66222008-08-02 20:02:43 -07002578 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07002579 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002580 smp_mb__before_clear_bit();
2581 clear_bit(__QDISC_STATE_SCHED,
2582 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07002583 qdisc_run(q);
2584 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002585 } else {
David S. Miller195648b2008-08-19 04:00:36 -07002586 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002587 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07002588 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002589 } else {
2590 smp_mb__before_clear_bit();
2591 clear_bit(__QDISC_STATE_SCHED,
2592 &q->state);
2593 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002594 }
2595 }
2596 }
2597}
2598
Stephen Hemminger6f05f622007-03-08 20:46:03 -08002599static inline int deliver_skb(struct sk_buff *skb,
2600 struct packet_type *pt_prev,
2601 struct net_device *orig_dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602{
2603 atomic_inc(&skb->users);
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002604 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002605}
2606
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002607#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2608 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
Michał Mirosławda678292009-06-05 05:35:28 +00002609/* This hook is defined here for ATM LANE */
2610int (*br_fdb_test_addr_hook)(struct net_device *dev,
2611 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07002612EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00002613#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002614
Linus Torvalds1da177e2005-04-16 15:20:36 -07002615#ifdef CONFIG_NET_CLS_ACT
 2616/* TODO: Maybe we should just force sch_ingress to be compiled in
 2617 * when CONFIG_NET_CLS_ACT is? Otherwise we eat some useless
 2618 * instructions (a compare and two extra stores) right now if we
 2619 * don't have it on but do have CONFIG_NET_CLS_ACT.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002620 * NOTE: This doesn't remove any functionality; if you don't have
Linus Torvalds1da177e2005-04-16 15:20:36 -07002621 * the ingress scheduler, you just can't add policies on ingress.
 2622 *
 2623 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002624static int ing_filter(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002625{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002626 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07002627 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07002628 struct netdev_queue *rxq;
2629 int result = TC_ACT_OK;
2630 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002631
Herbert Xuf697c3e2007-10-14 00:38:47 -07002632 if (MAX_RED_LOOP < ttl++) {
2633 printk(KERN_WARNING
2634 "Redir loop detected Dropping packet (%d->%d)\n",
Eric Dumazet8964be42009-11-20 15:35:04 -08002635 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07002636 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002637 }
2638
Herbert Xuf697c3e2007-10-14 00:38:47 -07002639 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2640 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2641
David S. Miller555353c2008-07-08 17:33:13 -07002642 rxq = &dev->rx_queue;
2643
David S. Miller83874002008-07-17 00:53:03 -07002644 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07002645 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07002646 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07002647 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2648 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07002649 spin_unlock(qdisc_lock(q));
2650 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07002651
Linus Torvalds1da177e2005-04-16 15:20:36 -07002652 return result;
2653}
Herbert Xuf697c3e2007-10-14 00:38:47 -07002654
2655static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2656 struct packet_type **pt_prev,
2657 int *ret, struct net_device *orig_dev)
2658{
David S. Miller8d50b532008-07-30 02:37:46 -07002659 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07002660 goto out;
2661
2662 if (*pt_prev) {
2663 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2664 *pt_prev = NULL;
2665 } else {
2666 /* Huh? Why does turning on AF_PACKET affect this? */
2667 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2668 }
2669
2670 switch (ing_filter(skb)) {
2671 case TC_ACT_SHOT:
2672 case TC_ACT_STOLEN:
2673 kfree_skb(skb);
2674 return NULL;
2675 }
2676
2677out:
2678 skb->tc_verd = 0;
2679 return skb;
2680}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002681#endif
2682
Patrick McHardybc1d0412008-07-14 22:49:30 -07002683/*
2684 * netif_nit_deliver - deliver received packets to network taps
2685 * @skb: buffer
2686 *
2687 * This function is used to deliver incoming packets to network
2688 * taps. It should be used when the normal netif_receive_skb path
2689 * is bypassed, for example because of VLAN acceleration.
2690 */
2691void netif_nit_deliver(struct sk_buff *skb)
2692{
2693 struct packet_type *ptype;
2694
2695 if (list_empty(&ptype_all))
2696 return;
2697
2698 skb_reset_network_header(skb);
2699 skb_reset_transport_header(skb);
2700 skb->mac_len = skb->network_header - skb->mac_header;
2701
2702 rcu_read_lock();
2703 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2704 if (!ptype->dev || ptype->dev == skb->dev)
2705 deliver_skb(skb, ptype, skb->dev);
2706 }
2707 rcu_read_unlock();
2708}
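
/*
 * Example (editor's sketch, not part of the original file): the VLAN
 * hardware-acceleration receive path is the sort of caller meant
 * here.  It bypasses netif_receive_skb() on the underlying device,
 * so without this call AF_PACKET taps (tcpdump and friends) would
 * never see the frame.  A sketch only; the real vlan_hwaccel code
 * differs in detail.
 */
#if 0
static int mydrv_vlan_rx(struct sk_buff *skb, struct net_device *vlan_dev)
{
	netif_nit_deliver(skb);		/* show the frame to the taps */
	skb->dev = vlan_dev;		/* then resume the fast path */
	return netif_receive_skb(skb);
}
#endif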
2709
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002710/**
2711 * netdev_rx_handler_register - register receive handler
2712 * @dev: device to register a handler for
2713 * @rx_handler: receive handler to register
2714 *
 2715 * Register a receive handler for a device. This handler will then be
2716 * called from __netif_receive_skb. A negative errno code is returned
2717 * on a failure.
2718 *
2719 * The caller must hold the rtnl_mutex.
2720 */
2721int netdev_rx_handler_register(struct net_device *dev,
2722 rx_handler_func_t *rx_handler)
2723{
2724 ASSERT_RTNL();
2725
2726 if (dev->rx_handler)
2727 return -EBUSY;
2728
2729 rcu_assign_pointer(dev->rx_handler, rx_handler);
2730
2731 return 0;
2732}
2733EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
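
/*
 * Example (editor's sketch, not part of the original file): how a
 * virtual device such as a bridge or macvlan might hook itself into
 * the receive path.  Returning NULL from the handler tells
 * __netif_receive_skb() the packet was consumed.  All mydrv_* names
 * are hypothetical.
 */
#if 0
static struct sk_buff *mydrv_handle_frame(struct sk_buff *skb)
{
	if (mydrv_steal_frame(skb))	/* queue to our own device */
		return NULL;		/* consumed: stop processing */
	return skb;			/* fall through to normal path */
}

static int mydrv_add_port(struct net_device *dev)
{
	int err;

	rtnl_lock();			/* the register call needs RTNL */
	err = netdev_rx_handler_register(dev, mydrv_handle_frame);
	rtnl_unlock();
	return err;
}
#endif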
2734
2735/**
2736 * netdev_rx_handler_unregister - unregister receive handler
2737 * @dev: device to unregister a handler from
2738 *
 2739 * Unregister a receive handler from a device.
2740 *
2741 * The caller must hold the rtnl_mutex.
2742 */
2743void netdev_rx_handler_unregister(struct net_device *dev)
2744{
2746 ASSERT_RTNL();
2747 rcu_assign_pointer(dev->rx_handler, NULL);
2748}
2749EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2750
Eric Dumazetacbbc072010-04-11 06:56:11 +00002751static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2752 struct net_device *master)
2753{
2754 if (skb->pkt_type == PACKET_HOST) {
2755 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2756
2757 memcpy(dest, master->dev_addr, ETH_ALEN);
2758 }
2759}
2760
2761/* On bonding slaves other than the currently active slave, suppress
2762 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2763 * ARP on active-backup slaves with arp_validate enabled.
2764 */
2765int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2766{
2767 struct net_device *dev = skb->dev;
2768
2769 if (master->priv_flags & IFF_MASTER_ARPMON)
2770 dev->last_rx = jiffies;
2771
2772 if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
 2773 /* Undo address mangling. The local destination address
 2774 * will always be the one the master has. This provides the
 2775 * right functionality in a bridge.
2776 */
2777 skb_bond_set_mac_by_master(skb, master);
2778 }
2779
2780 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2781 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2782 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2783 return 0;
2784
2785 if (master->priv_flags & IFF_MASTER_ALB) {
2786 if (skb->pkt_type != PACKET_BROADCAST &&
2787 skb->pkt_type != PACKET_MULTICAST)
2788 return 0;
2789 }
2790 if (master->priv_flags & IFF_MASTER_8023AD &&
2791 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2792 return 0;
2793
2794 return 1;
2795 }
2796 return 0;
2797}
2798EXPORT_SYMBOL(__skb_bond_should_drop);
2799
Eric Dumazet10f744d2010-03-28 23:07:20 -07002800static int __netif_receive_skb(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002801{
2802 struct packet_type *ptype, *pt_prev;
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002803 rx_handler_func_t *rx_handler;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002804 struct net_device *orig_dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002805 struct net_device *master;
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002806 struct net_device *null_or_orig;
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002807 struct net_device *null_or_bond;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002808 int ret = NET_RX_DROP;
Al Viro252e3342006-11-14 20:48:11 -08002809 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810
Eric Dumazet3b098e22010-05-15 23:57:10 -07002811 if (!netdev_tstamp_prequeue)
2812 net_timestamp_check(skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07002813
Eric Dumazet05423b22009-10-26 18:40:35 -07002814 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
Patrick McHardy9b22ea52008-11-04 14:49:57 -08002815 return NET_RX_SUCCESS;
2816
Linus Torvalds1da177e2005-04-16 15:20:36 -07002817 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002818 if (netpoll_receive_skb(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002819 return NET_RX_DROP;
2820
Eric Dumazet8964be42009-11-20 15:35:04 -08002821 if (!skb->skb_iif)
2822 skb->skb_iif = skb->dev->ifindex;
David S. Miller86e65da2005-08-09 19:36:29 -07002823
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002824 null_or_orig = NULL;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07002825 orig_dev = skb->dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002826 master = ACCESS_ONCE(orig_dev->master);
2827 if (master) {
2828 if (skb_bond_should_drop(skb, master))
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002829 null_or_orig = orig_dev; /* deliver only exact match */
2830 else
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002831 skb->dev = master;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07002832 }
Jay Vosburgh8f903c72006-02-21 16:36:44 -08002833
Eric Dumazet27f39c73e2010-05-19 22:07:23 +00002834 __this_cpu_inc(softnet_data.processed);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002835 skb_reset_network_header(skb);
Arnaldo Carvalho de Melobadff6d2007-03-13 13:06:52 -03002836 skb_reset_transport_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07002837 skb->mac_len = skb->network_header - skb->mac_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002838
2839 pt_prev = NULL;
2840
2841 rcu_read_lock();
2842
2843#ifdef CONFIG_NET_CLS_ACT
2844 if (skb->tc_verd & TC_NCLS) {
2845 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2846 goto ncls;
2847 }
2848#endif
2849
2850 list_for_each_entry_rcu(ptype, &ptype_all, list) {
Joe Eykholtf9823072008-07-02 18:22:02 -07002851 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2852 ptype->dev == orig_dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002853 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002854 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002855 pt_prev = ptype;
2856 }
2857 }
2858
2859#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07002860 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2861 if (!skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002862 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002863ncls:
2864#endif
2865
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002866 /* Handle special case of bridge or macvlan */
2867 rx_handler = rcu_dereference(skb->dev->rx_handler);
2868 if (rx_handler) {
2869 if (pt_prev) {
2870 ret = deliver_skb(skb, pt_prev, orig_dev);
2871 pt_prev = NULL;
2872 }
2873 skb = rx_handler(skb);
2874 if (!skb)
2875 goto out;
2876 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002877
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002878 /*
2879 * Make sure frames received on VLAN interfaces stacked on
2880 * bonding interfaces still make their way to any base bonding
2881 * device that may have registered for a specific ptype. The
2882 * handler may have to adjust skb->dev and orig_dev.
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002883 */
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002884 null_or_bond = NULL;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002885 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2886 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002887 null_or_bond = vlan_dev_real_dev(skb->dev);
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002888 }
2889
Linus Torvalds1da177e2005-04-16 15:20:36 -07002890 type = skb->protocol;
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08002891 list_for_each_entry_rcu(ptype,
2892 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002893 if (ptype->type == type && (ptype->dev == null_or_orig ||
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002894 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2895 ptype->dev == null_or_bond)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002896 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002897 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002898 pt_prev = ptype;
2899 }
2900 }
2901
2902 if (pt_prev) {
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002903 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002904 } else {
2905 kfree_skb(skb);
 2906 /* Jamal, now you will not be able to escape explaining
 2907 * to me how you were going to use this. :-)
2908 */
2909 ret = NET_RX_DROP;
2910 }
2911
2912out:
2913 rcu_read_unlock();
2914 return ret;
2915}
Tom Herbert0a9627f2010-03-16 08:03:29 +00002916
2917/**
2918 * netif_receive_skb - process receive buffer from network
2919 * @skb: buffer to process
2920 *
2921 * netif_receive_skb() is the main receive data processing function.
2922 * It always succeeds. The buffer may be dropped during processing
2923 * for congestion control or by the protocol layers.
2924 *
2925 * This function may only be called from softirq context and interrupts
2926 * should be enabled.
2927 *
2928 * Return values (usually ignored):
2929 * NET_RX_SUCCESS: no congestion
2930 * NET_RX_DROP: packet was dropped
2931 */
2932int netif_receive_skb(struct sk_buff *skb)
2933{
Eric Dumazet3b098e22010-05-15 23:57:10 -07002934 if (netdev_tstamp_prequeue)
2935 net_timestamp_check(skb);
2936
Eric Dumazetdf334542010-03-24 19:13:54 +00002937#ifdef CONFIG_RPS
Eric Dumazet3b098e22010-05-15 23:57:10 -07002938 {
2939 struct rps_dev_flow voidflow, *rflow = &voidflow;
2940 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002941
Eric Dumazet3b098e22010-05-15 23:57:10 -07002942 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00002943
Eric Dumazet3b098e22010-05-15 23:57:10 -07002944 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07002945
Eric Dumazet3b098e22010-05-15 23:57:10 -07002946 if (cpu >= 0) {
2947 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2948 rcu_read_unlock();
2949 } else {
2950 rcu_read_unlock();
2951 ret = __netif_receive_skb(skb);
2952 }
2953
2954 return ret;
Tom Herbertfec5e652010-04-16 16:01:27 -07002955 }
Tom Herbert1e94d722010-03-18 17:45:44 -07002956#else
2957 return __netif_receive_skb(skb);
2958#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00002959}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002960EXPORT_SYMBOL(netif_receive_skb);
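
/*
 * Example (editor's sketch, not part of the original file): a NAPI
 * driver feeding received frames to the stack from its ->poll()
 * callback, which provides the softirq context this function
 * requires.  All mydrv_* names are hypothetical.
 */
#if 0
static int mydrv_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_priv *priv = container_of(napi, struct mydrv_priv,
					       napi);
	int work = 0;
	struct sk_buff *skb;

	while (work < budget && (skb = mydrv_next_rx_frame(priv))) {
		skb->protocol = eth_type_trans(skb, priv->dev);
		netif_receive_skb(skb);	/* return value usually ignored */
		work++;
	}
	if (work < budget) {		/* ring drained: stop polling */
		napi_complete(napi);
		mydrv_enable_rx_irq(priv);
	}
	return work;
}
#endif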
Linus Torvalds1da177e2005-04-16 15:20:36 -07002961
Eric Dumazet88751272010-04-19 05:07:33 +00002962/* Network device is going away, flush any packets still pending
2963 * Called with irqs disabled.
2964 */
Changli Gao152102c2010-03-30 20:16:22 +00002965static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002966{
Changli Gao152102c2010-03-30 20:16:22 +00002967 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002968 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002969 struct sk_buff *skb, *tmp;
2970
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002971 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002972 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002973 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002974 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002975 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002976 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002977 }
Changli Gao6e7676c2010-04-27 15:07:33 -07002978 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002979 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002980
2981 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
2982 if (skb->dev == dev) {
2983 __skb_unlink(skb, &sd->process_queue);
2984 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002985 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002986 }
2987 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002988}
2989
Herbert Xud565b0a2008-12-15 23:38:52 -08002990static int napi_gro_complete(struct sk_buff *skb)
2991{
2992 struct packet_type *ptype;
2993 __be16 type = skb->protocol;
2994 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2995 int err = -ENOENT;
2996
Herbert Xufc59f9a2009-04-14 15:11:06 -07002997 if (NAPI_GRO_CB(skb)->count == 1) {
2998 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08002999 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07003000 }
Herbert Xud565b0a2008-12-15 23:38:52 -08003001
3002 rcu_read_lock();
3003 list_for_each_entry_rcu(ptype, head, list) {
3004 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3005 continue;
3006
3007 err = ptype->gro_complete(skb);
3008 break;
3009 }
3010 rcu_read_unlock();
3011
3012 if (err) {
3013 WARN_ON(&ptype->list == head);
3014 kfree_skb(skb);
3015 return NET_RX_SUCCESS;
3016 }
3017
3018out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003019 return netif_receive_skb(skb);
3020}
3021
David S. Miller11380a42010-01-19 13:46:10 -08003022static void napi_gro_flush(struct napi_struct *napi)
Herbert Xud565b0a2008-12-15 23:38:52 -08003023{
3024 struct sk_buff *skb, *next;
3025
3026 for (skb = napi->gro_list; skb; skb = next) {
3027 next = skb->next;
3028 skb->next = NULL;
3029 napi_gro_complete(skb);
3030 }
3031
Herbert Xu4ae55442009-02-08 18:00:36 +00003032 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003033 napi->gro_list = NULL;
3034}
Herbert Xud565b0a2008-12-15 23:38:52 -08003035
Ben Hutchings5b252f02009-10-29 07:17:09 +00003036enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003037{
3038 struct sk_buff **pp = NULL;
3039 struct packet_type *ptype;
3040 __be16 type = skb->protocol;
3041 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
Herbert Xu0da2afd52008-12-26 14:57:42 -08003042 int same_flow;
Herbert Xud565b0a2008-12-15 23:38:52 -08003043 int mac_len;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003044 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003045
3046 if (!(skb->dev->features & NETIF_F_GRO))
3047 goto normal;
3048
David S. Miller4cf704f2009-06-09 00:18:51 -07003049 if (skb_is_gso(skb) || skb_has_frags(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003050 goto normal;
3051
Herbert Xud565b0a2008-12-15 23:38:52 -08003052 rcu_read_lock();
3053 list_for_each_entry_rcu(ptype, head, list) {
Herbert Xud565b0a2008-12-15 23:38:52 -08003054 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3055 continue;
3056
Herbert Xu86911732009-01-29 14:19:50 +00003057 skb_set_network_header(skb, skb_gro_offset(skb));
Herbert Xud565b0a2008-12-15 23:38:52 -08003058 mac_len = skb->network_header - skb->mac_header;
3059 skb->mac_len = mac_len;
3060 NAPI_GRO_CB(skb)->same_flow = 0;
3061 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003062 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003063
Herbert Xud565b0a2008-12-15 23:38:52 -08003064 pp = ptype->gro_receive(&napi->gro_list, skb);
3065 break;
3066 }
3067 rcu_read_unlock();
3068
3069 if (&ptype->list == head)
3070 goto normal;
3071
Herbert Xu0da2afd52008-12-26 14:57:42 -08003072 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003073 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003074
Herbert Xud565b0a2008-12-15 23:38:52 -08003075 if (pp) {
3076 struct sk_buff *nskb = *pp;
3077
3078 *pp = nskb->next;
3079 nskb->next = NULL;
3080 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003081 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003082 }
3083
Herbert Xu0da2afd52008-12-26 14:57:42 -08003084 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003085 goto ok;
3086
Herbert Xu4ae55442009-02-08 18:00:36 +00003087 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003088 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003089
Herbert Xu4ae55442009-02-08 18:00:36 +00003090 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003091 NAPI_GRO_CB(skb)->count = 1;
Herbert Xu86911732009-01-29 14:19:50 +00003092 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003093 skb->next = napi->gro_list;
3094 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003095 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003096
Herbert Xuad0f9902009-02-01 01:24:55 -08003097pull:
Herbert Xucb189782009-05-26 18:50:31 +00003098 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3099 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3100
3101 BUG_ON(skb->end - skb->tail < grow);
3102
3103 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3104
3105 skb->tail += grow;
3106 skb->data_len -= grow;
3107
3108 skb_shinfo(skb)->frags[0].page_offset += grow;
3109 skb_shinfo(skb)->frags[0].size -= grow;
3110
3111 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3112 put_page(skb_shinfo(skb)->frags[0].page);
3113 memmove(skb_shinfo(skb)->frags,
3114 skb_shinfo(skb)->frags + 1,
3115 --skb_shinfo(skb)->nr_frags);
3116 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003117 }
3118
Herbert Xud565b0a2008-12-15 23:38:52 -08003119ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003120 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003121
3122normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003123 ret = GRO_NORMAL;
3124 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003125}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003126EXPORT_SYMBOL(dev_gro_receive);
3127
Ben Hutchings5b252f02009-10-29 07:17:09 +00003128static gro_result_t
3129__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003130{
3131 struct sk_buff *p;
3132
Herbert Xud1c76af2009-03-16 10:50:02 -07003133 if (netpoll_rx_on(skb))
3134 return GRO_NORMAL;
3135
Herbert Xu96e93ea2009-01-06 10:49:34 -08003136 for (p = napi->gro_list; p; p = p->next) {
Joe Perchesf64f9e72009-11-29 16:55:45 -08003137 NAPI_GRO_CB(p)->same_flow =
3138 (p->dev == skb->dev) &&
3139 !compare_ether_header(skb_mac_header(p),
3140 skb_gro_mac_header(skb));
Herbert Xu96e93ea2009-01-06 10:49:34 -08003141 NAPI_GRO_CB(p)->flush = 0;
3142 }
3143
3144 return dev_gro_receive(napi, skb);
3145}
Herbert Xu5d38a072009-01-04 16:13:40 -08003146
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003147gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003148{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003149 switch (ret) {
3150 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003151 if (netif_receive_skb(skb))
3152 ret = GRO_DROP;
3153 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003154
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003155 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003156 case GRO_MERGED_FREE:
Herbert Xu5d38a072009-01-04 16:13:40 -08003157 kfree_skb(skb);
3158 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003159
3160 case GRO_HELD:
3161 case GRO_MERGED:
3162 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003163 }
3164
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003165 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003166}
3167EXPORT_SYMBOL(napi_skb_finish);
3168
Herbert Xu78a478d2009-05-26 18:50:21 +00003169void skb_gro_reset_offset(struct sk_buff *skb)
3170{
3171 NAPI_GRO_CB(skb)->data_offset = 0;
3172 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003173 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003174
Herbert Xu78d3fd02009-05-26 18:50:23 +00003175 if (skb->mac_header == skb->tail &&
Herbert Xu74895942009-05-26 18:50:27 +00003176 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
Herbert Xu78a478d2009-05-26 18:50:21 +00003177 NAPI_GRO_CB(skb)->frag0 =
3178 page_address(skb_shinfo(skb)->frags[0].page) +
3179 skb_shinfo(skb)->frags[0].page_offset;
Herbert Xu74895942009-05-26 18:50:27 +00003180 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3181 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003182}
3183EXPORT_SYMBOL(skb_gro_reset_offset);
3184
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003185gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003186{
Herbert Xu86911732009-01-29 14:19:50 +00003187 skb_gro_reset_offset(skb);
3188
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003189 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003190}
3191EXPORT_SYMBOL(napi_gro_receive);
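
/*
 * Example (editor's sketch, not part of the original file): a driver
 * opts into GRO by handing frames to napi_gro_receive() from its
 * ->poll() loop instead of calling netif_receive_skb() directly;
 * held and merged skbs are flushed when NAPI completes.  All mydrv_*
 * names are hypothetical.
 */
#if 0
static int mydrv_gro_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_priv *priv = container_of(napi, struct mydrv_priv,
					       napi);
	int work = 0;
	struct sk_buff *skb;

	while (work < budget && (skb = mydrv_next_rx_frame(priv))) {
		skb->protocol = eth_type_trans(skb, priv->dev);
		napi_gro_receive(napi, skb); /* not netif_receive_skb() */
		work++;
	}
	if (work < budget) {
		napi_complete(napi);	/* also flushes held GRO skbs */
		mydrv_enable_rx_irq(priv);
	}
	return work;
}
#endif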
3192
Herbert Xu96e93ea2009-01-06 10:49:34 -08003193void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3194{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003195 __skb_pull(skb, skb_headlen(skb));
3196 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3197
3198 napi->skb = skb;
3199}
3200EXPORT_SYMBOL(napi_reuse_skb);
3201
Herbert Xu76620aa2009-04-16 02:02:07 -07003202struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003203{
Herbert Xu5d38a072009-01-04 16:13:40 -08003204 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003205
3206 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003207 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3208 if (skb)
3209 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003210 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003211 return skb;
3212}
Herbert Xu76620aa2009-04-16 02:02:07 -07003213EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003214
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003215gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3216 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003217{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003218 switch (ret) {
3219 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003220 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003221 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003222
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003223 if (ret == GRO_HELD)
3224 skb_gro_pull(skb, -ETH_HLEN);
3225 else if (netif_receive_skb(skb))
3226 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003227 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003228
3229 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003230 case GRO_MERGED_FREE:
3231 napi_reuse_skb(napi, skb);
3232 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003233
3234 case GRO_MERGED:
3235 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003236 }
3237
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003238 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003239}
3240EXPORT_SYMBOL(napi_frags_finish);
3241
Herbert Xu76620aa2009-04-16 02:02:07 -07003242struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003243{
Herbert Xu76620aa2009-04-16 02:02:07 -07003244 struct sk_buff *skb = napi->skb;
3245 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00003246 unsigned int hlen;
3247 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07003248
3249 napi->skb = NULL;
3250
3251 skb_reset_mac_header(skb);
3252 skb_gro_reset_offset(skb);
3253
Herbert Xua5b1cf22009-05-26 18:50:28 +00003254 off = skb_gro_offset(skb);
3255 hlen = off + sizeof(*eth);
3256 eth = skb_gro_header_fast(skb, off);
3257 if (skb_gro_header_hard(skb, hlen)) {
3258 eth = skb_gro_header_slow(skb, hlen, off);
3259 if (unlikely(!eth)) {
3260 napi_reuse_skb(napi, skb);
3261 skb = NULL;
3262 goto out;
3263 }
Herbert Xu76620aa2009-04-16 02:02:07 -07003264 }
3265
3266 skb_gro_pull(skb, sizeof(*eth));
3267
3268 /*
3269 * This works because the only protocols we care about don't require
3270 * special handling. We'll fix it up properly at the end.
3271 */
3272 skb->protocol = eth->h_proto;
3273
3274out:
3275 return skb;
3276}
3277EXPORT_SYMBOL(napi_frags_skb);
3278
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003279gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07003280{
3281 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003282
3283 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003284 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003285
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003286 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08003287}
3288EXPORT_SYMBOL(napi_gro_frags);
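
/*
 * Example (editor's sketch, not part of the original file): a driver
 * that DMAs frames straight into pages need not build its own skb.
 * It borrows napi->skb via napi_get_frags(), attaches the page as a
 * fragment, and lets napi_gro_frags() pull the Ethernet header and
 * run GRO.  All mydrv_* names are hypothetical.
 */
#if 0
static void mydrv_rx_page(struct napi_struct *napi, struct page *page,
			  unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb) {
		mydrv_recycle_page(page);	/* allocation failed */
		return;
	}
	skb_fill_page_desc(skb, 0, page, 0, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += len;

	napi_gro_frags(napi);	/* consumes or recycles napi->skb */
}
#endif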
3289
Eric Dumazete326bed2010-04-22 00:22:45 -07003290/*
3291 * net_rps_action sends any pending IPI's for rps.
3292 * Note: called with local irq disabled, but exits with local irq enabled.
3293 */
3294static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3295{
3296#ifdef CONFIG_RPS
3297 struct softnet_data *remsd = sd->rps_ipi_list;
3298
3299 if (remsd) {
3300 sd->rps_ipi_list = NULL;
3301
3302 local_irq_enable();
3303
 3304 /* Send pending IPIs to kick RPS processing on remote cpus. */
3305 while (remsd) {
3306 struct softnet_data *next = remsd->rps_ipi_next;
3307
3308 if (cpu_online(remsd->cpu))
3309 __smp_call_function_single(remsd->cpu,
3310 &remsd->csd, 0);
3311 remsd = next;
3312 }
3313 } else
3314#endif
3315 local_irq_enable();
3316}
3317
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003318static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003319{
3320 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003321 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003322
Eric Dumazete326bed2010-04-22 00:22:45 -07003323#ifdef CONFIG_RPS
3324 /* Check if we have pending ipi, its better to send them now,
3325 * not waiting net_rx_action() end.
3326 */
3327 if (sd->rps_ipi_list) {
3328 local_irq_disable();
3329 net_rps_action_and_irq_enable(sd);
3330 }
3331#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003332 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07003333 local_irq_disable();
3334 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003335 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07003336 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003337
Changli Gao6e7676c2010-04-27 15:07:33 -07003338 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07003339 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07003340 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07003341 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00003342 input_queue_head_incr(sd);
3343 if (++work >= quota) {
3344 local_irq_enable();
3345 return work;
3346 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003347 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003348
Changli Gao6e7676c2010-04-27 15:07:33 -07003349 rps_lock(sd);
3350 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003351 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07003352 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3353 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003354
Changli Gao6e7676c2010-04-27 15:07:33 -07003355 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003356 /*
3357 * Inline a custom version of __napi_complete().
 3358 * Only the current cpu owns and manipulates this napi,
 3359 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
 3360 * so we can use a plain write instead of clear_bit()
 3361 * and we don't need an smp_mb() memory barrier.
3362 */
3363 list_del(&napi->poll_list);
3364 napi->state = 0;
3365
Changli Gao6e7676c2010-04-27 15:07:33 -07003366 quota = work + qlen;
3367 }
3368 rps_unlock(sd);
3369 }
3370 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003371
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003372 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003373}
3374
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003375/**
3376 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07003377 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003378 *
3379 * The entry's receive function will be scheduled to run
3380 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08003381void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003382{
3383 unsigned long flags;
3384
3385 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003386 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003387 local_irq_restore(flags);
3388}
3389EXPORT_SYMBOL(__napi_schedule);
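
/*
 * Example (editor's sketch, not part of the original file): the
 * usual entry point is napi_schedule(), called from a device's
 * interrupt handler, which tests NAPI_STATE_SCHED and then calls
 * __napi_schedule().  All mydrv_* names are hypothetical.
 */
#if 0
static irqreturn_t mydrv_interrupt(int irq, void *dev_id)
{
	struct mydrv_priv *priv = dev_id;

	if (!mydrv_rx_pending(priv))
		return IRQ_NONE;

	mydrv_disable_rx_irq(priv);	/* quiet the device ... */
	napi_schedule(&priv->napi);	/* ... and poll from softirq */
	return IRQ_HANDLED;
}
#endif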
3390
Herbert Xud565b0a2008-12-15 23:38:52 -08003391void __napi_complete(struct napi_struct *n)
3392{
3393 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3394 BUG_ON(n->gro_list);
3395
3396 list_del(&n->poll_list);
3397 smp_mb__before_clear_bit();
3398 clear_bit(NAPI_STATE_SCHED, &n->state);
3399}
3400EXPORT_SYMBOL(__napi_complete);
3401
3402void napi_complete(struct napi_struct *n)
3403{
3404 unsigned long flags;
3405
3406 /*
3407 * don't let napi dequeue from the cpu poll list
 3408 * just in case it's running on a different cpu
3409 */
3410 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3411 return;
3412
3413 napi_gro_flush(n);
3414 local_irq_save(flags);
3415 __napi_complete(n);
3416 local_irq_restore(flags);
3417}
3418EXPORT_SYMBOL(napi_complete);
3419
3420void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3421 int (*poll)(struct napi_struct *, int), int weight)
3422{
3423 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00003424 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003425 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08003426 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003427 napi->poll = poll;
3428 napi->weight = weight;
3429 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08003430 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08003431#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08003432 spin_lock_init(&napi->poll_lock);
3433 napi->poll_owner = -1;
3434#endif
3435 set_bit(NAPI_STATE_SCHED, &napi->state);
3436}
3437EXPORT_SYMBOL(netif_napi_add);
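
/*
 * Example (editor's sketch, not part of the original file): drivers
 * register their poll function once at probe/setup time; 64 is the
 * weight most ordinary hardware uses.  All mydrv_* names are
 * hypothetical.
 */
#if 0
static int mydrv_setup(struct net_device *dev)
{
	struct mydrv_priv *priv = netdev_priv(dev);

	netif_napi_add(dev, &priv->napi, mydrv_poll, 64);
	return 0;
}
#endif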
3438
3439void netif_napi_del(struct napi_struct *napi)
3440{
3441 struct sk_buff *skb, *next;
3442
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08003443 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07003444 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08003445
3446 for (skb = napi->gro_list; skb; skb = next) {
3447 next = skb->next;
3448 skb->next = NULL;
3449 kfree_skb(skb);
3450 }
3451
3452 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00003453 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003454}
3455EXPORT_SYMBOL(netif_napi_del);
3456
Linus Torvalds1da177e2005-04-16 15:20:36 -07003457static void net_rx_action(struct softirq_action *h)
3458{
Eric Dumazete326bed2010-04-22 00:22:45 -07003459 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003460 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07003461 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07003462 void *have;
3463
Linus Torvalds1da177e2005-04-16 15:20:36 -07003464 local_irq_disable();
3465
Eric Dumazete326bed2010-04-22 00:22:45 -07003466 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003467 struct napi_struct *n;
3468 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003469
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003470 /* If the softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003471 * Allow this to run for 2 jiffies, which allows
 3472 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003473 */
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003474 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003475 goto softnet_break;
3476
3477 local_irq_enable();
3478
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003479 /* Even though interrupts have been re-enabled, this
3480 * access is safe because interrupts can only add new
3481 * entries to the tail of this list, and only ->poll()
3482 * calls can remove this head entry from the list.
3483 */
Eric Dumazete326bed2010-04-22 00:22:45 -07003484 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003485
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003486 have = netpoll_poll_lock(n);
3487
3488 weight = n->weight;
3489
David S. Miller0a7606c2007-10-29 21:28:47 -07003490 /* This NAPI_STATE_SCHED test is for avoiding a race
3491 * with netpoll's poll_napi(). Only the entity which
3492 * obtains the lock and sees NAPI_STATE_SCHED set will
3493 * actually make the ->poll() call. Therefore we avoid
 3494 * accidentally calling ->poll() when NAPI is not scheduled.
3495 */
3496 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00003497 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07003498 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00003499 trace_napi_poll(n);
3500 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003501
3502 WARN_ON_ONCE(work > weight);
3503
3504 budget -= work;
3505
3506 local_irq_disable();
3507
3508 /* Drivers must not modify the NAPI state if they
3509 * consume the entire weight. In such cases this code
3510 * still "owns" the NAPI instance and therefore can
3511 * move the instance around on the list at-will.
3512 */
David S. Millerfed17f32008-01-07 21:00:40 -08003513 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07003514 if (unlikely(napi_disable_pending(n))) {
3515 local_irq_enable();
3516 napi_complete(n);
3517 local_irq_disable();
3518 } else
Eric Dumazete326bed2010-04-22 00:22:45 -07003519 list_move_tail(&n->poll_list, &sd->poll_list);
David S. Millerfed17f32008-01-07 21:00:40 -08003520 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003521
3522 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003523 }
3524out:
Eric Dumazete326bed2010-04-22 00:22:45 -07003525 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003526
Chris Leechdb217332006-06-17 21:24:58 -07003527#ifdef CONFIG_NET_DMA
3528 /*
3529 * There may not be any more sk_buffs coming right now, so push
3530 * any pending DMA copies to hardware
3531 */
Dan Williams2ba05622009-01-06 11:38:14 -07003532 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07003533#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003534
Linus Torvalds1da177e2005-04-16 15:20:36 -07003535 return;
3536
3537softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00003538 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003539 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3540 goto out;
3541}
3542
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003543static gifconf_func_t *gifconf_list[NPROTO];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003544
3545/**
3546 * register_gifconf - register a SIOCGIF handler
3547 * @family: Address family
3548 * @gifconf: Function handler
3549 *
3550 * Register protocol dependent address dumping routines. The handler
3551 * that is passed must not be freed or reused until it has been replaced
3552 * by another handler.
3553 */
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003554int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003555{
3556 if (family >= NPROTO)
3557 return -EINVAL;
3558 gifconf_list[family] = gifconf;
3559 return 0;
3560}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003561EXPORT_SYMBOL(register_gifconf);
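
/*
 * Example (editor's sketch, not part of the original file): AF_INET
 * wires up its SIOCGIFCONF dumping at boot roughly like this;
 * inet_gifconf() is the real handler in net/ipv4/devinet.c.
 */
#if 0
	register_gifconf(PF_INET, inet_gifconf);
#endif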
Linus Torvalds1da177e2005-04-16 15:20:36 -07003562
3563
3564/*
3565 * Map an interface index to its name (SIOCGIFNAME)
3566 */
3567
3568/*
3569 * We need this ioctl for efficient implementation of the
3570 * if_indextoname() function required by the IPv6 API. Without
3571 * it, we would have to search all the interfaces to find a
3572 * match. --pb
3573 */
3574
Eric W. Biederman881d9662007-09-17 11:56:21 -07003575static int dev_ifname(struct net *net, struct ifreq __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003576{
3577 struct net_device *dev;
3578 struct ifreq ifr;
3579
3580 /*
3581 * Fetch the caller's info block.
3582 */
3583
3584 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3585 return -EFAULT;
3586
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003587 rcu_read_lock();
3588 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003589 if (!dev) {
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003590 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003591 return -ENODEV;
3592 }
3593
3594 strcpy(ifr.ifr_name, dev->name);
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003595 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003596
3597 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3598 return -EFAULT;
3599 return 0;
3600}
3601
3602/*
3603 * Perform a SIOCGIFCONF call. This structure will change
3604 * size eventually, and there is nothing I can do about it.
3605 * Thus we will need a 'compatibility mode'.
3606 */
3607
Eric W. Biederman881d9662007-09-17 11:56:21 -07003608static int dev_ifconf(struct net *net, char __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003609{
3610 struct ifconf ifc;
3611 struct net_device *dev;
3612 char __user *pos;
3613 int len;
3614 int total;
3615 int i;
3616
3617 /*
3618 * Fetch the caller's info block.
3619 */
3620
3621 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3622 return -EFAULT;
3623
3624 pos = ifc.ifc_buf;
3625 len = ifc.ifc_len;
3626
3627 /*
3628 * Loop over the interfaces, and write an info block for each.
3629 */
3630
3631 total = 0;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003632 for_each_netdev(net, dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003633 for (i = 0; i < NPROTO; i++) {
3634 if (gifconf_list[i]) {
3635 int done;
3636 if (!pos)
3637 done = gifconf_list[i](dev, NULL, 0);
3638 else
3639 done = gifconf_list[i](dev, pos + total,
3640 len - total);
3641 if (done < 0)
3642 return -EFAULT;
3643 total += done;
3644 }
3645 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003646 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003647
3648 /*
3649 * All done. Write the updated control block back to the caller.
3650 */
3651 ifc.ifc_len = total;
3652
3653 /*
3654 * Both BSD and Solaris return 0 here, so we do too.
3655 */
3656 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3657}
3658
3659#ifdef CONFIG_PROC_FS
3660/*
3661 * This is invoked by the /proc filesystem handler to display a device
3662 * in detail.
3663 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003664void *dev_seq_start(struct seq_file *seq, loff_t *pos)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003665 __acquires(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003666{
Denis V. Luneve372c412007-11-19 22:31:54 -08003667 struct net *net = seq_file_net(seq);
Pavel Emelianov7562f872007-05-03 15:13:45 -07003668 loff_t off;
3669 struct net_device *dev;
3670
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003671 rcu_read_lock();
Pavel Emelianov7562f872007-05-03 15:13:45 -07003672 if (!*pos)
3673 return SEQ_START_TOKEN;
3674
3675 off = 1;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003676 for_each_netdev_rcu(net, dev)
Pavel Emelianov7562f872007-05-03 15:13:45 -07003677 if (off++ == *pos)
3678 return dev;
3679
3680 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003681}
3682
3683void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3684{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003685 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3686 first_net_device(seq_file_net(seq)) :
3687 next_net_device((struct net_device *)v);
3688
Linus Torvalds1da177e2005-04-16 15:20:36 -07003689 ++*pos;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003690 return rcu_dereference(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003691}
3692
3693void dev_seq_stop(struct seq_file *seq, void *v)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003694 __releases(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003695{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003696 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003697}
3698
3699static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3700{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08003701 const struct net_device_stats *stats = dev_get_stats(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003702
Jesper Dangaard Brouer2d13baf2010-01-05 05:50:52 +00003703 seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
Rusty Russell5a1b5892007-04-28 21:04:03 -07003704 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3705 dev->name, stats->rx_bytes, stats->rx_packets,
3706 stats->rx_errors,
3707 stats->rx_dropped + stats->rx_missed_errors,
3708 stats->rx_fifo_errors,
3709 stats->rx_length_errors + stats->rx_over_errors +
3710 stats->rx_crc_errors + stats->rx_frame_errors,
3711 stats->rx_compressed, stats->multicast,
3712 stats->tx_bytes, stats->tx_packets,
3713 stats->tx_errors, stats->tx_dropped,
3714 stats->tx_fifo_errors, stats->collisions,
3715 stats->tx_carrier_errors +
3716 stats->tx_aborted_errors +
3717 stats->tx_window_errors +
3718 stats->tx_heartbeat_errors,
3719 stats->tx_compressed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003720}
3721
3722/*
3723 * Called from the PROCfs module. This now uses the new arbitrary sized
3724 * /proc/net interface to create /proc/net/dev
3725 */
3726static int dev_seq_show(struct seq_file *seq, void *v)
3727{
3728 if (v == SEQ_START_TOKEN)
3729 seq_puts(seq, "Inter-| Receive "
3730 " | Transmit\n"
3731 " face |bytes packets errs drop fifo frame "
3732 "compressed multicast|bytes packets errs "
3733 "drop fifo colls carrier compressed\n");
3734 else
3735 dev_seq_printf_stats(seq, v);
3736 return 0;
3737}
3738
Changli Gaodee42872010-05-02 05:42:16 +00003739static struct softnet_data *softnet_get_online(loff_t *pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003740{
Changli Gaodee42872010-05-02 05:42:16 +00003741 struct softnet_data *sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003742
Mike Travis0c0b0ac2008-05-02 16:43:08 -07003743 while (*pos < nr_cpu_ids)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003744 if (cpu_online(*pos)) {
Changli Gaodee42872010-05-02 05:42:16 +00003745 sd = &per_cpu(softnet_data, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003746 break;
3747 } else
3748 ++*pos;
Changli Gaodee42872010-05-02 05:42:16 +00003749 return sd;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003750}
3751
3752static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3753{
3754 return softnet_get_online(pos);
3755}
3756
3757static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3758{
3759 ++*pos;
3760 return softnet_get_online(pos);
3761}
3762
3763static void softnet_seq_stop(struct seq_file *seq, void *v)
3764{
3765}
3766
3767static int softnet_seq_show(struct seq_file *seq, void *v)
3768{
Changli Gaodee42872010-05-02 05:42:16 +00003769 struct softnet_data *sd = v;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003770
Tom Herbert0a9627f2010-03-16 08:03:29 +00003771 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
Changli Gaodee42872010-05-02 05:42:16 +00003772 sd->processed, sd->dropped, sd->time_squeeze, 0,
Stephen Hemmingerc1ebcdb2005-06-23 20:08:59 -07003773 0, 0, 0, 0, /* was fastroute */
Changli Gaodee42872010-05-02 05:42:16 +00003774 sd->cpu_collision, sd->received_rps);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003775 return 0;
3776}
3777
Stephen Hemmingerf6908082007-03-12 14:34:29 -07003778static const struct seq_operations dev_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003779 .start = dev_seq_start,
3780 .next = dev_seq_next,
3781 .stop = dev_seq_stop,
3782 .show = dev_seq_show,
3783};
3784
3785static int dev_seq_open(struct inode *inode, struct file *file)
3786{
Denis V. Luneve372c412007-11-19 22:31:54 -08003787 return seq_open_net(inode, file, &dev_seq_ops,
3788 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003789}
3790
Arjan van de Ven9a321442007-02-12 00:55:35 -08003791static const struct file_operations dev_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003792 .owner = THIS_MODULE,
3793 .open = dev_seq_open,
3794 .read = seq_read,
3795 .llseek = seq_lseek,
Denis V. Luneve372c412007-11-19 22:31:54 -08003796 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003797};
3798
Stephen Hemmingerf6908082007-03-12 14:34:29 -07003799static const struct seq_operations softnet_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003800 .start = softnet_seq_start,
3801 .next = softnet_seq_next,
3802 .stop = softnet_seq_stop,
3803 .show = softnet_seq_show,
3804};
3805
3806static int softnet_seq_open(struct inode *inode, struct file *file)
3807{
3808 return seq_open(file, &softnet_seq_ops);
3809}
3810
Arjan van de Ven9a321442007-02-12 00:55:35 -08003811static const struct file_operations softnet_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003812 .owner = THIS_MODULE,
3813 .open = softnet_seq_open,
3814 .read = seq_read,
3815 .llseek = seq_lseek,
3816 .release = seq_release,
3817};
3818
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003819static void *ptype_get_idx(loff_t pos)
3820{
3821 struct packet_type *pt = NULL;
3822 loff_t i = 0;
3823 int t;
3824
3825 list_for_each_entry_rcu(pt, &ptype_all, list) {
3826 if (i == pos)
3827 return pt;
3828 ++i;
3829 }
3830
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003831 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003832 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3833 if (i == pos)
3834 return pt;
3835 ++i;
3836 }
3837 }
3838 return NULL;
3839}
3840
3841static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
Stephen Hemminger72348a42008-01-21 02:27:29 -08003842 __acquires(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003843{
3844 rcu_read_lock();
3845 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3846}
3847
3848static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3849{
3850 struct packet_type *pt;
3851 struct list_head *nxt;
3852 int hash;
3853
3854 ++*pos;
3855 if (v == SEQ_START_TOKEN)
3856 return ptype_get_idx(0);
3857
3858 pt = v;
3859 nxt = pt->list.next;
3860 if (pt->type == htons(ETH_P_ALL)) {
3861 if (nxt != &ptype_all)
3862 goto found;
3863 hash = 0;
3864 nxt = ptype_base[0].next;
3865 } else
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003866 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003867
3868 while (nxt == &ptype_base[hash]) {
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003869 if (++hash >= PTYPE_HASH_SIZE)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003870 return NULL;
3871 nxt = ptype_base[hash].next;
3872 }
3873found:
3874 return list_entry(nxt, struct packet_type, list);
3875}
3876
3877static void ptype_seq_stop(struct seq_file *seq, void *v)
Stephen Hemminger72348a42008-01-21 02:27:29 -08003878 __releases(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003879{
3880 rcu_read_unlock();
3881}
3882
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003883static int ptype_seq_show(struct seq_file *seq, void *v)
3884{
3885 struct packet_type *pt = v;
3886
3887 if (v == SEQ_START_TOKEN)
3888 seq_puts(seq, "Type Device Function\n");
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09003889 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003890 if (pt->type == htons(ETH_P_ALL))
3891 seq_puts(seq, "ALL ");
3892 else
3893 seq_printf(seq, "%04x", ntohs(pt->type));
3894
Alexey Dobriyan908cd2d2008-11-16 19:50:35 -08003895 seq_printf(seq, " %-8s %pF\n",
3896 pt->dev ? pt->dev->name : "", pt->func);
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003897 }
3898
3899 return 0;
3900}
3901
3902static const struct seq_operations ptype_seq_ops = {
3903 .start = ptype_seq_start,
3904 .next = ptype_seq_next,
3905 .stop = ptype_seq_stop,
3906 .show = ptype_seq_show,
3907};
3908
3909static int ptype_seq_open(struct inode *inode, struct file *file)
3910{
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07003911 return seq_open_net(inode, file, &ptype_seq_ops,
3912 sizeof(struct seq_net_private));
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003913}
3914
3915static const struct file_operations ptype_seq_fops = {
3916 .owner = THIS_MODULE,
3917 .open = ptype_seq_open,
3918 .read = seq_read,
3919 .llseek = seq_lseek,
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07003920 .release = seq_release_net,
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003921};
3922
3923
Pavel Emelyanov46650792007-10-08 20:38:39 -07003924static int __net_init dev_proc_net_init(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003925{
3926 int rc = -ENOMEM;
3927
Eric W. Biederman881d9662007-09-17 11:56:21 -07003928 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003929 goto out;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003930 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003931 goto out_dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003932 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003933 goto out_softnet;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003934
Eric W. Biederman881d9662007-09-17 11:56:21 -07003935 if (wext_proc_init(net))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003936 goto out_ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003937 rc = 0;
3938out:
3939 return rc;
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003940out_ptype:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003941 proc_net_remove(net, "ptype");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003942out_softnet:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003943 proc_net_remove(net, "softnet_stat");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003944out_dev:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003945 proc_net_remove(net, "dev");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003946 goto out;
3947}
Eric W. Biederman881d9662007-09-17 11:56:21 -07003948
Pavel Emelyanov46650792007-10-08 20:38:39 -07003949static void __net_exit dev_proc_net_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07003950{
3951 wext_proc_exit(net);
3952
3953 proc_net_remove(net, "ptype");
3954 proc_net_remove(net, "softnet_stat");
3955 proc_net_remove(net, "dev");
3956}
3957
Denis V. Lunev022cbae2007-11-13 03:23:50 -08003958static struct pernet_operations __net_initdata dev_proc_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07003959 .init = dev_proc_net_init,
3960 .exit = dev_proc_net_exit,
3961};
3962
3963static int __init dev_proc_init(void)
3964{
3965 return register_pernet_subsys(&dev_proc_ops);
3966}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003967#else
3968#define dev_proc_init() 0
3969#endif /* CONFIG_PROC_FS */
3970
3971
3972/**
3973 * netdev_set_master - set up master/slave pair
3974 * @slave: slave device
3975 * @master: new master device
3976 *
3977 * Changes the master device of the slave. Pass %NULL to break the
3978 * bonding. The caller must hold the RTNL semaphore. On a failure
3979 * a negative errno code is returned. On success the reference counts
3980 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3981 * function returns zero.
3982 */
3983int netdev_set_master(struct net_device *slave, struct net_device *master)
3984{
3985 struct net_device *old = slave->master;
3986
3987 ASSERT_RTNL();
3988
3989 if (master) {
3990 if (old)
3991 return -EBUSY;
3992 dev_hold(master);
3993 }
3994
3995 slave->master = master;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003996
Eric Dumazet283f2fe2010-03-18 13:37:40 +00003997 if (old) {
3998 synchronize_net();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003999 dev_put(old);
Eric Dumazet283f2fe2010-03-18 13:37:40 +00004000 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004001 if (master)
4002 slave->flags |= IFF_SLAVE;
4003 else
4004 slave->flags &= ~IFF_SLAVE;
4005
4006 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4007 return 0;
4008}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004009EXPORT_SYMBOL(netdev_set_master);
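
/*
 * Example (editor's sketch, not part of the original file): the
 * bonding driver's enslave/release paths are the classic callers.
 * All mydrv_* names are hypothetical.
 */
#if 0
static int mydrv_enslave(struct net_device *bond_dev,
			 struct net_device *slave_dev)
{
	ASSERT_RTNL();		/* netdev_set_master() requires RTNL */
	return netdev_set_master(slave_dev, bond_dev);
}

static void mydrv_release(struct net_device *slave_dev)
{
	ASSERT_RTNL();
	netdev_set_master(slave_dev, NULL);	/* break the pairing */
}
#endif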
Linus Torvalds1da177e2005-04-16 15:20:36 -07004010
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004011static void dev_change_rx_flags(struct net_device *dev, int flags)
4012{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004013 const struct net_device_ops *ops = dev->netdev_ops;
4014
4015 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4016 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004017}
4018
Wang Chendad9b332008-06-18 01:48:28 -07004019static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07004020{
4021 unsigned short old_flags = dev->flags;
David Howells8192b0c2008-11-14 10:39:10 +11004022 uid_t uid;
4023 gid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07004024
Patrick McHardy24023452007-07-14 18:51:31 -07004025 ASSERT_RTNL();
4026
Wang Chendad9b332008-06-18 01:48:28 -07004027 dev->flags |= IFF_PROMISC;
4028 dev->promiscuity += inc;
4029 if (dev->promiscuity == 0) {
4030 /*
4031 * Avoid overflow.
4032 * If inc causes overflow, leave promisc untouched and return an error.
4033 */
4034 if (inc < 0)
4035 dev->flags &= ~IFF_PROMISC;
4036 else {
4037 dev->promiscuity -= inc;
4038 printk(KERN_WARNING "%s: promiscuity counter overflowed, "
4039 "setting promiscuity failed; the promiscuity feature "
4040 "of the device might be broken.\n", dev->name);
4041 return -EOVERFLOW;
4042 }
4043 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004044 if (dev->flags != old_flags) {
4045 printk(KERN_INFO "device %s %s promiscuous mode\n",
4046 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4047 "left");
David Howells8192b0c2008-11-14 10:39:10 +11004048 if (audit_enabled) {
4049 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004050 audit_log(current->audit_context, GFP_ATOMIC,
4051 AUDIT_ANOM_PROMISCUOUS,
4052 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4053 dev->name, (dev->flags & IFF_PROMISC),
4054 (old_flags & IFF_PROMISC),
4055 audit_get_loginuid(current),
David Howells8192b0c2008-11-14 10:39:10 +11004056 uid, gid,
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004057 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11004058 }
Patrick McHardy24023452007-07-14 18:51:31 -07004059
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004060 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07004061 }
Wang Chendad9b332008-06-18 01:48:28 -07004062 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004063}
4064
Linus Torvalds1da177e2005-04-16 15:20:36 -07004065/**
4066 * dev_set_promiscuity - update promiscuity count on a device
4067 * @dev: device
4068 * @inc: modifier
4069 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07004070 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004071 * remains above zero the interface remains promiscuous. Once it hits zero
4072 * the device reverts back to normal filtering operation. A negative inc
4073 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07004074 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004075 */
Wang Chendad9b332008-06-18 01:48:28 -07004076int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004077{
4078 unsigned short old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07004079 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004080
Wang Chendad9b332008-06-18 01:48:28 -07004081 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07004082 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07004083 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07004084 if (dev->flags != old_flags)
4085 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07004086 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004087}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004088EXPORT_SYMBOL(dev_set_promiscuity);
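
/*
 * Usage sketch (hypothetical helpers, not part of this file): a packet
 * tap balances a +1 on start with a -1 on stop, so concurrent users
 * compose through the reference count instead of clobbering
 * IFF_PROMISC directly. The caller must hold the RTNL semaphore, as
 * __dev_set_promiscuity() asserts.
 */
static int example_tap_start(struct net_device *dev)
{
	return dev_set_promiscuity(dev, 1);	/* count up while capturing */
}

static void example_tap_stop(struct net_device *dev)
{
	dev_set_promiscuity(dev, -1);	/* balance the earlier +1 */
}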
Linus Torvalds1da177e2005-04-16 15:20:36 -07004089
4090/**
4091 * dev_set_allmulti - update allmulti count on a device
4092 * @dev: device
4093 * @inc: modifier
4094 *
4095 * Add or remove reception of all multicast frames to a device. While the
4096 * count in the device remains above zero the interface remains listening
4097 * to all multicast frames. Once it hits zero the device reverts back to normal
4098 * filtering operation. A negative @inc value is used to drop the counter
4099 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07004100 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004101 */
4102
Wang Chendad9b332008-06-18 01:48:28 -07004103int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004104{
4105 unsigned short old_flags = dev->flags;
4106
Patrick McHardy24023452007-07-14 18:51:31 -07004107 ASSERT_RTNL();
4108
Linus Torvalds1da177e2005-04-16 15:20:36 -07004109 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07004110 dev->allmulti += inc;
4111 if (dev->allmulti == 0) {
4112 /*
4113 * Avoid overflow.
4114 * If inc causes overflow, leave allmulti untouched and return an error.
4115 */
4116 if (inc < 0)
4117 dev->flags &= ~IFF_ALLMULTI;
4118 else {
4119 dev->allmulti -= inc;
4120 printk(KERN_WARNING "%s: allmulti counter overflowed, "
4121 "setting allmulti failed; the allmulti feature of the "
4122 "device might be broken.\n", dev->name);
4123 return -EOVERFLOW;
4124 }
4125 }
Patrick McHardy24023452007-07-14 18:51:31 -07004126 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004127 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07004128 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07004129 }
Wang Chendad9b332008-06-18 01:48:28 -07004130 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004131}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004132EXPORT_SYMBOL(dev_set_allmulti);
Patrick McHardy4417da62007-06-27 01:28:10 -07004133
4134/*
4135 * Upload unicast and multicast address lists to device and
4136 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08004137 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07004138 * are present.
4139 */
4140void __dev_set_rx_mode(struct net_device *dev)
4141{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004142 const struct net_device_ops *ops = dev->netdev_ops;
4143
Patrick McHardy4417da62007-06-27 01:28:10 -07004144 /* dev_open will call this function so the list will stay sane. */
4145 if (!(dev->flags & IFF_UP))
4146 return;
4147
4148 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09004149 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07004150
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004151 if (ops->ndo_set_rx_mode)
4152 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004153 else {
4154 /* Unicast address changes may only happen under the rtnl,
4155 * therefore calling __dev_set_promiscuity here is safe.
4156 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004157 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004158 __dev_set_promiscuity(dev, 1);
4159 dev->uc_promisc = 1;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004160 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004161 __dev_set_promiscuity(dev, -1);
4162 dev->uc_promisc = 0;
4163 }
4164
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004165 if (ops->ndo_set_multicast_list)
4166 ops->ndo_set_multicast_list(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004167 }
4168}
4169
4170void dev_set_rx_mode(struct net_device *dev)
4171{
David S. Millerb9e40852008-07-15 00:15:08 -07004172 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004173 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07004174 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004175}
4176
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004177/**
4178 * dev_get_flags - get flags reported to userspace
4179 * @dev: device
4180 *
4181 * Get the combination of flag bits exported through APIs to userspace.
4182 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004183unsigned dev_get_flags(const struct net_device *dev)
4184{
4185 unsigned flags;
4186
4187 flags = (dev->flags & ~(IFF_PROMISC |
4188 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08004189 IFF_RUNNING |
4190 IFF_LOWER_UP |
4191 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07004192 (dev->gflags & (IFF_PROMISC |
4193 IFF_ALLMULTI));
4194
Stefan Rompfb00055a2006-03-20 17:09:11 -08004195 if (netif_running(dev)) {
4196 if (netif_oper_up(dev))
4197 flags |= IFF_RUNNING;
4198 if (netif_carrier_ok(dev))
4199 flags |= IFF_LOWER_UP;
4200 if (netif_dormant(dev))
4201 flags |= IFF_DORMANT;
4202 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004203
4204 return flags;
4205}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004206EXPORT_SYMBOL(dev_get_flags);
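
/*
 * Sketch (assumed helper, not in this file): testing the synthesized
 * bits that dev_get_flags() folds in from operstate and carrier.
 */
static bool example_link_is_usable(const struct net_device *dev)
{
	unsigned flags = dev_get_flags(dev);

	/* IFF_RUNNING and IFF_LOWER_UP only exist in this folded view */
	return (flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING);
}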
Linus Torvalds1da177e2005-04-16 15:20:36 -07004207
Patrick McHardybd380812010-02-26 06:34:53 +00004208int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004209{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004210 int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004211 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004212
Patrick McHardy24023452007-07-14 18:51:31 -07004213 ASSERT_RTNL();
4214
Linus Torvalds1da177e2005-04-16 15:20:36 -07004215 /*
4216 * Set the flags on our device.
4217 */
4218
4219 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4220 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4221 IFF_AUTOMEDIA)) |
4222 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4223 IFF_ALLMULTI));
4224
4225 /*
4226 * Load in the correct multicast list now the flags have changed.
4227 */
4228
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004229 if ((old_flags ^ flags) & IFF_MULTICAST)
4230 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07004231
Patrick McHardy4417da62007-06-27 01:28:10 -07004232 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004233
4234 /*
4235 * Have we downed the interface? We handle IFF_UP ourselves
4236 * according to user attempts to set it, rather than blindly
4237 * setting it.
4238 */
4239
4240 ret = 0;
4241 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00004242 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004243
4244 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07004245 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004246 }
4247
Linus Torvalds1da177e2005-04-16 15:20:36 -07004248 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004249 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4250
Linus Torvalds1da177e2005-04-16 15:20:36 -07004251 dev->gflags ^= IFF_PROMISC;
4252 dev_set_promiscuity(dev, inc);
4253 }
4254
4255 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4256 is important. Some (broken) drivers set IFF_PROMISC when
4257 IFF_ALLMULTI is requested, without asking us and without reporting.
4258 */
4259 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004260 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4261
Linus Torvalds1da177e2005-04-16 15:20:36 -07004262 dev->gflags ^= IFF_ALLMULTI;
4263 dev_set_allmulti(dev, inc);
4264 }
4265
Patrick McHardybd380812010-02-26 06:34:53 +00004266 return ret;
4267}
4268
4269void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4270{
4271 unsigned int changes = dev->flags ^ old_flags;
4272
4273 if (changes & IFF_UP) {
4274 if (dev->flags & IFF_UP)
4275 call_netdevice_notifiers(NETDEV_UP, dev);
4276 else
4277 call_netdevice_notifiers(NETDEV_DOWN, dev);
4278 }
4279
4280 if (dev->flags & IFF_UP &&
4281 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4282 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4283}
4284
4285/**
4286 * dev_change_flags - change device settings
4287 * @dev: device
4288 * @flags: device state flags
4289 *
4290 * Change settings on device based state flags. The flags are
4291 * in the userspace exported format.
4292 */
4293int dev_change_flags(struct net_device *dev, unsigned flags)
4294{
4295 int ret, changes;
4296 int old_flags = dev->flags;
4297
4298 ret = __dev_change_flags(dev, flags);
4299 if (ret < 0)
4300 return ret;
4301
4302 changes = old_flags ^ dev->flags;
Thomas Graf7c355f52007-06-05 16:03:03 -07004303 if (changes)
4304 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004305
Patrick McHardybd380812010-02-26 06:34:53 +00004306 __dev_notify_flags(dev, old_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004307 return ret;
4308}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004309EXPORT_SYMBOL(dev_change_flags);
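
/*
 * Sketch (assumed caller, not in this file): administratively toggling
 * an interface the way the SIOCSIFFLAGS handler later in this file
 * does. The RTNL semaphore must be held, as __dev_change_flags()
 * asserts.
 */
static int example_set_admin_state(struct net_device *dev, bool up)
{
	unsigned int flags = dev->flags;

	if (up)
		flags |= IFF_UP;
	else
		flags &= ~IFF_UP;
	return dev_change_flags(dev, flags);
}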
Linus Torvalds1da177e2005-04-16 15:20:36 -07004310
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004311/**
4312 * dev_set_mtu - Change maximum transfer unit
4313 * @dev: device
4314 * @new_mtu: new transfer unit
4315 *
4316 * Change the maximum transfer size of the network device.
4317 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004318int dev_set_mtu(struct net_device *dev, int new_mtu)
4319{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004320 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004321 int err;
4322
4323 if (new_mtu == dev->mtu)
4324 return 0;
4325
4326 /* MTU must be positive. */
4327 if (new_mtu < 0)
4328 return -EINVAL;
4329
4330 if (!netif_device_present(dev))
4331 return -ENODEV;
4332
4333 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004334 if (ops->ndo_change_mtu)
4335 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004336 else
4337 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004338
Linus Torvalds1da177e2005-04-16 15:20:36 -07004339 if (!err && dev->flags & IFF_UP)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004340 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004341 return err;
4342}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004343EXPORT_SYMBOL(dev_set_mtu);
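
/*
 * Sketch (hypothetical tunnel-style caller; the 20-byte overhead is an
 * assumption): propagating a lower device's MTU minus encapsulation
 * cost. On success dev_set_mtu() already fires NETDEV_CHANGEMTU for an
 * UP device, so the caller does not resend it.
 */
#define EXAMPLE_TUNNEL_OVERHEAD	20

static int example_sync_mtu(struct net_device *upper,
			    const struct net_device *lower)
{
	return dev_set_mtu(upper, lower->mtu - EXAMPLE_TUNNEL_OVERHEAD);
}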
Linus Torvalds1da177e2005-04-16 15:20:36 -07004344
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004345/**
4346 * dev_set_mac_address - Change Media Access Control Address
4347 * @dev: device
4348 * @sa: new address
4349 *
4350 * Change the hardware (MAC) address of the device
4351 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004352int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4353{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004354 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004355 int err;
4356
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004357 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004358 return -EOPNOTSUPP;
4359 if (sa->sa_family != dev->type)
4360 return -EINVAL;
4361 if (!netif_device_present(dev))
4362 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004363 err = ops->ndo_set_mac_address(dev, sa);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004364 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004365 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004366 return err;
4367}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004368EXPORT_SYMBOL(dev_set_mac_address);
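
/*
 * Sketch (assumed Ethernet device, helper not in this file): building
 * the sockaddr that dev_set_mac_address() expects. sa_family must
 * match dev->type or the call fails with -EINVAL.
 */
static int example_set_mac(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;

	sa.sa_family = dev->type;		/* e.g. ARPHRD_ETHER */
	memcpy(sa.sa_data, addr, ETH_ALEN);	/* ETH_ALEN <= sizeof(sa.sa_data) */
	return dev_set_mac_address(dev, &sa);
}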
Linus Torvalds1da177e2005-04-16 15:20:36 -07004369
4370/*
Eric Dumazet3710bec2009-11-01 19:42:09 +00004371 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07004372 */
Jeff Garzik14e3e072007-10-08 00:06:32 -07004373static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004374{
4375 int err;
Eric Dumazet3710bec2009-11-01 19:42:09 +00004376 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004377
4378 if (!dev)
4379 return -ENODEV;
4380
4381 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004382 case SIOCGIFFLAGS: /* Get interface flags */
4383 ifr->ifr_flags = (short) dev_get_flags(dev);
4384 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004385
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004386 case SIOCGIFMETRIC: /* Get the metric on the interface
4387 (currently unused) */
4388 ifr->ifr_metric = 0;
4389 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004390
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004391 case SIOCGIFMTU: /* Get the MTU of a device */
4392 ifr->ifr_mtu = dev->mtu;
4393 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004394
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004395 case SIOCGIFHWADDR:
4396 if (!dev->addr_len)
4397 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4398 else
4399 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4400 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4401 ifr->ifr_hwaddr.sa_family = dev->type;
4402 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004403
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004404 case SIOCGIFSLAVE:
4405 err = -EINVAL;
4406 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004407
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004408 case SIOCGIFMAP:
4409 ifr->ifr_map.mem_start = dev->mem_start;
4410 ifr->ifr_map.mem_end = dev->mem_end;
4411 ifr->ifr_map.base_addr = dev->base_addr;
4412 ifr->ifr_map.irq = dev->irq;
4413 ifr->ifr_map.dma = dev->dma;
4414 ifr->ifr_map.port = dev->if_port;
4415 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004416
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004417 case SIOCGIFINDEX:
4418 ifr->ifr_ifindex = dev->ifindex;
4419 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004420
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004421 case SIOCGIFTXQLEN:
4422 ifr->ifr_qlen = dev->tx_queue_len;
4423 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004424
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004425 default:
4426 /* dev_ioctl() should ensure this case
4427 * is never reached
4428 */
4429 WARN_ON(1);
4430 err = -EINVAL;
4431 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004432
4433 }
4434 return err;
4435}
4436
4437/*
4438 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4439 */
4440static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4441{
4442 int err;
4443 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004444 const struct net_device_ops *ops;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004445
4446 if (!dev)
4447 return -ENODEV;
4448
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004449 ops = dev->netdev_ops;
4450
Jeff Garzik14e3e072007-10-08 00:06:32 -07004451 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004452 case SIOCSIFFLAGS: /* Set interface flags */
4453 return dev_change_flags(dev, ifr->ifr_flags);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004454
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004455 case SIOCSIFMETRIC: /* Set the metric on the interface
4456 (currently unused) */
4457 return -EOPNOTSUPP;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004458
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004459 case SIOCSIFMTU: /* Set the MTU of a device */
4460 return dev_set_mtu(dev, ifr->ifr_mtu);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004461
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004462 case SIOCSIFHWADDR:
4463 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004464
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004465 case SIOCSIFHWBROADCAST:
4466 if (ifr->ifr_hwaddr.sa_family != dev->type)
4467 return -EINVAL;
4468 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4469 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4470 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4471 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004472
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004473 case SIOCSIFMAP:
4474 if (ops->ndo_set_config) {
4475 if (!netif_device_present(dev))
4476 return -ENODEV;
4477 return ops->ndo_set_config(dev, &ifr->ifr_map);
4478 }
4479 return -EOPNOTSUPP;
4480
4481 case SIOCADDMULTI:
4482 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4483 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4484 return -EINVAL;
4485 if (!netif_device_present(dev))
4486 return -ENODEV;
Jiri Pirko22bedad32010-04-01 21:22:57 +00004487 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004488
4489 case SIOCDELMULTI:
4490 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4491 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4492 return -EINVAL;
4493 if (!netif_device_present(dev))
4494 return -ENODEV;
Jiri Pirko22bedad32010-04-01 21:22:57 +00004495 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004496
4497 case SIOCSIFTXQLEN:
4498 if (ifr->ifr_qlen < 0)
4499 return -EINVAL;
4500 dev->tx_queue_len = ifr->ifr_qlen;
4501 return 0;
4502
4503 case SIOCSIFNAME:
4504 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4505 return dev_change_name(dev, ifr->ifr_newname);
4506
4507 /*
4508 * Unknown or private ioctl
4509 */
4510 default:
4511 if ((cmd >= SIOCDEVPRIVATE &&
4512 cmd <= SIOCDEVPRIVATE + 15) ||
4513 cmd == SIOCBONDENSLAVE ||
4514 cmd == SIOCBONDRELEASE ||
4515 cmd == SIOCBONDSETHWADDR ||
4516 cmd == SIOCBONDSLAVEINFOQUERY ||
4517 cmd == SIOCBONDINFOQUERY ||
4518 cmd == SIOCBONDCHANGEACTIVE ||
4519 cmd == SIOCGMIIPHY ||
4520 cmd == SIOCGMIIREG ||
4521 cmd == SIOCSMIIREG ||
4522 cmd == SIOCBRADDIF ||
4523 cmd == SIOCBRDELIF ||
4524 cmd == SIOCSHWTSTAMP ||
4525 cmd == SIOCWANDEV) {
4526 err = -EOPNOTSUPP;
4527 if (ops->ndo_do_ioctl) {
4528 if (netif_device_present(dev))
4529 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4530 else
4531 err = -ENODEV;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004532 }
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004533 } else
4534 err = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004535
4536 }
4537 return err;
4538}
4539
4540/*
4541 * This function handles all "interface"-type I/O control requests. The actual
4542 * 'doing' part of this is dev_ifsioc above.
4543 */
4544
4545/**
4546 * dev_ioctl - network device ioctl
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004547 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004548 * @cmd: command to issue
4549 * @arg: pointer to a struct ifreq in user space
4550 *
4551 * Issue ioctl functions to devices. This is normally called by the
4552 * user space syscall interfaces but can sometimes be useful for
4553 * other purposes. The return value is the return from the syscall if
4554 * positive or a negative errno code on error.
4555 */
4556
Eric W. Biederman881d9662007-09-17 11:56:21 -07004557int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004558{
4559 struct ifreq ifr;
4560 int ret;
4561 char *colon;
4562
4563 /* One special case: SIOCGIFCONF takes an ifconf argument
4564 and requires a shared lock, because it sleeps while writing
4565 to user space.
4566 */
4567
4568 if (cmd == SIOCGIFCONF) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004569 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07004570 ret = dev_ifconf(net, (char __user *) arg);
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004571 rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004572 return ret;
4573 }
4574 if (cmd == SIOCGIFNAME)
Eric W. Biederman881d9662007-09-17 11:56:21 -07004575 return dev_ifname(net, (struct ifreq __user *)arg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004576
4577 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4578 return -EFAULT;
4579
4580 ifr.ifr_name[IFNAMSIZ-1] = 0;
4581
4582 colon = strchr(ifr.ifr_name, ':');
4583 if (colon)
4584 *colon = 0;
4585
4586 /*
4587 * See which interface the caller is talking about.
4588 */
4589
4590 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004591 /*
4592 * These ioctl calls:
4593 * - can be done by all.
4594 * - atomic and do not require locking.
4595 * - return a value
4596 */
4597 case SIOCGIFFLAGS:
4598 case SIOCGIFMETRIC:
4599 case SIOCGIFMTU:
4600 case SIOCGIFHWADDR:
4601 case SIOCGIFSLAVE:
4602 case SIOCGIFMAP:
4603 case SIOCGIFINDEX:
4604 case SIOCGIFTXQLEN:
4605 dev_load(net, ifr.ifr_name);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004606 rcu_read_lock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004607 ret = dev_ifsioc_locked(net, &ifr, cmd);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004608 rcu_read_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004609 if (!ret) {
4610 if (colon)
4611 *colon = ':';
4612 if (copy_to_user(arg, &ifr,
4613 sizeof(struct ifreq)))
4614 ret = -EFAULT;
4615 }
4616 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004617
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004618 case SIOCETHTOOL:
4619 dev_load(net, ifr.ifr_name);
4620 rtnl_lock();
4621 ret = dev_ethtool(net, &ifr);
4622 rtnl_unlock();
4623 if (!ret) {
4624 if (colon)
4625 *colon = ':';
4626 if (copy_to_user(arg, &ifr,
4627 sizeof(struct ifreq)))
4628 ret = -EFAULT;
4629 }
4630 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004631
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004632 /*
4633 * These ioctl calls:
4634 * - require superuser power.
4635 * - require strict serialization.
4636 * - return a value
4637 */
4638 case SIOCGMIIPHY:
4639 case SIOCGMIIREG:
4640 case SIOCSIFNAME:
4641 if (!capable(CAP_NET_ADMIN))
4642 return -EPERM;
4643 dev_load(net, ifr.ifr_name);
4644 rtnl_lock();
4645 ret = dev_ifsioc(net, &ifr, cmd);
4646 rtnl_unlock();
4647 if (!ret) {
4648 if (colon)
4649 *colon = ':';
4650 if (copy_to_user(arg, &ifr,
4651 sizeof(struct ifreq)))
4652 ret = -EFAULT;
4653 }
4654 return ret;
4655
4656 /*
4657 * These ioctl calls:
4658 * - require superuser power.
4659 * - require strict serialization.
4660 * - do not return a value
4661 */
4662 case SIOCSIFFLAGS:
4663 case SIOCSIFMETRIC:
4664 case SIOCSIFMTU:
4665 case SIOCSIFMAP:
4666 case SIOCSIFHWADDR:
4667 case SIOCSIFSLAVE:
4668 case SIOCADDMULTI:
4669 case SIOCDELMULTI:
4670 case SIOCSIFHWBROADCAST:
4671 case SIOCSIFTXQLEN:
4672 case SIOCSMIIREG:
4673 case SIOCBONDENSLAVE:
4674 case SIOCBONDRELEASE:
4675 case SIOCBONDSETHWADDR:
4676 case SIOCBONDCHANGEACTIVE:
4677 case SIOCBRADDIF:
4678 case SIOCBRDELIF:
4679 case SIOCSHWTSTAMP:
4680 if (!capable(CAP_NET_ADMIN))
4681 return -EPERM;
4682 /* fall through */
4683 case SIOCBONDSLAVEINFOQUERY:
4684 case SIOCBONDINFOQUERY:
4685 dev_load(net, ifr.ifr_name);
4686 rtnl_lock();
4687 ret = dev_ifsioc(net, &ifr, cmd);
4688 rtnl_unlock();
4689 return ret;
4690
4691 case SIOCGIFMEM:
4692 /* Get the per device memory space. We can add this but
4693 * currently do not support it */
4694 case SIOCSIFMEM:
4695 /* Set the per device memory buffer space.
4696 * Not applicable in our case */
4697 case SIOCSIFLINK:
4698 return -EINVAL;
4699
4700 /*
4701 * Unknown or private ioctl.
4702 */
4703 default:
4704 if (cmd == SIOCWANDEV ||
4705 (cmd >= SIOCDEVPRIVATE &&
4706 cmd <= SIOCDEVPRIVATE + 15)) {
Eric W. Biederman881d9662007-09-17 11:56:21 -07004707 dev_load(net, ifr.ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004708 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07004709 ret = dev_ifsioc(net, &ifr, cmd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004710 rtnl_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004711 if (!ret && copy_to_user(arg, &ifr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07004712 sizeof(struct ifreq)))
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004713 ret = -EFAULT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004714 return ret;
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004715 }
4716 /* Take care of Wireless Extensions */
4717 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4718 return wext_handle_ioctl(net, &ifr, cmd, arg);
4719 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004720 }
4721}
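
/*
 * Userspace view (standalone sketch, compiled separately; the
 * interface name "eth0" is an assumption): the syscall path that lands
 * in dev_ioctl() above, here reading the MTU via SIOCGIFMTU.
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct ifreq ifr;
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		if (fd < 0)
 *			return 1;
 *		memset(&ifr, 0, sizeof(ifr));
 *		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *		if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *			printf("mtu %d\n", ifr.ifr_mtu);
 *		close(fd);
 *		return 0;
 *	}
 */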
4722
4723
4724/**
4725 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004726 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004727 *
4728 * Returns a suitable unique value for a new device interface
4729 * number. The caller must hold the rtnl semaphore or the
4730 * dev_base_lock to be sure it remains unique.
4731 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07004732static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004733{
4734 static int ifindex;
4735 for (;;) {
4736 if (++ifindex <= 0)
4737 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004738 if (!__dev_get_by_index(net, ifindex))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004739 return ifindex;
4740 }
4741}
4742
Linus Torvalds1da177e2005-04-16 15:20:36 -07004743/* Delayed registration/unregistration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08004744static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004745
Stephen Hemminger6f05f622007-03-08 20:46:03 -08004746static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004747{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004748 list_add_tail(&dev->todo_list, &net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004749}
4750
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004751static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004752{
Krishna Kumare93737b2009-12-08 22:26:02 +00004753 struct net_device *dev, *tmp;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004754
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004755 BUG_ON(dev_boot_phase);
4756 ASSERT_RTNL();
4757
Krishna Kumare93737b2009-12-08 22:26:02 +00004758 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004759 /* Some callers unregister devices that were never
Krishna Kumare93737b2009-12-08 22:26:02 +00004760 * registered, to unwind a failed initialization.
4761 * Remove those devices and proceed with the remaining.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004762 */
4763 if (dev->reg_state == NETREG_UNINITIALIZED) {
4764 pr_debug("unregister_netdevice: device %s/%p never "
4765 "was registered\n", dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004766
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004767 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00004768 list_del(&dev->unreg_list);
4769 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004770 }
4771
4772 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4773
4774 /* If device is running, close it first. */
4775 dev_close(dev);
4776
4777 /* And unlink it from device chain. */
4778 unlist_netdevice(dev);
4779
4780 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004781 }
4782
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004783 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004784
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004785 list_for_each_entry(dev, head, unreg_list) {
4786 /* Shutdown queueing discipline. */
4787 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004788
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004789
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004790 /* Notify protocols that we are about to destroy
4791 this device. They should clean up all of their state.
4792 */
4793 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4794
Patrick McHardya2835762010-02-26 06:34:51 +00004795 if (!dev->rtnl_link_ops ||
4796 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4797 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4798
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004799 /*
4800 * Flush the unicast and multicast chains
4801 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00004802 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00004803 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004804
4805 if (dev->netdev_ops->ndo_uninit)
4806 dev->netdev_ops->ndo_uninit(dev);
4807
4808 /* Notifier chain MUST detach us from master device. */
4809 WARN_ON(dev->master);
4810
4811 /* Remove entries from kobject tree */
4812 netdev_unregister_kobject(dev);
4813 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004814
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004815 /* Process any work delayed until the end of the batch */
stephen hemmingere5e26d72010-02-24 14:01:38 +00004816 dev = list_first_entry(head, struct net_device, unreg_list);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004817 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4818
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004819 synchronize_net();
4820
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004821 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004822 dev_put(dev);
4823}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004824
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004825static void rollback_registered(struct net_device *dev)
4826{
4827 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004828
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004829 list_add(&dev->unreg_list, &single);
4830 rollback_registered_many(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004831}
4832
David S. Millere8a04642008-07-17 00:34:19 -07004833static void __netdev_init_queue_locks_one(struct net_device *dev,
4834 struct netdev_queue *dev_queue,
4835 void *_unused)
David S. Millerc773e842008-07-08 23:13:53 -07004836{
4837 spin_lock_init(&dev_queue->_xmit_lock);
David S. Millercf508b12008-07-22 14:16:42 -07004838 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
David S. Millerc773e842008-07-08 23:13:53 -07004839 dev_queue->xmit_lock_owner = -1;
4840}
4841
4842static void netdev_init_queue_locks(struct net_device *dev)
4843{
David S. Millere8a04642008-07-17 00:34:19 -07004844 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4845 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
David S. Millerc773e842008-07-08 23:13:53 -07004846}
4847
Herbert Xub63365a2008-10-23 01:11:29 -07004848unsigned long netdev_fix_features(unsigned long features, const char *name)
4849{
4850 /* Fix illegal SG+CSUM combinations. */
4851 if ((features & NETIF_F_SG) &&
4852 !(features & NETIF_F_ALL_CSUM)) {
4853 if (name)
4854 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4855 "checksum feature.\n", name);
4856 features &= ~NETIF_F_SG;
4857 }
4858
4859 /* TSO requires that SG is present as well. */
4860 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4861 if (name)
4862 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4863 "SG feature.\n", name);
4864 features &= ~NETIF_F_TSO;
4865 }
4866
4867 if (features & NETIF_F_UFO) {
4868 if (!(features & NETIF_F_GEN_CSUM)) {
4869 if (name)
4870 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4871 "since no NETIF_F_HW_CSUM feature.\n",
4872 name);
4873 features &= ~NETIF_F_UFO;
4874 }
4875
4876 if (!(features & NETIF_F_SG)) {
4877 if (name)
4878 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4879 "since no NETIF_F_SG feature.\n", name);
4880 features &= ~NETIF_F_UFO;
4881 }
4882 }
4883
4884 return features;
4885}
4886EXPORT_SYMBOL(netdev_fix_features);
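
/*
 * Sketch (hypothetical driver helper): letting the core strip illegal
 * feature combinations before advertising them, instead of each driver
 * re-implementing the SG/CSUM/TSO/UFO dependency rules above.
 */
static void example_apply_features(struct net_device *dev,
				   unsigned long wanted)
{
	dev->features = netdev_fix_features(wanted, dev->name);
}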
4887
Linus Torvalds1da177e2005-04-16 15:20:36 -07004888/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08004889 * netif_stacked_transfer_operstate - transfer operstate
4890 * @rootdev: the root or lower level device to transfer state from
4891 * @dev: the device to transfer operstate to
4892 *
4893 * Transfer operational state from root to device. This is normally
4894 * called when a stacking relationship exists between the root
4895 * device and the device (a leaf device).
4896 */
4897void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4898 struct net_device *dev)
4899{
4900 if (rootdev->operstate == IF_OPER_DORMANT)
4901 netif_dormant_on(dev);
4902 else
4903 netif_dormant_off(dev);
4904
4905 if (netif_carrier_ok(rootdev)) {
4906 if (!netif_carrier_ok(dev))
4907 netif_carrier_on(dev);
4908 } else {
4909 if (netif_carrier_ok(dev))
4910 netif_carrier_off(dev);
4911 }
4912}
4913EXPORT_SYMBOL(netif_stacked_transfer_operstate);
4914
4915/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07004916 * register_netdevice - register a network device
4917 * @dev: device to register
4918 *
4919 * Take a completed network device structure and add it to the kernel
4920 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4921 * chain. 0 is returned on success. A negative errno code is returned
4922 * on a failure to set up the device, or if the name is a duplicate.
4923 *
4924 * Callers must hold the rtnl semaphore. You may want
4925 * register_netdev() instead of this.
4926 *
4927 * BUGS:
4928 * The locking appears insufficient to guarantee two parallel registers
4929 * will not get the same name.
4930 */
4931
4932int register_netdevice(struct net_device *dev)
4933{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004934 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004935 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004936
4937 BUG_ON(dev_boot_phase);
4938 ASSERT_RTNL();
4939
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07004940 might_sleep();
4941
Linus Torvalds1da177e2005-04-16 15:20:36 -07004942 /* When net_device's are persistent, this will be fatal. */
4943 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004944 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004945
David S. Millerf1f28aa2008-07-15 00:08:33 -07004946 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07004947 netdev_set_addr_lockdep_class(dev);
David S. Millerc773e842008-07-08 23:13:53 -07004948 netdev_init_queue_locks(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004949
Linus Torvalds1da177e2005-04-16 15:20:36 -07004950 dev->iflink = -1;
4951
Eric Dumazetdf334542010-03-24 19:13:54 +00004952#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00004953 if (!dev->num_rx_queues) {
4954 /*
4955 * Allocate a single RX queue if the driver never called
4956 * alloc_netdev_mq
4957 */
4958
4959 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4960 if (!dev->_rx) {
4961 ret = -ENOMEM;
4962 goto out;
4963 }
4964
4965 dev->_rx->first = dev->_rx;
4966 atomic_set(&dev->_rx->count, 1);
4967 dev->num_rx_queues = 1;
4968 }
Eric Dumazetdf334542010-03-24 19:13:54 +00004969#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004970 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004971 if (dev->netdev_ops->ndo_init) {
4972 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004973 if (ret) {
4974 if (ret > 0)
4975 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08004976 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004977 }
4978 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004979
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00004980 ret = dev_get_valid_name(dev, dev->name, 0);
Octavian Purdilad9031022009-11-18 02:36:59 +00004981 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07004982 goto err_uninit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004983
Eric W. Biederman881d9662007-09-17 11:56:21 -07004984 dev->ifindex = dev_new_index(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004985 if (dev->iflink == -1)
4986 dev->iflink = dev->ifindex;
4987
Stephen Hemmingerd212f872007-06-27 00:47:37 -07004988 /* Fix illegal checksum combinations */
4989 if ((dev->features & NETIF_F_HW_CSUM) &&
4990 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4991 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4992 dev->name);
4993 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4994 }
4995
4996 if ((dev->features & NETIF_F_NO_CSUM) &&
4997 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4998 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4999 dev->name);
5000 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5001 }
5002
Herbert Xub63365a2008-10-23 01:11:29 -07005003 dev->features = netdev_fix_features(dev->features, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005004
Lennert Buytenheke5a4a722008-08-03 01:23:10 -07005005 /* Enable software GSO if SG is supported. */
5006 if (dev->features & NETIF_F_SG)
5007 dev->features |= NETIF_F_GSO;
5008
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005009 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5010 ret = notifier_to_errno(ret);
5011 if (ret)
5012 goto err_uninit;
5013
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005014 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005015 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005016 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005017 dev->reg_state = NETREG_REGISTERED;
5018
Linus Torvalds1da177e2005-04-16 15:20:36 -07005019 /*
5020 * Default initial state at registry is that the
5021 * device is present.
5022 */
5023
5024 set_bit(__LINK_STATE_PRESENT, &dev->state);
5025
Linus Torvalds1da177e2005-04-16 15:20:36 -07005026 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005027 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005028 list_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005029
5030 /* Notify protocols, that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005031 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005032 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005033 if (ret) {
5034 rollback_registered(dev);
5035 dev->reg_state = NETREG_UNREGISTERED;
5036 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005037 /*
5038 * Prevent userspace races by waiting until the network
5039 * device is fully setup before sending notifications.
5040 */
Patrick McHardya2835762010-02-26 06:34:51 +00005041 if (!dev->rtnl_link_ops ||
5042 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5043 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005044
5045out:
5046 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005047
5048err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005049 if (dev->netdev_ops->ndo_uninit)
5050 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005051 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005052}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005053EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005054
5055/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005056 * init_dummy_netdev - init a dummy network device for NAPI
5057 * @dev: device to init
5058 *
5059 * This takes a network device structure and initializes the minimum
5060 * number of fields so it can be used to schedule NAPI polls without
5061 * registering a full blown interface. This is to be used by drivers
5062 * that need to tie several hardware interfaces to a single NAPI
5063 * poll scheduler due to HW limitations.
5064 */
5065int init_dummy_netdev(struct net_device *dev)
5066{
5067 /* Clear everything. Note we don't initialize spinlocks
5068 * as they aren't supposed to be taken by any of the
5069 * NAPI code and this dummy netdev is supposed to be
5070 * only ever used for NAPI polls
5071 */
5072 memset(dev, 0, sizeof(struct net_device));
5073
5074 /* make sure we BUG if trying to hit standard
5075 * register/unregister code path
5076 */
5077 dev->reg_state = NETREG_DUMMY;
5078
5079 /* initialize the ref count */
5080 atomic_set(&dev->refcnt, 1);
5081
5082 /* NAPI wants this */
5083 INIT_LIST_HEAD(&dev->napi_list);
5084
5085 /* a dummy interface is started by default */
5086 set_bit(__LINK_STATE_PRESENT, &dev->state);
5087 set_bit(__LINK_STATE_START, &dev->state);
5088
5089 return 0;
5090}
5091EXPORT_SYMBOL_GPL(init_dummy_netdev);
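
/*
 * Usage sketch (hypothetical driver, assumed NAPI weight of 64): one
 * piece of hardware with several interrupt vectors hangs each NAPI
 * context off a dummy netdev instead of registering extra interfaces.
 */
static struct net_device example_dummy_dev;
static struct napi_struct example_napi;

static int example_poll(struct napi_struct *napi, int budget)
{
	/* a real driver would clean its RX ring here */
	napi_complete(napi);
	return 0;
}

static void example_setup_dummy_napi(void)
{
	init_dummy_netdev(&example_dummy_dev);
	netif_napi_add(&example_dummy_dev, &example_napi, example_poll, 64);
	napi_enable(&example_napi);
}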
5092
5093
5094/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005095 * register_netdev - register a network device
5096 * @dev: device to register
5097 *
5098 * Take a completed network device structure and add it to the kernel
5099 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5100 * chain. 0 is returned on success. A negative errno code is returned
5101 * on a failure to set up the device, or if the name is a duplicate.
5102 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005103 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005104 * and expands the device name if you passed a format string to
5105 * alloc_netdev.
5106 */
5107int register_netdev(struct net_device *dev)
5108{
5109 int err;
5110
5111 rtnl_lock();
5112
5113 /*
5114 * If the name is a format string the caller wants us to do a
5115 * name allocation.
5116 */
5117 if (strchr(dev->name, '%')) {
5118 err = dev_alloc_name(dev, dev->name);
5119 if (err < 0)
5120 goto out;
5121 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005122
Linus Torvalds1da177e2005-04-16 15:20:36 -07005123 err = register_netdevice(dev);
5124out:
5125 rtnl_unlock();
5126 return err;
5127}
5128EXPORT_SYMBOL(register_netdev);
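
/*
 * Lifecycle sketch (hypothetical module; ether_setup() from
 * <linux/etherdevice.h> is assumed available): allocate, register with
 * a "%d" format name, and tear down symmetrically on module exit.
 */
static struct net_device *example_netdev;

static int __init example_module_init(void)
{
	int err;

	example_netdev = alloc_netdev_mq(0, "example%d", ether_setup, 1);
	if (!example_netdev)
		return -ENOMEM;
	err = register_netdev(example_netdev);	/* expands "%d" under rtnl */
	if (err)
		free_netdev(example_netdev);
	return err;
}

static void __exit example_module_exit(void)
{
	unregister_netdev(example_netdev);	/* defined later in this file */
	free_netdev(example_netdev);
}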
5129
5130/*
5131 * netdev_wait_allrefs - wait until all references are gone.
5132 *
5133 * This is called when unregistering network devices.
5134 *
5135 * Any protocol or device that holds a reference should register
5136 * for netdevice notification, and clean up and put back the
5137 * reference if they receive an UNREGISTER event.
5138 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005139 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005140 */
5141static void netdev_wait_allrefs(struct net_device *dev)
5142{
5143 unsigned long rebroadcast_time, warning_time;
5144
Eric Dumazete014deb2009-11-17 05:59:21 +00005145 linkwatch_forget_dev(dev);
5146
Linus Torvalds1da177e2005-04-16 15:20:36 -07005147 rebroadcast_time = warning_time = jiffies;
5148 while (atomic_read(&dev->refcnt) != 0) {
5149 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005150 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005151
5152 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005153 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005154 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
Octavian Purdila395264d2009-11-16 13:49:35 +00005155 * should have already handled it the first time */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005156
5157 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5158 &dev->state)) {
5159 /* We must not have linkwatch events
5160 * pending on unregister. If this
5161 * happens, we simply run the queue
5162 * unscheduled, resulting in a noop
5163 * for this device.
5164 */
5165 linkwatch_run_queue();
5166 }
5167
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005168 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005169
5170 rebroadcast_time = jiffies;
5171 }
5172
5173 msleep(250);
5174
5175 if (time_after(jiffies, warning_time + 10 * HZ)) {
5176 printk(KERN_EMERG "unregister_netdevice: "
5177 "waiting for %s to become free. Usage "
5178 "count = %d\n",
5179 dev->name, atomic_read(&dev->refcnt));
5180 warning_time = jiffies;
5181 }
5182 }
5183}
5184
5185/* The sequence is:
5186 *
5187 * rtnl_lock();
5188 * ...
5189 * register_netdevice(x1);
5190 * register_netdevice(x2);
5191 * ...
5192 * unregister_netdevice(y1);
5193 * unregister_netdevice(y2);
5194 * ...
5195 * rtnl_unlock();
5196 * free_netdev(y1);
5197 * free_netdev(y2);
5198 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07005199 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07005200 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005201 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07005202 * without deadlocking with linkwatch via keventd.
5203 * 2) Since we run with the RTNL semaphore not held, we can sleep
5204 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07005205 *
5206 * We must not return until all unregister events added during
5207 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005208 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005209void netdev_run_todo(void)
5210{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005211 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005212
Linus Torvalds1da177e2005-04-16 15:20:36 -07005213 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005214 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07005215
5216 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005217
Linus Torvalds1da177e2005-04-16 15:20:36 -07005218 while (!list_empty(&list)) {
5219 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00005220 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005221 list_del(&dev->todo_list);
5222
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005223 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005224 printk(KERN_ERR "network todo '%s' but state %d\n",
5225 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005226 dump_stack();
5227 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005228 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005229
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005230 dev->reg_state = NETREG_UNREGISTERED;
5231
Changli Gao152102c2010-03-30 20:16:22 +00005232 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07005233
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005234 netdev_wait_allrefs(dev);
5235
5236 /* paranoia */
5237 BUG_ON(atomic_read(&dev->refcnt));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07005238 WARN_ON(dev->ip_ptr);
5239 WARN_ON(dev->ip6_ptr);
5240 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005241
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005242 if (dev->destructor)
5243 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07005244
5245 /* Free network device */
5246 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005247 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005248}
5249
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005250/**
Eric Dumazetd83345a2009-11-16 03:36:51 +00005251 * dev_txq_stats_fold - fold tx_queues stats
5252 * @dev: device to get statistics from
5253 * @stats: struct net_device_stats to hold results
5254 */
5255void dev_txq_stats_fold(const struct net_device *dev,
5256 struct net_device_stats *stats)
5257{
5258 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5259 unsigned int i;
5260 struct netdev_queue *txq;
5261
5262 for (i = 0; i < dev->num_tx_queues; i++) {
5263 txq = netdev_get_tx_queue(dev, i);
5264 tx_bytes += txq->tx_bytes;
5265 tx_packets += txq->tx_packets;
5266 tx_dropped += txq->tx_dropped;
5267 }
5268 if (tx_bytes || tx_packets || tx_dropped) {
5269 stats->tx_bytes = tx_bytes;
5270 stats->tx_packets = tx_packets;
5271 stats->tx_dropped = tx_dropped;
5272 }
5273}
5274EXPORT_SYMBOL(dev_txq_stats_fold);
5275
5276/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005277 * dev_get_stats - get network device statistics
5278 * @dev: device to get statistics from
5279 *
5280 * Get network statistics from device. The device driver may provide
5281 * its own method by setting dev->netdev_ops->get_stats; otherwise
5282 * the internal statistics structure is used.
5283 */
5284const struct net_device_stats *dev_get_stats(struct net_device *dev)
Eric Dumazet7004bf22009-05-18 00:34:33 +00005285{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005286 const struct net_device_ops *ops = dev->netdev_ops;
5287
5288 if (ops->ndo_get_stats)
5289 return ops->ndo_get_stats(dev);
Eric Dumazet7004bf22009-05-18 00:34:33 +00005290
Eric Dumazetd83345a2009-11-16 03:36:51 +00005291 dev_txq_stats_fold(dev, &dev->stats);
5292 return &dev->stats;
Rusty Russellc45d2862007-03-28 14:29:08 -07005293}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005294EXPORT_SYMBOL(dev_get_stats);
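
/*
 * Sketch (hypothetical driver ops): supplying ndo_get_stats so that
 * dev_get_stats() above returns driver-maintained counters instead of
 * the default dev->stats with folded TX queue totals.
 */
static struct net_device_stats *example_get_stats(struct net_device *dev)
{
	struct net_device_stats *stats = &dev->stats;

	/* a real driver would refresh these from hardware counters */
	return stats;
}

static const struct net_device_ops example_stats_ops = {
	.ndo_get_stats	= example_get_stats,
};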
Rusty Russellc45d2862007-03-28 14:29:08 -07005295
David S. Millerdc2b4842008-07-08 17:18:23 -07005296static void netdev_init_one_queue(struct net_device *dev,
David S. Millere8a04642008-07-17 00:34:19 -07005297 struct netdev_queue *queue,
5298 void *_unused)
David S. Millerdc2b4842008-07-08 17:18:23 -07005299{
David S. Millerdc2b4842008-07-08 17:18:23 -07005300 queue->dev = dev;
5301}
5302
David S. Millerbb949fb2008-07-08 16:55:56 -07005303static void netdev_init_queues(struct net_device *dev)
5304{
David S. Millere8a04642008-07-17 00:34:19 -07005305 netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5306 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
David S. Millerc3f26a22008-07-31 16:58:50 -07005307 spin_lock_init(&dev->tx_global_lock);
David S. Millerbb949fb2008-07-08 16:55:56 -07005308}
5309
Linus Torvalds1da177e2005-04-16 15:20:36 -07005310/**
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005311 * alloc_netdev_mq - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005312 * @sizeof_priv: size of private data to allocate space for
5313 * @name: device name format string
5314 * @setup: callback to initialize device
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005315 * @queue_count: the number of subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005316 *
5317 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005318 * and performs basic initialization. Also allocates subqueue structs
5319 * for each queue on the device at the end of the netdevice.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005320 */
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005321struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5322 void (*setup)(struct net_device *), unsigned int queue_count)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005323{
David S. Millere8a04642008-07-17 00:34:19 -07005324 struct netdev_queue *tx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005325 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07005326 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005327 struct net_device *p;
Eric Dumazetdf334542010-03-24 19:13:54 +00005328#ifdef CONFIG_RPS
5329 struct netdev_rx_queue *rx;
Tom Herbert0a9627f2010-03-16 08:03:29 +00005330 int i;
Eric Dumazetdf334542010-03-24 19:13:54 +00005331#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005332
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005333 BUG_ON(strlen(name) >= sizeof(dev->name));
5334
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005335 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005336 if (sizeof_priv) {
5337 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005338 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005339 alloc_size += sizeof_priv;
5340 }
5341 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005342 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005343
Paolo 'Blaisorblade' Giarrusso31380de2006-04-06 22:38:28 -07005344 p = kzalloc(alloc_size, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005345 if (!p) {
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005346 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005347 return NULL;
5348 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005349
Stephen Hemminger79439862008-07-21 13:28:44 -07005350 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
David S. Millere8a04642008-07-17 00:34:19 -07005351 if (!tx) {
5352 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5353 "tx qdiscs.\n");
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005354 goto free_p;
David S. Millere8a04642008-07-17 00:34:19 -07005355 }
5356
Eric Dumazetdf334542010-03-24 19:13:54 +00005357#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00005358 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5359 if (!rx) {
5360 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5361 "rx queues.\n");
5362 goto free_tx;
5363 }
5364
5365 atomic_set(&rx->count, queue_count);
5366
5367 /*
5368	 * Set a pointer to the first element in the array, which holds the
5369 * reference count.
5370 */
5371 for (i = 0; i < queue_count; i++)
5372 rx[i].first = rx;
Eric Dumazetdf334542010-03-24 19:13:54 +00005373#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00005374
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005375 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005376 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005377
5378 if (dev_addr_init(dev))
Tom Herbert0a9627f2010-03-16 08:03:29 +00005379 goto free_rx;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005380
Jiri Pirko22bedad32010-04-01 21:22:57 +00005381 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005382 dev_uc_init(dev);
Jiri Pirkoccffad252009-05-22 23:22:17 +00005383
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005384 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005385
David S. Millere8a04642008-07-17 00:34:19 -07005386 dev->_tx = tx;
5387 dev->num_tx_queues = queue_count;
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005388 dev->real_num_tx_queues = queue_count;
David S. Millere8a04642008-07-17 00:34:19 -07005389
Eric Dumazetdf334542010-03-24 19:13:54 +00005390#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00005391 dev->_rx = rx;
5392 dev->num_rx_queues = queue_count;
Eric Dumazetdf334542010-03-24 19:13:54 +00005393#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00005394
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07005395 dev->gso_max_size = GSO_MAX_SIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005396
David S. Millerbb949fb2008-07-08 16:55:56 -07005397 netdev_init_queues(dev);
5398
Peter P Waskiewicz Jr15682bc2010-02-10 20:03:05 -08005399 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5400 dev->ethtool_ntuple_list.count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08005401 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005402 INIT_LIST_HEAD(&dev->unreg_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00005403 INIT_LIST_HEAD(&dev->link_watch_list);
Eric Dumazet93f154b2009-05-18 22:19:19 -07005404 dev->priv_flags = IFF_XMIT_DST_RELEASE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005405 setup(dev);
5406 strcpy(dev->name, name);
5407 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005408
Tom Herbert0a9627f2010-03-16 08:03:29 +00005409free_rx:
Eric Dumazetdf334542010-03-24 19:13:54 +00005410#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00005411 kfree(rx);
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005412free_tx:
Eric Dumazetdf334542010-03-24 19:13:54 +00005413#endif
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005414 kfree(tx);
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005415free_p:
5416 kfree(p);
5417 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005418}
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005419EXPORT_SYMBOL(alloc_netdev_mq);
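/*
 * Illustrative sketch, not part of this file: how a driver might pair
 * alloc_netdev_mq() with register_netdev().  The "foo" names, the
 * queue count of four and the private struct are assumptions made for
 * the example only.
 */
#if 0
struct foo_priv {
	spinlock_t lock;
};

static struct net_device *foo_create(void)
{
	struct net_device *dev;
	struct foo_priv *priv;

	/* The private area is carved out at the end of the netdevice. */
	dev = alloc_netdev_mq(sizeof(struct foo_priv), "foo%d",
			      ether_setup, 4);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);
	spin_lock_init(&priv->lock);

	if (register_netdev(dev)) {
		free_netdev(dev);	/* never registered: plain kfree path */
		return NULL;
	}
	return dev;
}
#endif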
Linus Torvalds1da177e2005-04-16 15:20:36 -07005420
5421/**
5422 * free_netdev - free network device
5423 * @dev: device
5424 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005425 * This function does the last stage of destroying an allocated device
5426 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005427 * If this is the last reference then it will be freed.
5428 */
5429void free_netdev(struct net_device *dev)
5430{
Herbert Xud565b0a2008-12-15 23:38:52 -08005431 struct napi_struct *p, *n;
5432
Denis V. Lunevf3005d72008-04-16 02:02:18 -07005433 release_net(dev_net(dev));
5434
David S. Millere8a04642008-07-17 00:34:19 -07005435 kfree(dev->_tx);
5436
Jiri Pirkof001fde2009-05-05 02:48:28 +00005437 /* Flush device addresses */
5438 dev_addr_flush(dev);
5439
Peter P Waskiewicz Jr15682bc2010-02-10 20:03:05 -08005440 /* Clear ethtool n-tuple list */
5441 ethtool_ntuple_flush(dev);
5442
Herbert Xud565b0a2008-12-15 23:38:52 -08005443 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5444 netif_napi_del(p);
5445
Stephen Hemminger3041a062006-05-26 13:25:24 -07005446 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005447 if (dev->reg_state == NETREG_UNINITIALIZED) {
5448 kfree((char *)dev - dev->padded);
5449 return;
5450 }
5451
5452 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5453 dev->reg_state = NETREG_RELEASED;
5454
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07005455 /* will free via device release */
5456 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005457}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005458EXPORT_SYMBOL(free_netdev);
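/*
 * Illustrative sketch, not part of this file: orderly teardown in a
 * hypothetical driver.  unregister_netdev() returns only after the
 * unregister todo list has run, so free_netdev() sees
 * NETREG_UNREGISTERED and drops the final device reference.
 */
#if 0
static void foo_destroy(struct net_device *dev)
{
	unregister_netdev(dev);		/* takes rtnl, waits for refs */
	free_netdev(dev);		/* releases via put_device() */
}
#endif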
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005459
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005460/**
5461 * synchronize_net - Synchronize with packet receive processing
5462 *
5463 * Wait for packets currently being received to be done.
5464 * Does not block later packets from starting.
5465 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005466void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005467{
5468 might_sleep();
Paul E. McKenneyfbd568a3e2005-05-01 08:59:04 -07005469 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005470}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005471EXPORT_SYMBOL(synchronize_net);
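/*
 * Illustrative sketch, not part of this file: letting in-flight
 * receive processing drain before freeing an RCU-managed object.
 * The "foo_cfg" structure and pointer are assumptions.
 */
#if 0
struct foo_cfg {
	int mtu_limit;
};

static struct foo_cfg *foo_cfg;	/* dereferenced under rcu_read_lock() in RX */

static void foo_set_cfg(struct foo_cfg *newcfg)
{
	struct foo_cfg *old = foo_cfg;

	rcu_assign_pointer(foo_cfg, newcfg);
	synchronize_net();	/* all RX-path readers of "old" have finished */
	kfree(old);
}
#endif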
Linus Torvalds1da177e2005-04-16 15:20:36 -07005472
5473/**
Eric Dumazet44a08732009-10-27 07:03:04 +00005474 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07005475 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00005476 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08005477 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07005478 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005479 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00005480 * If @head is not NULL, the device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005481 *
5482 * Callers must hold the rtnl semaphore. You may want
5483 * unregister_netdev() instead of this.
5484 */
5485
Eric Dumazet44a08732009-10-27 07:03:04 +00005486void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005487{
Herbert Xua6620712007-12-12 19:21:56 -08005488 ASSERT_RTNL();
5489
Eric Dumazet44a08732009-10-27 07:03:04 +00005490 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005491 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00005492 } else {
5493 rollback_registered(dev);
5494 /* Finish processing unregister after unlock */
5495 net_set_todo(dev);
5496 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005497}
Eric Dumazet44a08732009-10-27 07:03:04 +00005498EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005499
5500/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005501 * unregister_netdevice_many - unregister many devices
5502 * @head: list of devices
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005503 */
5504void unregister_netdevice_many(struct list_head *head)
5505{
5506 struct net_device *dev;
5507
5508 if (!list_empty(head)) {
5509 rollback_registered_many(head);
5510 list_for_each_entry(dev, head, unreg_list)
5511 net_set_todo(dev);
5512 }
5513}
Eric Dumazet63c80992009-10-27 07:06:49 +00005514EXPORT_SYMBOL(unregister_netdevice_many);
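/*
 * Illustrative sketch, not part of this file: queueing several devices
 * and unregistering them in a single batch while holding RTNL.  The
 * "foo_slave" list layout is an assumption.
 */
#if 0
struct foo_slave {
	struct net_device *dev;
	struct list_head list;
};

static void foo_destroy_slaves(struct list_head *slaves)
{
	struct foo_slave *s;
	LIST_HEAD(kill_list);

	ASSERT_RTNL();
	list_for_each_entry(s, slaves, list)
		unregister_netdevice_queue(s->dev, &kill_list);
	/* One rollback_registered_many() pass covers the whole batch. */
	unregister_netdevice_many(&kill_list);
}
#endif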
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005515
5516/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005517 * unregister_netdev - remove device from the kernel
5518 * @dev: device
5519 *
5520 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005521 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005522 *
5523 * This is just a wrapper for unregister_netdevice that takes
5524 * the rtnl semaphore. In general you want to use this and not
5525 * unregister_netdevice.
5526 */
5527void unregister_netdev(struct net_device *dev)
5528{
5529 rtnl_lock();
5530 unregister_netdevice(dev);
5531 rtnl_unlock();
5532}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005533EXPORT_SYMBOL(unregister_netdev);
5534
Eric W. Biedermance286d32007-09-12 13:53:49 +02005535/**
5536 * dev_change_net_namespace - move device to different nethost namespace
5537 * @dev: device
5538 * @net: network namespace
5539 * @pat: If not NULL name pattern to try if the current device name
5540 * is already taken in the destination network namespace.
5541 *
5542 * This function shuts down a device interface and moves it
5543 * to a new network namespace. On success 0 is returned, on
5544 * a failure a negative errno code is returned.
5545 *
5546 * Callers must hold the rtnl semaphore.
5547 */
5548
5549int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5550{
Eric W. Biedermance286d32007-09-12 13:53:49 +02005551 int err;
5552
5553 ASSERT_RTNL();
5554
5555 /* Don't allow namespace local devices to be moved. */
5556 err = -EINVAL;
5557 if (dev->features & NETIF_F_NETNS_LOCAL)
5558 goto out;
5559
5560	/* Ensure the device has been registered */
5561 err = -EINVAL;
5562 if (dev->reg_state != NETREG_REGISTERED)
5563 goto out;
5564
5565	/* Get out if there is nothing to do */
5566 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09005567 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005568 goto out;
5569
5570 /* Pick the destination device name, and ensure
5571 * we can use it in the destination network namespace.
5572 */
5573 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00005574 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005575 /* We get here if we can't use the current device name */
5576 if (!pat)
5577 goto out;
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00005578 if (dev_get_valid_name(dev, pat, 1))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005579 goto out;
5580 }
5581
5582 /*
5583	 * And now a mini version of register_netdevice and unregister_netdevice.
5584 */
5585
5586 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07005587 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005588
5589 /* And unlink it from device chain */
5590 err = -ENODEV;
5591 unlist_netdevice(dev);
5592
5593 synchronize_net();
5594
5595 /* Shutdown queueing discipline. */
5596 dev_shutdown(dev);
5597
5598	/* Notify protocols that we are about to destroy
5599	   this device. They should clean up all their state.
5600 */
5601 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005602 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005603
5604 /*
5605 * Flush the unicast and multicast chains
5606 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005607 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005608 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005609
5610 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005611 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005612
Eric W. Biedermance286d32007-09-12 13:53:49 +02005613 /* If there is an ifindex conflict assign a new one */
5614 if (__dev_get_by_index(net, dev->ifindex)) {
5615 int iflink = (dev->iflink == dev->ifindex);
5616 dev->ifindex = dev_new_index(net);
5617 if (iflink)
5618 dev->iflink = dev->ifindex;
5619 }
5620
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005621 /* Fixup kobjects */
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07005622 err = device_rename(&dev->dev, dev->name);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005623 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005624
5625 /* Add the device back in the hashes */
5626 list_netdevice(dev);
5627
5628	/* Notify protocols that a new device appeared. */
5629 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5630
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005631 /*
5632 * Prevent userspace races by waiting until the network
5633	 * device is fully set up before sending notifications.
5634 */
5635 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5636
Eric W. Biedermance286d32007-09-12 13:53:49 +02005637 synchronize_net();
5638 err = 0;
5639out:
5640 return err;
5641}
Johannes Berg463d0182009-07-14 00:33:35 +02005642EXPORT_SYMBOL_GPL(dev_change_net_namespace);
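/*
 * Illustrative sketch, not part of this file: moving a device into the
 * network namespace of a given process.  Assumes get_net_ns_by_pid();
 * the "eth%d" pattern is only tried if the current name is already
 * taken in the target namespace.
 */
#if 0
static int foo_move_to_pid_ns(struct net_device *dev, pid_t pid)
{
	struct net *net;
	int err;

	net = get_net_ns_by_pid(pid);
	if (IS_ERR(net))
		return PTR_ERR(net);

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "eth%d");
	rtnl_unlock();

	put_net(net);
	return err;
}
#endif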
Eric W. Biedermance286d32007-09-12 13:53:49 +02005643
Linus Torvalds1da177e2005-04-16 15:20:36 -07005644static int dev_cpu_callback(struct notifier_block *nfb,
5645 unsigned long action,
5646 void *ocpu)
5647{
5648 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005649 struct sk_buff *skb;
5650 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5651 struct softnet_data *sd, *oldsd;
5652
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005653 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005654 return NOTIFY_OK;
5655
5656 local_irq_disable();
5657 cpu = smp_processor_id();
5658 sd = &per_cpu(softnet_data, cpu);
5659 oldsd = &per_cpu(softnet_data, oldcpu);
5660
5661 /* Find end of our completion_queue. */
5662 list_skb = &sd->completion_queue;
5663 while (*list_skb)
5664 list_skb = &(*list_skb)->next;
5665 /* Append completion queue from offline CPU. */
5666 *list_skb = oldsd->completion_queue;
5667 oldsd->completion_queue = NULL;
5668
Linus Torvalds1da177e2005-04-16 15:20:36 -07005669 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00005670 if (oldsd->output_queue) {
5671 *sd->output_queue_tailp = oldsd->output_queue;
5672 sd->output_queue_tailp = oldsd->output_queue_tailp;
5673 oldsd->output_queue = NULL;
5674 oldsd->output_queue_tailp = &oldsd->output_queue;
5675 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005676
5677 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5678 local_irq_enable();
5679
5680 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00005681 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5682 netif_rx(skb);
5683 input_queue_head_incr(oldsd);
5684 }
Tom Herbertfec5e652010-04-16 16:01:27 -07005685 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005686 netif_rx(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00005687 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07005688 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005689
5690 return NOTIFY_OK;
5691}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005692
5693
Herbert Xu7f353bf2007-08-10 15:47:58 -07005694/**
Herbert Xub63365a2008-10-23 01:11:29 -07005695 * netdev_increment_features - increment feature set by one
5696 * @all: current feature set
5697 * @one: new feature set
5698 * @mask: mask feature set
Herbert Xu7f353bf2007-08-10 15:47:58 -07005699 *
5700 * Computes a new feature set after adding a device with feature set
Herbert Xub63365a2008-10-23 01:11:29 -07005701 * @one to the master device with current feature set @all. Will not
5702 * enable anything that is off in @mask. Returns the new feature set.
Herbert Xu7f353bf2007-08-10 15:47:58 -07005703 */
Herbert Xub63365a2008-10-23 01:11:29 -07005704unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5705 unsigned long mask)
Herbert Xu7f353bf2007-08-10 15:47:58 -07005706{
Herbert Xub63365a2008-10-23 01:11:29 -07005707 /* If device needs checksumming, downgrade to it. */
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005708 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
Herbert Xub63365a2008-10-23 01:11:29 -07005709 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5710 else if (mask & NETIF_F_ALL_CSUM) {
5711 /* If one device supports v4/v6 checksumming, set for all. */
5712 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5713 !(all & NETIF_F_GEN_CSUM)) {
5714 all &= ~NETIF_F_ALL_CSUM;
5715 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5716 }
Herbert Xu7f353bf2007-08-10 15:47:58 -07005717
Herbert Xub63365a2008-10-23 01:11:29 -07005718 /* If one device supports hw checksumming, set for all. */
5719 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5720 all &= ~NETIF_F_ALL_CSUM;
5721 all |= NETIF_F_HW_CSUM;
5722 }
5723 }
Herbert Xu7f353bf2007-08-10 15:47:58 -07005724
Herbert Xub63365a2008-10-23 01:11:29 -07005725 one |= NETIF_F_ALL_CSUM;
Herbert Xu7f353bf2007-08-10 15:47:58 -07005726
Herbert Xub63365a2008-10-23 01:11:29 -07005727 one |= all & NETIF_F_ONE_FOR_ALL;
Sridhar Samudralad9f59502009-10-07 12:24:25 +00005728 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
Herbert Xub63365a2008-10-23 01:11:29 -07005729 all |= one & mask & NETIF_F_ONE_FOR_ALL;
Herbert Xu7f353bf2007-08-10 15:47:58 -07005730
5731 return all;
5732}
Herbert Xub63365a2008-10-23 01:11:29 -07005733EXPORT_SYMBOL(netdev_increment_features);
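/*
 * Illustrative sketch, not part of this file: folding slave feature
 * sets into a master's, bonding-style.  The slave list and the mask of
 * negotiable flags are assumptions.
 */
#if 0
static unsigned long foo_master_features(struct list_head *slaves)
{
	unsigned long mask = NETIF_F_ALL_CSUM | NETIF_F_SG | NETIF_F_TSO;
	unsigned long all = mask;	/* optimistic start, constrained below */
	struct foo_slave *s;

	list_for_each_entry(s, slaves, list)
		all = netdev_increment_features(all, s->dev->features, mask);
	return all;
}
#endif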
Herbert Xu7f353bf2007-08-10 15:47:58 -07005734
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005735static struct hlist_head *netdev_create_hash(void)
5736{
5737 int i;
5738 struct hlist_head *hash;
5739
5740 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5741 if (hash != NULL)
5742 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5743 INIT_HLIST_HEAD(&hash[i]);
5744
5745 return hash;
5746}
5747
Eric W. Biederman881d9662007-09-17 11:56:21 -07005748/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07005749static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07005750{
Eric W. Biederman881d9662007-09-17 11:56:21 -07005751 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07005752
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005753 net->dev_name_head = netdev_create_hash();
5754 if (net->dev_name_head == NULL)
5755 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005756
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005757 net->dev_index_head = netdev_create_hash();
5758 if (net->dev_index_head == NULL)
5759 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005760
5761 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005762
5763err_idx:
5764 kfree(net->dev_name_head);
5765err_name:
5766 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005767}
5768
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005769/**
5770 * netdev_drivername - network driver for the device
5771 * @dev: network device
5772 * @buffer: buffer for resulting name
5773 * @len: size of buffer
5774 *
5775 * Determine network driver for device.
5776 */
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07005777char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
Arjan van de Ven6579e572008-07-21 13:31:48 -07005778{
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07005779 const struct device_driver *driver;
5780 const struct device *parent;
Arjan van de Ven6579e572008-07-21 13:31:48 -07005781
5782 if (len <= 0 || !buffer)
5783 return buffer;
5784 buffer[0] = 0;
5785
5786 parent = dev->dev.parent;
5787
5788 if (!parent)
5789 return buffer;
5790
5791 driver = parent->driver;
5792 if (driver && driver->name)
5793 strlcpy(buffer, driver->name, len);
5794 return buffer;
5795}
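/*
 * Illustrative sketch, not part of this file: the watchdog-style use of
 * netdev_drivername() when printing a diagnostic.
 */
#if 0
static void foo_report_stall(struct net_device *dev)
{
	char drivername[64];

	printk(KERN_WARNING "%s (%s): transmit queue stalled\n", dev->name,
	       netdev_drivername(dev, drivername, sizeof(drivername)));
}
#endif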
5796
Pavel Emelyanov46650792007-10-08 20:38:39 -07005797static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07005798{
5799 kfree(net->dev_name_head);
5800 kfree(net->dev_index_head);
5801}
5802
Denis V. Lunev022cbae2007-11-13 03:23:50 -08005803static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07005804 .init = netdev_init,
5805 .exit = netdev_exit,
5806};
5807
Pavel Emelyanov46650792007-10-08 20:38:39 -07005808static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02005809{
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005810 struct net_device *dev, *aux;
Eric W. Biedermance286d32007-09-12 13:53:49 +02005811 /*
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005812 * Push all migratable network devices back to the
Eric W. Biedermance286d32007-09-12 13:53:49 +02005813 * initial network namespace
5814 */
5815 rtnl_lock();
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005816 for_each_netdev_safe(net, dev, aux) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005817 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005818 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02005819
5820		/* Ignore unmovable devices (e.g. loopback) */
5821 if (dev->features & NETIF_F_NETNS_LOCAL)
5822 continue;
5823
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005824 /* Leave virtual devices for the generic cleanup */
5825 if (dev->rtnl_link_ops)
5826 continue;
Eric W. Biedermand0c082c2008-11-05 15:59:38 -08005827
Eric W. Biedermance286d32007-09-12 13:53:49 +02005828		/* Push remaining network devices to init_net */
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005829 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5830 err = dev_change_net_namespace(dev, &init_net, fb_name);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005831 if (err) {
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005832 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
Eric W. Biedermance286d32007-09-12 13:53:49 +02005833 __func__, dev->name, err);
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005834 BUG();
Eric W. Biedermance286d32007-09-12 13:53:49 +02005835 }
5836 }
5837 rtnl_unlock();
5838}
5839
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00005840static void __net_exit default_device_exit_batch(struct list_head *net_list)
5841{
5842	/* At exit, all network devices must be removed from a network
5843	 * namespace. Do this in the reverse order of registration.
5844 * Do this across as many network namespaces as possible to
5845 * improve batching efficiency.
5846 */
5847 struct net_device *dev;
5848 struct net *net;
5849 LIST_HEAD(dev_kill_list);
5850
5851 rtnl_lock();
5852 list_for_each_entry(net, net_list, exit_list) {
5853 for_each_netdev_reverse(net, dev) {
5854 if (dev->rtnl_link_ops)
5855 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5856 else
5857 unregister_netdevice_queue(dev, &dev_kill_list);
5858 }
5859 }
5860 unregister_netdevice_many(&dev_kill_list);
5861 rtnl_unlock();
5862}
5863
Denis V. Lunev022cbae2007-11-13 03:23:50 -08005864static struct pernet_operations __net_initdata default_device_ops = {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005865 .exit = default_device_exit,
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00005866 .exit_batch = default_device_exit_batch,
Eric W. Biedermance286d32007-09-12 13:53:49 +02005867};
5868
Linus Torvalds1da177e2005-04-16 15:20:36 -07005869/*
5870 * Initialize the DEV module. At boot time this walks the device list and
5871 * unhooks any devices that fail to initialise (normally hardware not
5872 * present) and leaves us with a valid list of present and active devices.
5873 *
5874 */
5875
5876/*
5877 * This is called single threaded during boot, so no need
5878 * to take the rtnl semaphore.
5879 */
5880static int __init net_dev_init(void)
5881{
5882 int i, rc = -ENOMEM;
5883
5884 BUG_ON(!dev_boot_phase);
5885
Linus Torvalds1da177e2005-04-16 15:20:36 -07005886 if (dev_proc_init())
5887 goto out;
5888
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005889 if (netdev_kobject_init())
Linus Torvalds1da177e2005-04-16 15:20:36 -07005890 goto out;
5891
5892 INIT_LIST_HEAD(&ptype_all);
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08005893 for (i = 0; i < PTYPE_HASH_SIZE; i++)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005894 INIT_LIST_HEAD(&ptype_base[i]);
5895
Eric W. Biederman881d9662007-09-17 11:56:21 -07005896 if (register_pernet_subsys(&netdev_net_ops))
5897 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005898
5899 /*
5900 * Initialise the packet receive queues.
5901 */
5902
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07005903 for_each_possible_cpu(i) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00005904 struct softnet_data *sd = &per_cpu(softnet_data, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005905
Changli Gaodee42872010-05-02 05:42:16 +00005906 memset(sd, 0, sizeof(*sd));
Eric Dumazete36fa2f2010-04-19 21:17:14 +00005907 skb_queue_head_init(&sd->input_pkt_queue);
Changli Gao6e7676c2010-04-27 15:07:33 -07005908 skb_queue_head_init(&sd->process_queue);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00005909 sd->completion_queue = NULL;
5910 INIT_LIST_HEAD(&sd->poll_list);
Changli Gaoa9cbd582010-04-26 23:06:24 +00005911 sd->output_queue = NULL;
5912 sd->output_queue_tailp = &sd->output_queue;
Eric Dumazetdf334542010-03-24 19:13:54 +00005913#ifdef CONFIG_RPS
Eric Dumazete36fa2f2010-04-19 21:17:14 +00005914 sd->csd.func = rps_trigger_softirq;
5915 sd->csd.info = sd;
5916 sd->csd.flags = 0;
5917 sd->cpu = i;
Tom Herbert1e94d722010-03-18 17:45:44 -07005918#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00005919
Eric Dumazete36fa2f2010-04-19 21:17:14 +00005920 sd->backlog.poll = process_backlog;
5921 sd->backlog.weight = weight_p;
5922 sd->backlog.gro_list = NULL;
5923 sd->backlog.gro_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005924 }
5925
Linus Torvalds1da177e2005-04-16 15:20:36 -07005926 dev_boot_phase = 0;
5927
Eric W. Biederman505d4f72008-11-07 22:54:20 -08005928	/* The loopback device is special: if any other network device
5929	 * is present in a network namespace, the loopback device must
5930	 * be present too. Since we now dynamically allocate and free
5931	 * the loopback device, maintain this invariant by keeping the
5932	 * loopback device as the first device on the list of network
5933	 * devices, ensuring the loopback device is the first device
5934	 * that appears and the last network device
5935	 * that disappears.
5936 */
5937 if (register_pernet_device(&loopback_net_ops))
5938 goto out;
5939
5940 if (register_pernet_device(&default_device_ops))
5941 goto out;
5942
Carlos R. Mafra962cf362008-05-15 11:15:37 -03005943 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5944 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005945
5946 hotcpu_notifier(dev_cpu_callback, 0);
5947 dst_init();
5948 dev_mcast_init();
5949 rc = 0;
5950out:
5951 return rc;
5952}
5953
5954subsys_initcall(net_dev_init);
5955
Krishna Kumare88721f2009-02-18 17:55:02 -08005956static int __init initialize_hashrnd(void)
5957{
Tom Herbert0a9627f2010-03-16 08:03:29 +00005958 get_random_bytes(&hashrnd, sizeof(hashrnd));
Krishna Kumare88721f2009-02-18 17:55:02 -08005959 return 0;
5960}
5961
5962late_initcall_sync(initialize_hashrnd);
5963