/*
 * 	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 * 		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/if_bridge.h>
#include <linux/if_macvlan.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>

#include "net-sysfs.h"

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *	       sure which should go first, but I bet it won't make much
 *	       difference if we are running VLANs.  The good news is that
 *	       this protocol won't be in the list unless compiled in, so
 *	       the average user (w/out VLANs) will not be adversely affected.
 *	       --BLG
 *
 *		0800	IP
 *		8100	802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

#ifdef CONFIG_NET_DMA
struct net_dma {
	struct dma_client	client;
	spinlock_t		lock;
	cpumask_t		channel_mask;
	struct dma_chan		**channels;
};

static enum dma_state_client
netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
	enum dma_state state);

static struct net_dma net_dma = {
	.client = {
		.event_callback = netdev_dma_event,
	},
};
#endif

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading.
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);

EXPORT_SYMBOL(dev_base_lock);
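
/*
 * A minimal sketch of the pure-reader pattern described above, assuming
 * only dev_base_lock is wanted (the helper name example_count_devs is
 * hypothetical; dev_get_by_flags() below follows the same pattern):
 *
 *	static int example_count_devs(struct net *net)
 *	{
 *		struct net_device *dev;
 *		int n = 0;
 *
 *		read_lock(&dev_base_lock);
 *		for_each_netdev(net, dev)
 *			n++;
 *		read_unlock(&dev_base_lock);
 *		return n;
 *	}
 */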

#define NETDEV_HASHBITS	8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail(&dev->dev_list, &net->dev_base_head);
	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del(&dev->dev_list);
	hlist_del(&dev->name_hlist);
	hlist_del(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU(struct softnet_data, softnet_data);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
	 ARPHRD_NONE};

static const char *netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
	 "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_lockdep_class(spinlock_t *lock,
					    unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_lockdep_class(spinlock_t *lock,
					    unsigned short dev_type)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot
 *	guarantee that all CPUs that are in the middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	int hash;

	spin_lock_bh(&ptype_lock);
	if (pt->type == htons(ETH_P_ALL))
		list_add_rcu(&pt->list, &ptype_all);
	else {
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
		list_add_rcu(&pt->list, &ptype_base[hash]);
	}
	spin_unlock_bh(&ptype_lock);
}

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head;
	struct packet_type *pt1;

	spin_lock_bh(&ptype_lock);

	if (pt->type == htons(ETH_P_ALL))
		head = &ptype_all;
	else
		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock_bh(&ptype_lock);
}
/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
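
/*
 * Usage sketch for the pair above: a minimal ETH_P_ALL tap.  The names
 * example_tap and example_tap_rcv are hypothetical illustrations; only
 * dev_add_pack()/dev_remove_pack() and struct packet_type come from
 * this file and its headers.
 *
 *	static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *				   struct packet_type *pt,
 *				   struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		(taps receive clones; just drop)
 *		return 0;
 *	}
 *
 *	static struct packet_type example_tap __read_mostly = {
 *		.type	= htons(ETH_P_ALL),
 *		.func	= example_tap_rcv,
 *	};
 *
 *	dev_add_pack(&example_tap);	(e.g. at module init)
 *	dev_remove_pack(&example_tap);	(at module exit; may sleep)
 */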

/*******************************************************************************

		Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add - add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine
 *	for all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check - check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq 	= s[i].map.irq;
			dev->base_addr 	= s[i].map.base_addr;
			dev->mem_start 	= s[i].map.mem_start;
			dev->mem_end 	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}


/**
 *	netdev_boot_base - get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
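
/*
 * Given the parser above, the full command-line form is
 *
 *	netdev=irq,base_addr,mem_start,mem_end,name
 *
 * so, for example, "netdev=5,0x300,eth2" stores irq 5 and I/O base
 * 0x300 for "eth2" (the concrete values here are only an illustration;
 * trailing integers may be omitted, and the remaining text is taken
 * as the device name).
 */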

/*******************************************************************************

		Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;

	hlist_for_each(p, dev_name_hash(net, name)) {
		struct net_device *dev
			= hlist_entry(p, struct net_device, name_hlist);
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;
	}
	return NULL;
}

/**
 *	dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	read_lock(&dev_base_lock);
	dev = __dev_get_by_name(net, name);
	if (dev)
		dev_hold(dev);
	read_unlock(&dev_base_lock);
	return dev;
}
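
/*
 * Hold/put sketch for the lookup above; the device name "eth0" is
 * hypothetical, and the reference taken by dev_get_by_name() pins the
 * device until the matching dev_put():
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		...			(use the device safely here)
 *		dev_put(dev);		(release when finished)
 *	}
 */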

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;

	hlist_for_each(p, dev_index_hash(net, ifindex)) {
		struct net_device *dev
			= hlist_entry(p, struct net_device, index_hlist);
		if (dev->ifindex == ifindex)
			return dev;
	}
	return NULL;
}


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	read_lock(&dev_base_lock);
	dev = __dev_get_by_index(net, ifindex);
	if (dev)
		dev_hold(dev);
	read_unlock(&dev_base_lock);
	return dev;
}

/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold the
 *	rtnl semaphore. The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}

EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}

EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	rtnl_lock();
	dev = __dev_getfirstbyhwtype(net, type);
	if (dev)
		dev_hold(dev);
	rtnl_unlock();
	return dev;
}

EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	read_lock(&dev_base_lock);
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	}
	read_unlock(&dev_base_lock);
	return ret;
}

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
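
/*
 * Naming sketch for the allocator above, assuming "eth0" and "eth1"
 * are the only devices already registered in dev's namespace:
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *
 * writes "eth2" into dev->name and returns 2.  On failure a negative
 * errno is returned, e.g. -EINVAL for a malformed format string or
 * -ENFILE when no free slot is left.
 */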
869
Linus Torvalds1da177e2005-04-16 15:20:36 -0700870
871/**
872 * dev_change_name - change name of a device
873 * @dev: device
874 * @newname: name (or format string) must be at least IFNAMSIZ
875 *
876 * Change name of a device, can pass format strings "eth%d".
877 * for wildcarding.
878 */
879int dev_change_name(struct net_device *dev, char *newname)
880{
Herbert Xufcc5a032007-07-30 17:03:38 -0700881 char oldname[IFNAMSIZ];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700882 int err = 0;
Herbert Xufcc5a032007-07-30 17:03:38 -0700883 int ret;
Eric W. Biederman881d9662007-09-17 11:56:21 -0700884 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700885
886 ASSERT_RTNL();
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +0900887 BUG_ON(!dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700888
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +0900889 net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700890 if (dev->flags & IFF_UP)
891 return -EBUSY;
892
893 if (!dev_valid_name(newname))
894 return -EINVAL;
895
Stephen Hemmingerc8d90dc2007-10-26 03:53:42 -0700896 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
897 return 0;
898
Herbert Xufcc5a032007-07-30 17:03:38 -0700899 memcpy(oldname, dev->name, IFNAMSIZ);
900
Linus Torvalds1da177e2005-04-16 15:20:36 -0700901 if (strchr(newname, '%')) {
902 err = dev_alloc_name(dev, newname);
903 if (err < 0)
904 return err;
905 strcpy(newname, dev->name);
906 }
Eric W. Biederman881d9662007-09-17 11:56:21 -0700907 else if (__dev_get_by_name(net, newname))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700908 return -EEXIST;
909 else
910 strlcpy(dev->name, newname, IFNAMSIZ);
911
Herbert Xufcc5a032007-07-30 17:03:38 -0700912rollback:
Stephen Hemmingerdcc99772008-05-14 22:33:38 -0700913 err = device_rename(&dev->dev, dev->name);
914 if (err) {
915 memcpy(dev->name, oldname, IFNAMSIZ);
916 return err;
917 }
Herbert Xu7f988ea2007-07-30 16:35:46 -0700918
919 write_lock_bh(&dev_base_lock);
Eric W. Biederman92749822007-04-03 00:07:30 -0600920 hlist_del(&dev->name_hlist);
Eric W. Biederman881d9662007-09-17 11:56:21 -0700921 hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
Herbert Xu7f988ea2007-07-30 16:35:46 -0700922 write_unlock_bh(&dev_base_lock);
923
Pavel Emelyanov056925a2007-09-16 15:42:43 -0700924 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -0700925 ret = notifier_to_errno(ret);
926
927 if (ret) {
928 if (err) {
929 printk(KERN_ERR
930 "%s: name change rollback failed: %d.\n",
931 dev->name, ret);
932 } else {
933 err = ret;
934 memcpy(dev->name, oldname, IFNAMSIZ);
935 goto rollback;
936 }
937 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700938
939 return err;
940}
941
942/**
Stephen Hemminger3041a062006-05-26 13:25:24 -0700943 * netdev_features_change - device changes features
Stephen Hemmingerd8a33ac2005-05-29 14:13:47 -0700944 * @dev: device to cause notification
945 *
946 * Called to indicate a device has changed features.
947 */
948void netdev_features_change(struct net_device *dev)
949{
Pavel Emelyanov056925a2007-09-16 15:42:43 -0700950 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
Stephen Hemmingerd8a33ac2005-05-29 14:13:47 -0700951}
952EXPORT_SYMBOL(netdev_features_change);
953
954/**
Linus Torvalds1da177e2005-04-16 15:20:36 -0700955 * netdev_state_change - device changes state
956 * @dev: device to cause notification
957 *
958 * Called to indicate a device has changed state. This function calls
959 * the notifier chains for netdev_chain and sends a NEWLINK message
960 * to the routing socket.
961 */
962void netdev_state_change(struct net_device *dev)
963{
964 if (dev->flags & IFF_UP) {
Pavel Emelyanov056925a2007-09-16 15:42:43 -0700965 call_netdevice_notifiers(NETDEV_CHANGE, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700966 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
967 }
968}
969
Or Gerlitzc1da4ac2008-06-13 18:12:00 -0700970void netdev_bonding_change(struct net_device *dev)
971{
972 call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
973}
974EXPORT_SYMBOL(netdev_bonding_change);
975
Linus Torvalds1da177e2005-04-16 15:20:36 -0700976/**
977 * dev_load - load a network module
Randy Dunlapc4ea43c2007-10-12 21:17:49 -0700978 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -0700979 * @name: name of interface
980 *
981 * If a network interface is not present and the process has suitable
982 * privileges this function loads the module. If module loading is not
983 * available in this kernel then it becomes a nop.
984 */
985
Eric W. Biederman881d9662007-09-17 11:56:21 -0700986void dev_load(struct net *net, const char *name)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700987{
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900988 struct net_device *dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700989
990 read_lock(&dev_base_lock);
Eric W. Biederman881d9662007-09-17 11:56:21 -0700991 dev = __dev_get_by_name(net, name);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700992 read_unlock(&dev_base_lock);
993
994 if (!dev && capable(CAP_SYS_MODULE))
995 request_module("%s", name);
996}
997
Linus Torvalds1da177e2005-04-16 15:20:36 -0700998/**
999 * dev_open - prepare an interface for use.
1000 * @dev: device to open
1001 *
1002 * Takes a device from down to up state. The device's private open
1003 * function is invoked and then the multicast lists are loaded. Finally
1004 * the device is moved into the up state and a %NETDEV_UP message is
1005 * sent to the netdev notifier chain.
1006 *
1007 * Calling this function on an active interface is a nop. On a failure
1008 * a negative errno code is returned.
1009 */
1010int dev_open(struct net_device *dev)
1011{
1012 int ret = 0;
1013
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001014 ASSERT_RTNL();
1015
Linus Torvalds1da177e2005-04-16 15:20:36 -07001016 /*
1017 * Is it already up?
1018 */
1019
1020 if (dev->flags & IFF_UP)
1021 return 0;
1022
1023 /*
1024 * Is it even present?
1025 */
1026 if (!netif_device_present(dev))
1027 return -ENODEV;
1028
1029 /*
1030 * Call device private open method
1031 */
1032 set_bit(__LINK_STATE_START, &dev->state);
Jeff Garzikbada3392007-10-23 20:19:37 -07001033
1034 if (dev->validate_addr)
1035 ret = dev->validate_addr(dev);
1036
1037 if (!ret && dev->open)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001038 ret = dev->open(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001039
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001040 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001041 * If it went open OK then:
1042 */
1043
Jeff Garzikbada3392007-10-23 20:19:37 -07001044 if (ret)
1045 clear_bit(__LINK_STATE_START, &dev->state);
1046 else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001047 /*
1048 * Set the flags.
1049 */
1050 dev->flags |= IFF_UP;
1051
1052 /*
1053 * Initialize multicasting status
1054 */
Patrick McHardy4417da62007-06-27 01:28:10 -07001055 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001056
1057 /*
1058 * Wakeup transmit queue engine
1059 */
1060 dev_activate(dev);
1061
1062 /*
1063 * ... and announce new interface.
1064 */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07001065 call_netdevice_notifiers(NETDEV_UP, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001066 }
Jeff Garzikbada3392007-10-23 20:19:37 -07001067
Linus Torvalds1da177e2005-04-16 15:20:36 -07001068 return ret;
1069}
1070
1071/**
1072 * dev_close - shutdown an interface.
1073 * @dev: device to shutdown
1074 *
1075 * This function moves an active device into down state. A
1076 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1077 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1078 * chain.
1079 */
1080int dev_close(struct net_device *dev)
1081{
Ben Hutchingse46b66b2008-05-08 02:53:17 -07001082 ASSERT_RTNL();
1083
David S. Miller9d5010d2007-09-12 14:33:25 +02001084 might_sleep();
1085
Linus Torvalds1da177e2005-04-16 15:20:36 -07001086 if (!(dev->flags & IFF_UP))
1087 return 0;
1088
1089 /*
1090 * Tell people we are going down, so that they can
1091 * prepare to death, when device is still operating.
1092 */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07001093 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001094
Linus Torvalds1da177e2005-04-16 15:20:36 -07001095 clear_bit(__LINK_STATE_START, &dev->state);
1096
1097 /* Synchronize to scheduled poll. We cannot touch poll list,
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001098 * it can be even on different cpu. So just clear netif_running().
1099 *
1100 * dev->stop() will invoke napi_disable() on all of it's
1101 * napi_struct instances on this device.
1102 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001103 smp_mb__after_clear_bit(); /* Commit netif_running(). */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001104
Matti Linnanvuorid8b2a4d2008-02-12 23:10:11 -08001105 dev_deactivate(dev);
1106
Linus Torvalds1da177e2005-04-16 15:20:36 -07001107 /*
1108 * Call the device specific close. This cannot fail.
1109 * Only if device is UP
1110 *
1111 * We allow it to be called even after a DETACH hot-plug
1112 * event.
1113 */
1114 if (dev->stop)
1115 dev->stop(dev);
1116
1117 /*
1118 * Device is now down.
1119 */
1120
1121 dev->flags &= ~IFF_UP;
1122
1123 /*
1124 * Tell people we are down
1125 */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07001126 call_netdevice_notifiers(NETDEV_DOWN, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001127
1128 return 0;
1129}
1130
1131
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001132/**
1133 * dev_disable_lro - disable Large Receive Offload on a device
1134 * @dev: device
1135 *
1136 * Disable Large Receive Offload (LRO) on a net device. Must be
1137 * called under RTNL. This is needed if received packets may be
1138 * forwarded to another interface.
1139 */
1140void dev_disable_lro(struct net_device *dev)
1141{
1142 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1143 dev->ethtool_ops->set_flags) {
1144 u32 flags = dev->ethtool_ops->get_flags(dev);
1145 if (flags & ETH_FLAG_LRO) {
1146 flags &= ~ETH_FLAG_LRO;
1147 dev->ethtool_ops->set_flags(dev, flags);
1148 }
1149 }
1150 WARN_ON(dev->features & NETIF_F_LRO);
1151}
1152EXPORT_SYMBOL(dev_disable_lro);
1153
1154
Eric W. Biederman881d9662007-09-17 11:56:21 -07001155static int dev_boot_phase = 1;
1156
Linus Torvalds1da177e2005-04-16 15:20:36 -07001157/*
1158 * Device change register/unregister. These are not inline or static
1159 * as we export them to the world.
1160 */
1161
1162/**
1163 * register_netdevice_notifier - register a network notifier block
1164 * @nb: notifier
1165 *
1166 * Register a notifier to be called when network device events occur.
1167 * The notifier passed is linked into the kernel structures and must
1168 * not be reused until it has been unregistered. A negative errno code
1169 * is returned on a failure.
1170 *
1171 * When registered all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001172 * to the new notifier to allow device to have a race free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001173 * view of the network device list.
1174 */
1175
1176int register_netdevice_notifier(struct notifier_block *nb)
1177{
1178 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001179 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001180 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001181 int err;
1182
1183 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001184 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001185 if (err)
1186 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001187 if (dev_boot_phase)
1188 goto unlock;
1189 for_each_net(net) {
1190 for_each_netdev(net, dev) {
1191 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1192 err = notifier_to_errno(err);
1193 if (err)
1194 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001195
Eric W. Biederman881d9662007-09-17 11:56:21 -07001196 if (!(dev->flags & IFF_UP))
1197 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001198
Eric W. Biederman881d9662007-09-17 11:56:21 -07001199 nb->notifier_call(nb, NETDEV_UP, dev);
1200 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001201 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001202
1203unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001204 rtnl_unlock();
1205 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001206
1207rollback:
1208 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001209 for_each_net(net) {
1210 for_each_netdev(net, dev) {
1211 if (dev == last)
1212 break;
Herbert Xufcc5a032007-07-30 17:03:38 -07001213
Eric W. Biederman881d9662007-09-17 11:56:21 -07001214 if (dev->flags & IFF_UP) {
1215 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1216 nb->notifier_call(nb, NETDEV_DOWN, dev);
1217 }
1218 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001219 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001220 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001221
1222 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001223 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001224}
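
/*
 * Registration sketch for the notifier machinery above (the names
 * example_event and example_nb are hypothetical):
 *
 *	static int example_event(struct notifier_block *nb,
 *				 unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_DEBUG "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_event,
 *	};
 *
 *	register_netdevice_notifier(&example_nb);
 *
 * Because of the replay described above, example_event() also sees a
 * NETDEV_REGISTER (and NETDEV_UP where applicable) for every device
 * that already exists at registration time.
 */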
1225
1226/**
1227 * unregister_netdevice_notifier - unregister a network notifier block
1228 * @nb: notifier
1229 *
1230 * Unregister a notifier previously registered by
1231 * register_netdevice_notifier(). The notifier is unlinked into the
1232 * kernel structures and may then be reused. A negative errno code
1233 * is returned on a failure.
1234 */
1235
1236int unregister_netdevice_notifier(struct notifier_block *nb)
1237{
Herbert Xu9f514952006-03-25 01:24:25 -08001238 int err;
1239
1240 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001241 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xu9f514952006-03-25 01:24:25 -08001242 rtnl_unlock();
1243 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001244}
1245
1246/**
1247 * call_netdevice_notifiers - call all network notifier blocks
1248 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001249 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001250 *
1251 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001252 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001253 */
1254
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001255int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001256{
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001257 return raw_notifier_call_chain(&netdev_chain, val, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001258}
1259
1260/* When > 0 there are consumers of rx skb time stamps */
1261static atomic_t netstamp_needed = ATOMIC_INIT(0);
1262
1263void net_enable_timestamp(void)
1264{
1265 atomic_inc(&netstamp_needed);
1266}
1267
1268void net_disable_timestamp(void)
1269{
1270 atomic_dec(&netstamp_needed);
1271}
1272
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001273static inline void net_timestamp(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001274{
1275 if (atomic_read(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001276 __net_timestamp(skb);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001277 else
1278 skb->tstamp.tv64 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001279}
1280
1281/*
1282 * Support routine. Sends outgoing frames to any network
1283 * taps currently in use.
1284 */
1285
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001286static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001287{
1288 struct packet_type *ptype;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001289
1290 net_timestamp(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001291
1292 rcu_read_lock();
1293 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1294 /* Never send packets back to the socket
1295 * they originated from - MvS (miquels@drinkel.ow.org)
1296 */
1297 if ((ptype->dev == dev || !ptype->dev) &&
1298 (ptype->af_packet_priv == NULL ||
1299 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1300 struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1301 if (!skb2)
1302 break;
1303
1304 /* skb->nh should be correctly
1305 set by sender, so that the second statement is
1306 just protection against buggy protocols.
1307 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001308 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001309
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001310 if (skb_network_header(skb2) < skb2->data ||
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001311 skb2->network_header > skb2->tail) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001312 if (net_ratelimit())
1313 printk(KERN_CRIT "protocol %04x is "
1314 "buggy, dev %s\n",
1315 skb2->protocol, dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001316 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001317 }
1318
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001319 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001320 skb2->pkt_type = PACKET_OUTGOING;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07001321 ptype->func(skb2, skb->dev, ptype, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001322 }
1323 }
1324 rcu_read_unlock();
1325}
1326
Denis Vlasenko56079432006-03-29 15:57:29 -08001327
David S. Miller37437bb2008-07-16 02:15:04 -07001328void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08001329{
Linus Torvalds867d79f2008-07-21 09:54:18 -07001330 if (WARN_ON_ONCE(q == &noop_qdisc))
1331 return;
David S. Miller86d804e2008-07-08 23:11:25 -07001332
David S. Miller37437bb2008-07-16 02:15:04 -07001333 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) {
Denis Vlasenko56079432006-03-29 15:57:29 -08001334 struct softnet_data *sd;
David S. Miller86d804e2008-07-08 23:11:25 -07001335 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08001336
1337 local_irq_save(flags);
1338 sd = &__get_cpu_var(softnet_data);
David S. Miller37437bb2008-07-16 02:15:04 -07001339 q->next_sched = sd->output_queue;
1340 sd->output_queue = q;
Denis Vlasenko56079432006-03-29 15:57:29 -08001341 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1342 local_irq_restore(flags);
1343 }
1344}
1345EXPORT_SYMBOL(__netif_schedule);
1346
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001347void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08001348{
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001349 if (atomic_dec_and_test(&skb->users)) {
1350 struct softnet_data *sd;
1351 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08001352
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001353 local_irq_save(flags);
1354 sd = &__get_cpu_var(softnet_data);
1355 skb->next = sd->completion_queue;
1356 sd->completion_queue = skb;
1357 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1358 local_irq_restore(flags);
1359 }
Denis Vlasenko56079432006-03-29 15:57:29 -08001360}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001361EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08001362
1363void dev_kfree_skb_any(struct sk_buff *skb)
1364{
1365 if (in_irq() || irqs_disabled())
1366 dev_kfree_skb_irq(skb);
1367 else
1368 dev_kfree_skb(skb);
1369}
1370EXPORT_SYMBOL(dev_kfree_skb_any);
1371
1372
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001373/**
1374 * netif_device_detach - mark device as removed
1375 * @dev: network device
1376 *
1377 * Mark device as removed from system and therefore no longer available.
1378 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001379void netif_device_detach(struct net_device *dev)
1380{
1381 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1382 netif_running(dev)) {
1383 netif_stop_queue(dev);
1384 }
1385}
1386EXPORT_SYMBOL(netif_device_detach);
1387
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001388/**
1389 * netif_device_attach - mark device as attached
1390 * @dev: network device
1391 *
1392 * Mark device as attached from system and restart if needed.
1393 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001394void netif_device_attach(struct net_device *dev)
1395{
1396 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1397 netif_running(dev)) {
1398 netif_wake_queue(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001399 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001400 }
1401}
1402EXPORT_SYMBOL(netif_device_attach);

static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
	return ((features & NETIF_F_GEN_CSUM) ||
		((features & NETIF_F_IP_CSUM) &&
		 protocol == htons(ETH_P_IP)) ||
		((features & NETIF_F_IPV6_CSUM) &&
		 protocol == htons(ETH_P_IPV6)));
}

static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
	if (can_checksum_protocol(dev->features, skb->protocol))
		return true;

	if (skb->protocol == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
		if (can_checksum_protocol(dev->features & dev->vlan_features,
					  veh->h_vlan_encapsulated_proto))
			return true;
	}

	return false;
}

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
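
/*
 * Worked example of the final fold above (illustrative, not from the
 * original file): csum_fold() reduces the 32-bit accumulator to the
 * 16-bit one's-complement checksum.  For csum = 0x1a2b3c4d:
 * 0x1a2b + 0x3c4d = 0x5678 (no carry), and ~0x5678 = 0xa987 is the
 * value written at skb->data + offset.
 */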

/**
 * skb_gso_segment - Perform segmentation on skb.
 * @skb: buffer to segment
 * @features: features for the output path (see dev->features)
 *
 * This function segments the given skb and returns a list of segments.
 *
 * It may return NULL if the skb requires no segmentation.  This is
 * only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	BUG_ON(skb_shinfo(skb)->frag_list);

	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}

EXPORT_SYMBOL(skb_gso_segment);
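
/*
 * Example (an illustrative sketch): consuming the list that
 * skb_gso_segment() returns.  This mirrors what dev_gso_segment()
 * and dev_hard_start_xmit() do below; the "foo_xmit_one" transmit
 * callback is hypothetical.
 */
#if 0
static int foo_xmit_gso(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *segs = skb_gso_segment(skb, dev->features);

	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)				/* header verification only */
		return foo_xmit_one(skb, dev);

	while (segs) {
		struct sk_buff *nskb = segs;

		segs = segs->next;
		nskb->next = NULL;
		foo_xmit_one(nskb, dev);	/* transmit each segment */
	}

	kfree_skb(skb);		/* the original skb has been replaced */
	return 0;
}
#endif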

/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		printk(KERN_ERR "%s: hw csum failure.\n",
			dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/* Actually, we should eliminate this check as soon as we know that:
 * 1. An IOMMU is present and allows mapping of all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (dev->features & NETIF_F_HIGHDMA)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
			return 1;

#endif
	return 0;
}

struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)

static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}

/**
 * dev_gso_segment - Perform emulated hardware segmentation on skb.
 * @skb: buffer to segment
 *
 * This function segments the given skb and stores the list of segments
 * in skb->next.
 */
static int dev_gso_segment(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *segs;
	int features = dev->features & ~(illegal_highdma(dev, skb) ?
					 NETIF_F_SG : 0);

	segs = skb_gso_segment(skb, features);

	/* Verifying header integrity only. */
	if (!segs)
		return 0;

	if (IS_ERR(segs))
		return PTR_ERR(segs);

	skb->next = segs;
	DEV_GSO_CB(skb)->destructor = skb->destructor;
	skb->destructor = dev_gso_skb_destructor;

	return 0;
}
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		}

		return dev->hard_start_xmit(skb, dev);
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;
		int rc;

		skb->next = nskb->next;
		nskb->next = NULL;
		rc = dev->hard_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return 0;
}
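
/*
 * Example (an illustrative sketch): the driver side of the
 * hard_start_xmit contract used above.  The "foo_*" helpers are
 * hypothetical.  The method returns NETDEV_TX_OK once it owns the
 * skb, or NETDEV_TX_BUSY to ask the core to requeue; it stops its
 * queue when the ring is full so it will not be called again until
 * the queue is woken.
 */
#if 0
static int foo_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct foo_priv *priv = netdev_priv(dev);

	if (foo_tx_ring_full(priv)) {
		netif_stop_queue(dev);
		return NETDEV_TX_BUSY;	/* core will requeue the skb */
	}

	foo_post_to_tx_ring(priv, skb);	/* skb now owned by the driver */
	dev->trans_start = jiffies;

	if (foo_tx_ring_full(priv))
		netif_stop_queue(dev);
	return NETDEV_TX_OK;
}
#endif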

/**
 * dev_queue_xmit - transmit a buffer
 * @skb: buffer to transmit
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * A negative errno code is returned on a failure. A success does not
 * guarantee the frame will be transmitted as it may be dropped due
 * to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */

static u32 simple_tx_hashrnd;
static int simple_tx_hashrnd_initialized = 0;

static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
{
	u32 addr1, addr2, ports;
	u32 hash, ihl;
	u8 ip_proto;

	if (unlikely(!simple_tx_hashrnd_initialized)) {
		get_random_bytes(&simple_tx_hashrnd, 4);
		simple_tx_hashrnd_initialized = 1;
	}

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
		ip_proto = ip_hdr(skb)->protocol;
		addr1 = ip_hdr(skb)->saddr;
		addr2 = ip_hdr(skb)->daddr;
		ihl = ip_hdr(skb)->ihl;
		break;
	case __constant_htons(ETH_P_IPV6):
		ip_proto = ipv6_hdr(skb)->nexthdr;
		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
		ihl = (40 >> 2);
		break;
	default:
		return 0;
	}


	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_AH:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
		ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
		break;

	default:
		ports = 0;
		break;
	}

	hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);

	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
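
/*
 * Worked example of the queue mapping above (illustrative): the
 * multiply-shift maps a 32-bit hash uniformly onto
 * [0, real_num_tx_queues).  With 4 TX queues, hash 0x80000000 gives
 * ((u64)0x80000000 * 4) >> 32 = 2, i.e. queue 2, while hash
 * 0xffffffff gives queue 3.
 */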

static struct netdev_queue *dev_pick_tx(struct net_device *dev,
					struct sk_buff *skb)
{
	u16 queue_index = 0;

	if (dev->select_queue)
		queue_index = dev->select_queue(dev, skb);
	else if (dev->real_num_tx_queues > 1)
		queue_index = simple_tx_hash(dev, skb);

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	if (skb_shinfo(skb)->frag_list &&
	    !(dev->features & NETIF_F_FRAGLIST) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* Fragmented skb is linearized if device does not support SG,
	 * or if at least one of fragments is in highmem and device
	 * does not support DMA from it.
	 */
	if (skb_shinfo(skb)->nr_frags &&
	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	if (q->enqueue) {
		spinlock_t *root_lock = qdisc_root_lock(q);

		spin_lock(root_lock);

		rc = qdisc_enqueue_root(skb, q);
		qdisc_run(q);

		spin_unlock(root_lock);

		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (e.g. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible that they rely on the protection
	   made by us here.

	   Check this and take the lock.  It is not prone to deadlocks.
	   Or shoot the noqueue qdisc; that is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = 0;
				if (!dev_hard_start_xmit(skb, dev, txq)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
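
/*
 * Example (an illustrative sketch): transmitting a raw frame through
 * dev_queue_xmit() from kernel code.  The protocol value and dummy
 * payload are placeholders chosen only for illustration.  Note that
 * the skb is consumed whatever the return value is.
 */
#if 0
static int foo_send_pad_frame(struct net_device *dev)
{
	struct sk_buff *skb;

	skb = alloc_skb(ETH_ZLEN + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memset(skb_put(skb, ETH_ZLEN), 0, ETH_ZLEN);	/* dummy payload */
	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_3);

	return dev_queue_xmit(skb);	/* may return positive qdisc codes */
}
#endif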


/*=======================================================================
			Receiver routines
  =======================================================================*/

int netdev_max_backlog __read_mostly = 1000;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;		/* old backlog weight */

DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };


/**
 * netif_rx - post buffer to the network code
 * @skb: buffer to post
 *
 * This function receives a packet from a device driver and queues it for
 * the upper (protocol) levels to process.  It always succeeds.  The buffer
 * may be dropped during processing for congestion control or by the
 * protocol layers.
 *
 * Return values:
 * NET_RX_SUCCESS	(no congestion)
 * NET_RX_DROP		(packet was dropped)
 */

int netif_rx(struct sk_buff *skb)
{
	struct softnet_data *queue;
	unsigned long flags;

	/* if netpoll wants it, pretend we never saw it */
	if (netpoll_rx(skb))
		return NET_RX_DROP;

	if (!skb->tstamp.tv64)
		net_timestamp(skb);

	/*
	 * The code is rearranged so that the path is the shortest when
	 * the CPU is congested but still operating.
	 */
	local_irq_save(flags);
	queue = &__get_cpu_var(softnet_data);

	__get_cpu_var(netdev_rx_stat).total++;
	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
		if (queue->input_pkt_queue.qlen) {
enqueue:
			dev_hold(skb->dev);
			__skb_queue_tail(&queue->input_pkt_queue, skb);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		napi_schedule(&queue->backlog);
		goto enqueue;
	}

	__get_cpu_var(netdev_rx_stat).dropped++;
	local_irq_restore(flags);

	kfree_skb(skb);
	return NET_RX_DROP;
}

int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	preempt_disable();
	err = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}

EXPORT_SYMBOL(netif_rx_ni);
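
/*
 * Example (an illustrative sketch): the classic, non-NAPI receive
 * path of a hypothetical driver.  The interrupt handler pulls a
 * frame off the hardware, wraps it in an skb and hands it to
 * netif_rx(); netif_rx_ni() is the variant for process-context
 * callers.  The "foo_*" helpers are invented.
 */
#if 0
static irqreturn_t foo_interrupt(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	int len = foo_rx_frame_len(dev);
	struct sk_buff *skb = dev_alloc_skb(len + NET_IP_ALIGN);

	if (!skb)
		return IRQ_HANDLED;		/* frame is dropped */

	skb_reserve(skb, NET_IP_ALIGN);		/* align the IP header */
	foo_copy_rx_frame(dev, skb_put(skb, len));
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);				/* queue to the backlog */
	return IRQ_HANDLED;
}
#endif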

static inline struct net_device *skb_bond(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;

	if (dev->master) {
		if (skb_bond_should_drop(skb)) {
			kfree_skb(skb);
			return NULL;
		}
		skb->dev = dev->master;
	}

	return dev;
}


static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			BUG_TRAP(!atomic_read(&skb->users));
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			smp_mb__before_clear_bit();
			clear_bit(__QDISC_STATE_SCHED, &q->state);

			root_lock = qdisc_root_lock(q);
			if (spin_trylock(root_lock)) {
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				__netif_schedule(q);
			}
		}
	}
}

static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
/* These hooks are defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * If the bridge module is loaded, call the bridging hook.
 * Returns NULL if the packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev,
					    int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK ||
	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif

#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	if (skb->dev->macvlan_port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}
	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif

#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for a useless compare
 * and two extra stores if we don't have it on but do have
 * CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't remove any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 */
static int ing_filter(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	u32 ttl = G_TC_RTTL(skb->tc_verd);
	struct netdev_queue *rxq;
	int result = TC_ACT_OK;
	struct Qdisc *q;

	if (MAX_RED_LOOP < ttl++) {
		printk(KERN_WARNING
		       "Redir loop detected, dropping packet (%d->%d)\n",
		       skb->iif, dev->ifindex);
		return TC_ACT_SHOT;
	}

	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);

	rxq = &dev->rx_queue;

	q = rxq->qdisc;
	if (q) {
		spin_lock(qdisc_lock(q));
		result = qdisc_enqueue_root(skb, q);
		spin_unlock(qdisc_lock(q));
	}

	return result;
}

static inline struct sk_buff *handle_ing(struct sk_buff *skb,
					 struct packet_type **pt_prev,
					 int *ret, struct net_device *orig_dev)
{
	if (!skb->dev->rx_queue.qdisc)
		goto out;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	} else {
		/* Huh? Why does turning on AF_PACKET affect this? */
		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
	}

	switch (ing_filter(skb)) {
	case TC_ACT_SHOT:
	case TC_ACT_STOLEN:
		kfree_skb(skb);
		return NULL;
	}

out:
	skb->tc_verd = 0;
	return skb;
}
#endif
2129
Patrick McHardybc1d0412008-07-14 22:49:30 -07002130/*
2131 * netif_nit_deliver - deliver received packets to network taps
2132 * @skb: buffer
2133 *
2134 * This function is used to deliver incoming packets to network
2135 * taps. It should be used when the normal netif_receive_skb path
2136 * is bypassed, for example because of VLAN acceleration.
2137 */
2138void netif_nit_deliver(struct sk_buff *skb)
2139{
2140 struct packet_type *ptype;
2141
2142 if (list_empty(&ptype_all))
2143 return;
2144
2145 skb_reset_network_header(skb);
2146 skb_reset_transport_header(skb);
2147 skb->mac_len = skb->network_header - skb->mac_header;
2148
2149 rcu_read_lock();
2150 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2151 if (!ptype->dev || ptype->dev == skb->dev)
2152 deliver_skb(skb, ptype, skb->dev);
2153 }
2154 rcu_read_unlock();
2155}

/**
 * netif_receive_skb - process receive buffer from network
 * @skb: buffer to process
 *
 * netif_receive_skb() is the main receive data processing function.
 * It always succeeds. The buffer may be dropped during processing
 * for congestion control or by the protocol layers.
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 *
 * Return values (usually ignored):
 * NET_RX_SUCCESS: no congestion
 * NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	int ret = NET_RX_DROP;
	__be16 type;

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;

	if (!skb->tstamp.tv64)
		net_timestamp(skb);

	if (!skb->iif)
		skb->iif = skb->dev->ifindex;

	orig_dev = skb_bond(skb);

	if (!orig_dev)
		return NET_RX_DROP;

	__get_cpu_var(netdev_rx_stat).total++;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;

	pt_prev = NULL;

	rcu_read_lock();

	/* Don't receive packets in an exiting network namespace */
	if (!net_alive(dev_net(skb->dev)))
		goto out;

#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;

	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (!ptype->dev || ptype->dev == skb->dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
		kfree_skb(skb);
		/* Jamal, now you will not be able to escape explaining
		 * to me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	rcu_read_unlock();
	return ret;
}

static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *queue = &__get_cpu_var(softnet_data);
	unsigned long start_time = jiffies;

	napi->weight = weight_p;
	do {
		struct sk_buff *skb;
		struct net_device *dev;

		local_irq_disable();
		skb = __skb_dequeue(&queue->input_pkt_queue);
		if (!skb) {
			__napi_complete(napi);
			local_irq_enable();
			break;
		}

		local_irq_enable();

		dev = skb->dev;

		netif_receive_skb(skb);

		dev_put(dev);
	} while (++work < quota && jiffies == start_time);

	return work;
}

/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run.
 */
void __napi_schedule(struct napi_struct *n)
{
	unsigned long flags;

	local_irq_save(flags);
	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);
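
/*
 * Example (an illustrative sketch): how a hypothetical NAPI driver
 * cooperates with the scheduling above.  The IRQ handler masks
 * device interrupts and calls napi_schedule(), which lands in
 * __napi_schedule(); the poll routine feeds packets to
 * netif_receive_skb() and completes when it uses less than its
 * budget.  The "foo_*" helpers are invented; netif_rx_complete()
 * is assumed to be the completion helper of this kernel generation.
 */
#if 0
static irqreturn_t foo_interrupt(int irq, void *dev_id)
{
	struct foo_priv *priv = dev_id;

	foo_disable_rx_irq(priv);
	napi_schedule(&priv->napi);	/* ends up in __napi_schedule() */
	return IRQ_HANDLED;
}

static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
	int work = 0;

	while (work < budget && foo_rx_pending(priv)) {
		struct sk_buff *skb = foo_rx_skb(priv);

		netif_receive_skb(skb);
		work++;
	}

	if (work < budget) {		/* done: re-enable interrupts */
		netif_rx_complete(priv->dev, napi);
		foo_enable_rx_irq(priv);
	}
	return work;
}
#endif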


static void net_rx_action(struct softirq_action *h)
{
	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
	unsigned long start_time = jiffies;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(list)) {
		struct napi_struct *n;
		int work, weight;

		/* If the softirq window is exhausted then punt.
		 *
		 * Note that this is a slight policy change from the
		 * previous NAPI code, which would allow up to 2
		 * jiffies to pass before breaking out.  The test
		 * used to be "jiffies - start_time > 1".
		 */
		if (unlikely(budget <= 0 || jiffies != start_time))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_entry(list->next, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state))
			work = n->poll(n, weight);

		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n)))
				__napi_complete(n);
			else
				list_move_tail(&n->poll_list, list);
		}

		netpoll_poll_unlock(have);
	}
out:
	local_irq_enable();

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	if (!cpus_empty(net_dma.channel_mask)) {
		int chan_idx;
		for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
			struct dma_chan *chan = net_dma.channels[chan_idx];
			if (chan)
				dma_async_memcpy_issue_pending(chan);
		}
	}
#endif

	return;

softnet_break:
	__get_cpu_var(netdev_rx_stat).time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}

static gifconf_func_t *gifconf_list[NPROTO];

/**
 * register_gifconf - register a SIOCGIF handler
 * @family: Address family
 * @gifconf: Function handler
 *
 * Register protocol dependent address dumping routines. The handler
 * that is passed must not be freed or reused until it has been replaced
 * by another handler.
 */
int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
{
	if (family >= NPROTO)
		return -EINVAL;
	gifconf_list[family] = gifconf;
	return 0;
}


/*
 *	Map an interface index to its name (SIOCGIFNAME)
 */

/*
 *	We need this ioctl for efficient implementation of the
 *	if_indextoname() function required by the IPv6 API.  Without
 *	it, we would have to search all the interfaces to find a
 *	match.  --pb
 */

static int dev_ifname(struct net *net, struct ifreq __user *arg)
{
	struct net_device *dev;
	struct ifreq ifr;

	/*
	 *	Fetch the caller's info block.
	 */

	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
		return -EFAULT;

	read_lock(&dev_base_lock);
	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
	if (!dev) {
		read_unlock(&dev_base_lock);
		return -ENODEV;
	}

	strcpy(ifr.ifr_name, dev->name);
	read_unlock(&dev_base_lock);

	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
		return -EFAULT;
	return 0;
}
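
/*
 * Example (an illustrative sketch): the matching userspace side of
 * SIOCGIFNAME, written as an ordinary C program rather than kernel
 * code, hence guarded out of the build here.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int print_ifname(int ifindex)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_ifindex = ifindex;
	if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
		printf("index %d is %s\n", ifindex, ifr.ifr_name);
	close(fd);
	return 0;
}
#endif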

/*
 *	Perform a SIOCGIFCONF call. This structure will change
 *	size eventually, and there is nothing I can do about it.
 *	Thus we will need a 'compatibility mode'.
 */

static int dev_ifconf(struct net *net, char __user *arg)
{
	struct ifconf ifc;
	struct net_device *dev;
	char __user *pos;
	int len;
	int total;
	int i;

	/*
	 *	Fetch the caller's info block.
	 */

	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
		return -EFAULT;

	pos = ifc.ifc_buf;
	len = ifc.ifc_len;

	/*
	 *	Loop over the interfaces, and write an info block for each.
	 */

	total = 0;
	for_each_netdev(net, dev) {
		for (i = 0; i < NPROTO; i++) {
			if (gifconf_list[i]) {
				int done;
				if (!pos)
					done = gifconf_list[i](dev, NULL, 0);
				else
					done = gifconf_list[i](dev, pos + total,
							       len - total);
				if (done < 0)
					return -EFAULT;
				total += done;
			}
		}
	}

	/*
	 *	All done.  Write the updated control block back to the caller.
	 */
	ifc.ifc_len = total;

	/*
	 *	Both BSD and Solaris return 0 here, so we do too.
	 */
	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
}
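
/*
 * Example (an illustrative sketch): a userspace SIOCGIFCONF caller,
 * ordinary C rather than kernel code.  Passing ifc_buf = NULL first
 * asks the kernel only for the required length, matching the !pos
 * branch in dev_ifconf() above.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int list_interfaces(void)
{
	struct ifconf ifc;
	struct ifreq *ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int i, n;

	if (fd < 0)
		return -1;

	memset(&ifc, 0, sizeof(ifc));
	ifc.ifc_buf = NULL;
	if (ioctl(fd, SIOCGIFCONF, &ifc) < 0)	/* size probe */
		goto err;
	ifc.ifc_buf = malloc(ifc.ifc_len);
	if (!ifc.ifc_buf || ioctl(fd, SIOCGIFCONF, &ifc) < 0)
		goto err;

	ifr = ifc.ifc_req;
	n = ifc.ifc_len / sizeof(struct ifreq);
	for (i = 0; i < n; i++)
		printf("%s\n", ifr[i].ifr_name);

	free(ifc.ifc_buf);
	close(fd);
	return 0;
err:
	free(ifc.ifc_buf);
	close(fd);
	return -1;
}
#endif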

#ifdef CONFIG_PROC_FS
/*
 *	This is invoked by the /proc filesystem handler to display a device
 *	in detail.
 */
void *dev_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(dev_base_lock)
{
	struct net *net = seq_file_net(seq);
	loff_t off;
	struct net_device *dev;

	read_lock(&dev_base_lock);
	if (!*pos)
		return SEQ_START_TOKEN;

	off = 1;
	for_each_netdev(net, dev)
		if (off++ == *pos)
			return dev;

	return NULL;
}

void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	++*pos;
	return v == SEQ_START_TOKEN ?
		first_net_device(net) : next_net_device((struct net_device *)v);
}

void dev_seq_stop(struct seq_file *seq, void *v)
	__releases(dev_base_lock)
{
	read_unlock(&dev_base_lock);
}

static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
{
	struct net_device_stats *stats = dev->get_stats(dev);

	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
		   dev->name, stats->rx_bytes, stats->rx_packets,
		   stats->rx_errors,
		   stats->rx_dropped + stats->rx_missed_errors,
		   stats->rx_fifo_errors,
		   stats->rx_length_errors + stats->rx_over_errors +
		    stats->rx_crc_errors + stats->rx_frame_errors,
		   stats->rx_compressed, stats->multicast,
		   stats->tx_bytes, stats->tx_packets,
		   stats->tx_errors, stats->tx_dropped,
		   stats->tx_fifo_errors, stats->collisions,
		   stats->tx_carrier_errors +
		    stats->tx_aborted_errors +
		    stats->tx_window_errors +
		    stats->tx_heartbeat_errors,
		   stats->tx_compressed);
}

/*
 *	Called from the PROCfs module. This now uses the new arbitrary sized
 *	/proc/net interface to create /proc/net/dev
 */
static int dev_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Inter-|   Receive                            "
			      "                    |  Transmit\n"
			      " face |bytes    packets errs drop fifo frame "
			      "compressed multicast|bytes    packets errs "
			      "drop fifo colls carrier compressed\n");
	else
		dev_seq_printf_stats(seq, v);
	return 0;
}

static struct netif_rx_stats *softnet_get_online(loff_t *pos)
{
	struct netif_rx_stats *rc = NULL;

	while (*pos < nr_cpu_ids)
		if (cpu_online(*pos)) {
			rc = &per_cpu(netdev_rx_stat, *pos);
			break;
		} else
			++*pos;
	return rc;
}

static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
	return softnet_get_online(pos);
}

static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return softnet_get_online(pos);
}

static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}

static int softnet_seq_show(struct seq_file *seq, void *v)
{
	struct netif_rx_stats *s = v;

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
		   s->total, s->dropped, s->time_squeeze, 0,
		   0, 0, 0, 0, /* was fastroute */
		   s->cpu_collision);
	return 0;
}

static const struct seq_operations dev_seq_ops = {
	.start = dev_seq_start,
	.next  = dev_seq_next,
	.stop  = dev_seq_stop,
	.show  = dev_seq_show,
};

static int dev_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &dev_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations dev_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = dev_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};

static const struct seq_operations softnet_seq_ops = {
	.start = softnet_seq_start,
	.next  = softnet_seq_next,
	.stop  = softnet_seq_stop,
	.show  = softnet_seq_show,
};

static int softnet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &softnet_seq_ops);
}

static const struct file_operations softnet_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = softnet_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

static void *ptype_get_idx(loff_t pos)
{
	struct packet_type *pt = NULL;
	loff_t i = 0;
	int t;

	list_for_each_entry_rcu(pt, &ptype_all, list) {
		if (i == pos)
			return pt;
		++i;
	}

	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
			if (i == pos)
				return pt;
			++i;
		}
	}
	return NULL;
}

static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct packet_type *pt;
	struct list_head *nxt;
	int hash;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ptype_get_idx(0);

	pt = v;
	nxt = pt->list.next;
	if (pt->type == htons(ETH_P_ALL)) {
		if (nxt != &ptype_all)
			goto found;
		hash = 0;
		nxt = ptype_base[0].next;
	} else
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;

	while (nxt == &ptype_base[hash]) {
		if (++hash >= PTYPE_HASH_SIZE)
			return NULL;
		nxt = ptype_base[hash].next;
	}
found:
	return list_entry(nxt, struct packet_type, list);
}

static void ptype_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static void ptype_seq_decode(struct seq_file *seq, void *sym)
{
#ifdef CONFIG_KALLSYMS
	unsigned long offset = 0, symsize;
	const char *symname;
	char *modname;
	char namebuf[128];

	symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
				  &modname, namebuf);

	if (symname) {
		char *delim = ":";

		if (!modname)
			modname = delim = "";
		seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim,
			   symname, offset);
		return;
	}
#endif

	seq_printf(seq, "[%p]", sym);
}

static int ptype_seq_show(struct seq_file *seq, void *v)
{
	struct packet_type *pt = v;

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Type Device      Function\n");
	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
		if (pt->type == htons(ETH_P_ALL))
			seq_puts(seq, "ALL ");
		else
			seq_printf(seq, "%04x", ntohs(pt->type));

		seq_printf(seq, " %-8s ",
			   pt->dev ? pt->dev->name : "");
		ptype_seq_decode(seq, pt->func);
		seq_putc(seq, '\n');
	}

	return 0;
}

static const struct seq_operations ptype_seq_ops = {
	.start = ptype_seq_start,
	.next  = ptype_seq_next,
	.stop  = ptype_seq_stop,
	.show  = ptype_seq_show,
};

static int ptype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ptype_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations ptype_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = ptype_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static int __net_init dev_proc_net_init(struct net *net)
{
	int rc = -ENOMEM;

	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
		goto out;
	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
		goto out_dev;
	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
		goto out_softnet;

	if (wext_proc_init(net))
		goto out_ptype;
	rc = 0;
out:
	return rc;
out_ptype:
	proc_net_remove(net, "ptype");
out_softnet:
	proc_net_remove(net, "softnet_stat");
out_dev:
	proc_net_remove(net, "dev");
	goto out;
}

static void __net_exit dev_proc_net_exit(struct net *net)
{
	wext_proc_exit(net);

	proc_net_remove(net, "ptype");
	proc_net_remove(net, "softnet_stat");
	proc_net_remove(net, "dev");
}

static struct pernet_operations __net_initdata dev_proc_ops = {
	.init = dev_proc_net_init,
	.exit = dev_proc_net_exit,
};

static int __init dev_proc_init(void)
{
	return register_pernet_subsys(&dev_proc_ops);
}
#else
#define dev_proc_init() 0
#endif	/* CONFIG_PROC_FS */
2855
2856
2857/**
2858 * netdev_set_master - set up master/slave pair
2859 * @slave: slave device
2860 * @master: new master device
2861 *
2862 * Changes the master device of the slave. Pass %NULL to break the
2863 * bonding. The caller must hold the RTNL semaphore. On a failure
2864 * a negative errno code is returned. On success the reference counts
2865 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2866 * function returns zero.
2867 */
2868int netdev_set_master(struct net_device *slave, struct net_device *master)
2869{
2870 struct net_device *old = slave->master;
2871
2872 ASSERT_RTNL();
2873
2874 if (master) {
2875 if (old)
2876 return -EBUSY;
2877 dev_hold(master);
2878 }
2879
2880 slave->master = master;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002881
Linus Torvalds1da177e2005-04-16 15:20:36 -07002882 synchronize_net();
2883
2884 if (old)
2885 dev_put(old);
2886
2887 if (master)
2888 slave->flags |= IFF_SLAVE;
2889 else
2890 slave->flags &= ~IFF_SLAVE;
2891
2892 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2893 return 0;
2894}
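/*
 * Illustrative sketch (not part of the original file): how a bonding-style
 * driver might use netdev_set_master(). The helper name is hypothetical;
 * the RTNL locking is the documented requirement above.
 */
static int example_enslave(struct net_device *bond, struct net_device *slave)
{
	int err;

	rtnl_lock();
	err = netdev_set_master(slave, bond);	/* pass NULL later to release */
	rtnl_unlock();
	return err;
}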
2895
Wang Chendad9b332008-06-18 01:48:28 -07002896static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07002897{
2898 unsigned short old_flags = dev->flags;
2899
Patrick McHardy24023452007-07-14 18:51:31 -07002900 ASSERT_RTNL();
2901
Wang Chendad9b332008-06-18 01:48:28 -07002902 dev->flags |= IFF_PROMISC;
2903 dev->promiscuity += inc;
2904 if (dev->promiscuity == 0) {
2905 /*
2906 * Avoid overflow.
 2907 		 * If inc would overflow the counter, leave IFF_PROMISC untouched and return an error.
2908 */
2909 if (inc < 0)
2910 dev->flags &= ~IFF_PROMISC;
2911 else {
2912 dev->promiscuity -= inc;
 2913 			printk(KERN_WARNING "%s: promiscuity counter overflowed, "
 2914 				"set promiscuity failed; promiscuous mode on "
 2915 				"this device may be unreliable.\n", dev->name);
2916 return -EOVERFLOW;
2917 }
2918 }
Patrick McHardy4417da62007-06-27 01:28:10 -07002919 if (dev->flags != old_flags) {
2920 printk(KERN_INFO "device %s %s promiscuous mode\n",
2921 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2922 "left");
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05002923 if (audit_enabled)
2924 audit_log(current->audit_context, GFP_ATOMIC,
2925 AUDIT_ANOM_PROMISCUOUS,
2926 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
2927 dev->name, (dev->flags & IFF_PROMISC),
2928 (old_flags & IFF_PROMISC),
2929 audit_get_loginuid(current),
2930 current->uid, current->gid,
2931 audit_get_sessionid(current));
Patrick McHardy24023452007-07-14 18:51:31 -07002932
2933 if (dev->change_rx_flags)
2934 dev->change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07002935 }
Wang Chendad9b332008-06-18 01:48:28 -07002936 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07002937}
2938
Linus Torvalds1da177e2005-04-16 15:20:36 -07002939/**
2940 * dev_set_promiscuity - update promiscuity count on a device
2941 * @dev: device
2942 * @inc: modifier
2943 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07002944 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07002945 * remains above zero the interface remains promiscuous. Once it hits zero
 2946 * the device reverts to normal filtering operation. A negative @inc
2947 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07002948 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002949 */
Wang Chendad9b332008-06-18 01:48:28 -07002950int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002951{
2952 unsigned short old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07002953 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002954
Wang Chendad9b332008-06-18 01:48:28 -07002955 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07002956 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07002957 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07002958 if (dev->flags != old_flags)
2959 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07002960 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002961}
2962
2963/**
2964 * dev_set_allmulti - update allmulti count on a device
2965 * @dev: device
2966 * @inc: modifier
2967 *
2968 * Add or remove reception of all multicast frames to a device. While the
2969 * count in the device remains above zero the interface remains listening
 2970 * to all multicast frames. Once it hits zero the device reverts to normal
2971 * filtering operation. A negative @inc value is used to drop the counter
2972 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07002973 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002974 */
2975
Wang Chendad9b332008-06-18 01:48:28 -07002976int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002977{
2978 unsigned short old_flags = dev->flags;
2979
Patrick McHardy24023452007-07-14 18:51:31 -07002980 ASSERT_RTNL();
2981
Linus Torvalds1da177e2005-04-16 15:20:36 -07002982 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07002983 dev->allmulti += inc;
2984 if (dev->allmulti == 0) {
2985 /*
2986 * Avoid overflow.
 2987 		 * If inc would overflow the counter, leave IFF_ALLMULTI untouched and return an error.
2988 */
2989 if (inc < 0)
2990 dev->flags &= ~IFF_ALLMULTI;
2991 else {
2992 dev->allmulti -= inc;
 2993 			printk(KERN_WARNING "%s: allmulti counter overflowed, "
 2994 				"set allmulti failed; all-multicast mode on "
 2995 				"this device may be unreliable.\n", dev->name);
2996 return -EOVERFLOW;
2997 }
2998 }
Patrick McHardy24023452007-07-14 18:51:31 -07002999 if (dev->flags ^ old_flags) {
3000 if (dev->change_rx_flags)
3001 dev->change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07003002 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07003003 }
Wang Chendad9b332008-06-18 01:48:28 -07003004 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07003005}
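/*
 * Illustrative sketch (hypothetical helper): taking both counters while
 * honouring the error returns introduced above. Both calls require the
 * RTNL, and each +1 must eventually be balanced by a -1.
 */
static int example_enable_rx_all(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);
	if (!err) {
		err = dev_set_allmulti(dev, 1);
		if (err)
			dev_set_promiscuity(dev, -1);	/* roll back on failure */
	}
	rtnl_unlock();
	return err;
}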
3006
3007/*
3008 * Upload unicast and multicast address lists to device and
3009 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08003010 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07003011 * are present.
3012 */
3013void __dev_set_rx_mode(struct net_device *dev)
3014{
3015 /* dev_open will call this function so the list will stay sane. */
3016 if (!(dev->flags&IFF_UP))
3017 return;
3018
3019 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09003020 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07003021
3022 if (dev->set_rx_mode)
3023 dev->set_rx_mode(dev);
3024 else {
 3025 		/* Unicast address changes may only happen under the rtnl,
3026 * therefore calling __dev_set_promiscuity here is safe.
3027 */
3028 if (dev->uc_count > 0 && !dev->uc_promisc) {
3029 __dev_set_promiscuity(dev, 1);
3030 dev->uc_promisc = 1;
3031 } else if (dev->uc_count == 0 && dev->uc_promisc) {
3032 __dev_set_promiscuity(dev, -1);
3033 dev->uc_promisc = 0;
3034 }
3035
3036 if (dev->set_multicast_list)
3037 dev->set_multicast_list(dev);
3038 }
3039}
3040
3041void dev_set_rx_mode(struct net_device *dev)
3042{
David S. Millerb9e40852008-07-15 00:15:08 -07003043 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07003044 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07003045 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003046}
3047
Patrick McHardy61cbc2f2007-06-30 13:35:52 -07003048int __dev_addr_delete(struct dev_addr_list **list, int *count,
3049 void *addr, int alen, int glbl)
Patrick McHardybf742482007-06-27 01:26:19 -07003050{
3051 struct dev_addr_list *da;
3052
3053 for (; (da = *list) != NULL; list = &da->next) {
3054 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3055 alen == da->da_addrlen) {
3056 if (glbl) {
3057 int old_glbl = da->da_gusers;
3058 da->da_gusers = 0;
3059 if (old_glbl == 0)
3060 break;
3061 }
3062 if (--da->da_users)
3063 return 0;
3064
3065 *list = da->next;
3066 kfree(da);
Patrick McHardy61cbc2f2007-06-30 13:35:52 -07003067 (*count)--;
Patrick McHardybf742482007-06-27 01:26:19 -07003068 return 0;
3069 }
3070 }
3071 return -ENOENT;
3072}
3073
Patrick McHardy61cbc2f2007-06-30 13:35:52 -07003074int __dev_addr_add(struct dev_addr_list **list, int *count,
3075 void *addr, int alen, int glbl)
Patrick McHardybf742482007-06-27 01:26:19 -07003076{
3077 struct dev_addr_list *da;
3078
3079 for (da = *list; da != NULL; da = da->next) {
3080 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3081 da->da_addrlen == alen) {
3082 if (glbl) {
3083 int old_glbl = da->da_gusers;
3084 da->da_gusers = 1;
3085 if (old_glbl)
3086 return 0;
3087 }
3088 da->da_users++;
3089 return 0;
3090 }
3091 }
3092
Jorge Boncompte [DTI2]12aa3432008-02-19 14:17:04 -08003093 da = kzalloc(sizeof(*da), GFP_ATOMIC);
Patrick McHardybf742482007-06-27 01:26:19 -07003094 if (da == NULL)
3095 return -ENOMEM;
3096 memcpy(da->da_addr, addr, alen);
3097 da->da_addrlen = alen;
3098 da->da_users = 1;
3099 da->da_gusers = glbl ? 1 : 0;
3100 da->next = *list;
3101 *list = da;
Patrick McHardy61cbc2f2007-06-30 13:35:52 -07003102 (*count)++;
Patrick McHardybf742482007-06-27 01:26:19 -07003103 return 0;
3104}
3105
Patrick McHardy4417da62007-06-27 01:28:10 -07003106/**
3107 * dev_unicast_delete - Release secondary unicast address.
3108 * @dev: device
Randy Dunlap0ed72ec2007-07-26 00:03:29 -07003109 * @addr: address to delete
3110 * @alen: length of @addr
Patrick McHardy4417da62007-06-27 01:28:10 -07003111 *
3112 * Release reference to a secondary unicast address and remove it
Randy Dunlap0ed72ec2007-07-26 00:03:29 -07003113 * from the device if the reference count drops to zero.
Patrick McHardy4417da62007-06-27 01:28:10 -07003114 *
3115 * The caller must hold the rtnl_mutex.
3116 */
3117int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3118{
3119 int err;
3120
3121 ASSERT_RTNL();
3122
David S. Millerb9e40852008-07-15 00:15:08 -07003123 netif_addr_lock_bh(dev);
Patrick McHardy61cbc2f2007-06-30 13:35:52 -07003124 err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3125 if (!err)
Patrick McHardy4417da62007-06-27 01:28:10 -07003126 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07003127 netif_addr_unlock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07003128 return err;
3129}
3130EXPORT_SYMBOL(dev_unicast_delete);
3131
3132/**
3133 * dev_unicast_add - add a secondary unicast address
3134 * @dev: device
Wang Chen5dbaec52008-06-27 19:35:16 -07003135 * @addr: address to add
Randy Dunlap0ed72ec2007-07-26 00:03:29 -07003136 * @alen: length of @addr
Patrick McHardy4417da62007-06-27 01:28:10 -07003137 *
3138 * Add a secondary unicast address to the device or increase
3139 * the reference count if it already exists.
3140 *
3141 * The caller must hold the rtnl_mutex.
3142 */
3143int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3144{
3145 int err;
3146
3147 ASSERT_RTNL();
3148
David S. Millerb9e40852008-07-15 00:15:08 -07003149 netif_addr_lock_bh(dev);
Patrick McHardy61cbc2f2007-06-30 13:35:52 -07003150 err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3151 if (!err)
Patrick McHardy4417da62007-06-27 01:28:10 -07003152 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07003153 netif_addr_unlock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07003154 return err;
3155}
3156EXPORT_SYMBOL(dev_unicast_add);
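/*
 * Illustrative sketch (hypothetical helper): listening on one extra MAC
 * address via the secondary unicast list. ETH_ALEN is assumed because the
 * example device is Ethernet; a matching dev_unicast_delete() call with
 * the same address and length drops the reference again.
 */
static int example_listen_extra_mac(struct net_device *dev, u8 *mac)
{
	int err;

	rtnl_lock();
	err = dev_unicast_add(dev, mac, ETH_ALEN);
	rtnl_unlock();
	return err;
}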
3157
Chris Leeche83a2ea2008-01-31 16:53:23 -08003158int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3159 struct dev_addr_list **from, int *from_count)
3160{
3161 struct dev_addr_list *da, *next;
3162 int err = 0;
3163
3164 da = *from;
3165 while (da != NULL) {
3166 next = da->next;
3167 if (!da->da_synced) {
3168 err = __dev_addr_add(to, to_count,
3169 da->da_addr, da->da_addrlen, 0);
3170 if (err < 0)
3171 break;
3172 da->da_synced = 1;
3173 da->da_users++;
3174 } else if (da->da_users == 1) {
3175 __dev_addr_delete(to, to_count,
3176 da->da_addr, da->da_addrlen, 0);
3177 __dev_addr_delete(from, from_count,
3178 da->da_addr, da->da_addrlen, 0);
3179 }
3180 da = next;
3181 }
3182 return err;
3183}
3184
3185void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3186 struct dev_addr_list **from, int *from_count)
3187{
3188 struct dev_addr_list *da, *next;
3189
3190 da = *from;
3191 while (da != NULL) {
3192 next = da->next;
3193 if (da->da_synced) {
3194 __dev_addr_delete(to, to_count,
3195 da->da_addr, da->da_addrlen, 0);
3196 da->da_synced = 0;
3197 __dev_addr_delete(from, from_count,
3198 da->da_addr, da->da_addrlen, 0);
3199 }
3200 da = next;
3201 }
3202}
3203
3204/**
3205 * dev_unicast_sync - Synchronize device's unicast list to another device
3206 * @to: destination device
3207 * @from: source device
3208 *
3209 * Add newly added addresses to the destination device and release
3210 * addresses that have no users left. The source device must be
3211 * locked by netif_tx_lock_bh.
 3212 * locked by netif_addr_lock_bh.
3213 * This function is intended to be called from the dev->set_rx_mode
3214 * function of layered software devices.
3215 */
3216int dev_unicast_sync(struct net_device *to, struct net_device *from)
3217{
3218 int err = 0;
3219
David S. Millerb9e40852008-07-15 00:15:08 -07003220 netif_addr_lock_bh(to);
Chris Leeche83a2ea2008-01-31 16:53:23 -08003221 err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3222 &from->uc_list, &from->uc_count);
3223 if (!err)
3224 __dev_set_rx_mode(to);
David S. Millerb9e40852008-07-15 00:15:08 -07003225 netif_addr_unlock_bh(to);
Chris Leeche83a2ea2008-01-31 16:53:23 -08003226 return err;
3227}
3228EXPORT_SYMBOL(dev_unicast_sync);
3229
3230/**
Randy Dunlapbc2cda12008-02-13 15:03:25 -08003231 * dev_unicast_unsync - Remove synchronized addresses from the destination device
Chris Leeche83a2ea2008-01-31 16:53:23 -08003232 * @to: destination device
3233 * @from: source device
3234 *
3235 * Remove all addresses that were added to the destination device by
3236 * dev_unicast_sync(). This function is intended to be called from the
3237 * dev->stop function of layered software devices.
3238 */
3239void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3240{
David S. Millerb9e40852008-07-15 00:15:08 -07003241 netif_addr_lock_bh(from);
David S. Millere308a5d2008-07-15 00:13:44 -07003242 netif_addr_lock(to);
Chris Leeche83a2ea2008-01-31 16:53:23 -08003243
3244 __dev_addr_unsync(&to->uc_list, &to->uc_count,
3245 &from->uc_list, &from->uc_count);
3246 __dev_set_rx_mode(to);
3247
David S. Millere308a5d2008-07-15 00:13:44 -07003248 netif_addr_unlock(to);
David S. Millerb9e40852008-07-15 00:15:08 -07003249 netif_addr_unlock_bh(from);
Chris Leeche83a2ea2008-01-31 16:53:23 -08003250}
3251EXPORT_SYMBOL(dev_unicast_unsync);
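/*
 * Illustrative sketch of the pattern the two comments above describe: a
 * layered (VLAN-like) device mirrors its unicast list down to the real
 * device from its set_rx_mode hook and tears it down from its stop hook.
 * struct example_priv and the lower-device pointer are assumptions.
 */
struct example_priv {
	struct net_device *lower;
};

static void example_upper_set_rx_mode(struct net_device *upper)
{
	struct example_priv *p = netdev_priv(upper);

	dev_unicast_sync(p->lower, upper);	/* to = lower, from = upper */
}

static int example_upper_stop(struct net_device *upper)
{
	struct example_priv *p = netdev_priv(upper);

	dev_unicast_unsync(p->lower, upper);
	return 0;
}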
3252
Denis Cheng12972622007-07-18 02:12:56 -07003253static void __dev_addr_discard(struct dev_addr_list **list)
3254{
3255 struct dev_addr_list *tmp;
3256
3257 while (*list != NULL) {
3258 tmp = *list;
3259 *list = tmp->next;
3260 if (tmp->da_users > tmp->da_gusers)
3261 printk("__dev_addr_discard: address leakage! "
3262 "da_users=%d\n", tmp->da_users);
3263 kfree(tmp);
3264 }
3265}
3266
Denis Cheng26cc2522007-07-18 02:12:03 -07003267static void dev_addr_discard(struct net_device *dev)
Patrick McHardy4417da62007-06-27 01:28:10 -07003268{
David S. Millerb9e40852008-07-15 00:15:08 -07003269 netif_addr_lock_bh(dev);
Denis Cheng26cc2522007-07-18 02:12:03 -07003270
Patrick McHardy4417da62007-06-27 01:28:10 -07003271 __dev_addr_discard(&dev->uc_list);
3272 dev->uc_count = 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07003273
Denis Cheng456ad752007-07-18 02:10:54 -07003274 __dev_addr_discard(&dev->mc_list);
3275 dev->mc_count = 0;
Denis Cheng26cc2522007-07-18 02:12:03 -07003276
David S. Millerb9e40852008-07-15 00:15:08 -07003277 netif_addr_unlock_bh(dev);
Denis Cheng456ad752007-07-18 02:10:54 -07003278}
3279
Linus Torvalds1da177e2005-04-16 15:20:36 -07003280unsigned dev_get_flags(const struct net_device *dev)
3281{
3282 unsigned flags;
3283
3284 flags = (dev->flags & ~(IFF_PROMISC |
3285 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08003286 IFF_RUNNING |
3287 IFF_LOWER_UP |
3288 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07003289 (dev->gflags & (IFF_PROMISC |
3290 IFF_ALLMULTI));
3291
Stefan Rompfb00055a2006-03-20 17:09:11 -08003292 if (netif_running(dev)) {
3293 if (netif_oper_up(dev))
3294 flags |= IFF_RUNNING;
3295 if (netif_carrier_ok(dev))
3296 flags |= IFF_LOWER_UP;
3297 if (netif_dormant(dev))
3298 flags |= IFF_DORMANT;
3299 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003300
3301 return flags;
3302}
3303
3304int dev_change_flags(struct net_device *dev, unsigned flags)
3305{
Thomas Graf7c355f52007-06-05 16:03:03 -07003306 int ret, changes;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003307 int old_flags = dev->flags;
3308
Patrick McHardy24023452007-07-14 18:51:31 -07003309 ASSERT_RTNL();
3310
Linus Torvalds1da177e2005-04-16 15:20:36 -07003311 /*
3312 * Set the flags on our device.
3313 */
3314
3315 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3316 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3317 IFF_AUTOMEDIA)) |
3318 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3319 IFF_ALLMULTI));
3320
3321 /*
3322 * Load in the correct multicast list now the flags have changed.
3323 */
3324
David Woodhouse0e917962008-05-20 14:36:14 -07003325 if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST)
Patrick McHardy24023452007-07-14 18:51:31 -07003326 dev->change_rx_flags(dev, IFF_MULTICAST);
3327
Patrick McHardy4417da62007-06-27 01:28:10 -07003328 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003329
3330 /*
 3331 	 * Have we downed the interface? We handle IFF_UP ourselves
3332 * according to user attempts to set it, rather than blindly
3333 * setting it.
3334 */
3335
3336 ret = 0;
3337 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
3338 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3339
3340 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07003341 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003342 }
3343
3344 if (dev->flags & IFF_UP &&
3345 ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3346 IFF_VOLATILE)))
Pavel Emelyanov056925a2007-09-16 15:42:43 -07003347 call_netdevice_notifiers(NETDEV_CHANGE, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003348
3349 if ((flags ^ dev->gflags) & IFF_PROMISC) {
3350 int inc = (flags & IFF_PROMISC) ? +1 : -1;
3351 dev->gflags ^= IFF_PROMISC;
3352 dev_set_promiscuity(dev, inc);
3353 }
3354
3355 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 3356 	   is important. Some (broken) drivers set IFF_PROMISC when
 3357 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
3358 */
3359 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3360 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3361 dev->gflags ^= IFF_ALLMULTI;
3362 dev_set_allmulti(dev, inc);
3363 }
3364
Thomas Graf7c355f52007-06-05 16:03:03 -07003365 /* Exclude state transition flags, already notified */
3366 changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3367 if (changes)
3368 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003369
3370 return ret;
3371}
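/*
 * Illustrative sketch (hypothetical helper): bringing an interface up the
 * same way the SIOCSIFFLAGS path below does, by editing the flag word
 * returned from dev_get_flags() under the RTNL.
 */
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();
	return err;
}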
3372
3373int dev_set_mtu(struct net_device *dev, int new_mtu)
3374{
3375 int err;
3376
3377 if (new_mtu == dev->mtu)
3378 return 0;
3379
3380 /* MTU must be positive. */
3381 if (new_mtu < 0)
3382 return -EINVAL;
3383
3384 if (!netif_device_present(dev))
3385 return -ENODEV;
3386
3387 err = 0;
3388 if (dev->change_mtu)
3389 err = dev->change_mtu(dev, new_mtu);
3390 else
3391 dev->mtu = new_mtu;
3392 if (!err && dev->flags & IFF_UP)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07003393 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003394 return err;
3395}
3396
3397int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3398{
3399 int err;
3400
3401 if (!dev->set_mac_address)
3402 return -EOPNOTSUPP;
3403 if (sa->sa_family != dev->type)
3404 return -EINVAL;
3405 if (!netif_device_present(dev))
3406 return -ENODEV;
3407 err = dev->set_mac_address(dev, sa);
3408 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07003409 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003410 return err;
3411}
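/*
 * Illustrative sketch (hypothetical helper): reconfiguring MTU and MAC in
 * one RTNL critical section. Note dev_set_mac_address() insists that
 * sa->sa_family matches dev->type, as checked above.
 */
static int example_reconfigure(struct net_device *dev, int mtu,
			       struct sockaddr *sa)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, mtu);
	if (!err)
		err = dev_set_mac_address(dev, sa);
	rtnl_unlock();
	return err;
}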
3412
3413/*
Jeff Garzik14e3e072007-10-08 00:06:32 -07003414 * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003415 */
Jeff Garzik14e3e072007-10-08 00:06:32 -07003416static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003417{
3418 int err;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003419 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003420
3421 if (!dev)
3422 return -ENODEV;
3423
3424 switch (cmd) {
3425 case SIOCGIFFLAGS: /* Get interface flags */
3426 ifr->ifr_flags = dev_get_flags(dev);
3427 return 0;
3428
Linus Torvalds1da177e2005-04-16 15:20:36 -07003429 case SIOCGIFMETRIC: /* Get the metric on the interface
3430 (currently unused) */
3431 ifr->ifr_metric = 0;
3432 return 0;
3433
Linus Torvalds1da177e2005-04-16 15:20:36 -07003434 case SIOCGIFMTU: /* Get the MTU of a device */
3435 ifr->ifr_mtu = dev->mtu;
3436 return 0;
3437
Linus Torvalds1da177e2005-04-16 15:20:36 -07003438 case SIOCGIFHWADDR:
3439 if (!dev->addr_len)
3440 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3441 else
3442 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3443 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3444 ifr->ifr_hwaddr.sa_family = dev->type;
3445 return 0;
3446
Jeff Garzik14e3e072007-10-08 00:06:32 -07003447 case SIOCGIFSLAVE:
3448 err = -EINVAL;
3449 break;
3450
3451 case SIOCGIFMAP:
3452 ifr->ifr_map.mem_start = dev->mem_start;
3453 ifr->ifr_map.mem_end = dev->mem_end;
3454 ifr->ifr_map.base_addr = dev->base_addr;
3455 ifr->ifr_map.irq = dev->irq;
3456 ifr->ifr_map.dma = dev->dma;
3457 ifr->ifr_map.port = dev->if_port;
3458 return 0;
3459
3460 case SIOCGIFINDEX:
3461 ifr->ifr_ifindex = dev->ifindex;
3462 return 0;
3463
3464 case SIOCGIFTXQLEN:
3465 ifr->ifr_qlen = dev->tx_queue_len;
3466 return 0;
3467
3468 default:
3469 /* dev_ioctl() should ensure this case
3470 * is never reached
3471 */
3472 WARN_ON(1);
3473 err = -EINVAL;
3474 break;
3475
3476 }
3477 return err;
3478}
3479
3480/*
3481 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
3482 */
3483static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3484{
3485 int err;
3486 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3487
3488 if (!dev)
3489 return -ENODEV;
3490
3491 switch (cmd) {
3492 case SIOCSIFFLAGS: /* Set interface flags */
3493 return dev_change_flags(dev, ifr->ifr_flags);
3494
3495 case SIOCSIFMETRIC: /* Set the metric on the interface
3496 (currently unused) */
3497 return -EOPNOTSUPP;
3498
3499 case SIOCSIFMTU: /* Set the MTU of a device */
3500 return dev_set_mtu(dev, ifr->ifr_mtu);
3501
Linus Torvalds1da177e2005-04-16 15:20:36 -07003502 case SIOCSIFHWADDR:
3503 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3504
3505 case SIOCSIFHWBROADCAST:
3506 if (ifr->ifr_hwaddr.sa_family != dev->type)
3507 return -EINVAL;
3508 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3509 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
Pavel Emelyanov056925a2007-09-16 15:42:43 -07003510 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003511 return 0;
3512
Linus Torvalds1da177e2005-04-16 15:20:36 -07003513 case SIOCSIFMAP:
3514 if (dev->set_config) {
3515 if (!netif_device_present(dev))
3516 return -ENODEV;
3517 return dev->set_config(dev, &ifr->ifr_map);
3518 }
3519 return -EOPNOTSUPP;
3520
3521 case SIOCADDMULTI:
Patrick McHardy61ee6bd2008-03-26 02:12:11 -07003522 if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07003523 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3524 return -EINVAL;
3525 if (!netif_device_present(dev))
3526 return -ENODEV;
3527 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3528 dev->addr_len, 1);
3529
3530 case SIOCDELMULTI:
Patrick McHardy61ee6bd2008-03-26 02:12:11 -07003531 if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07003532 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3533 return -EINVAL;
3534 if (!netif_device_present(dev))
3535 return -ENODEV;
3536 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3537 dev->addr_len, 1);
3538
Linus Torvalds1da177e2005-04-16 15:20:36 -07003539 case SIOCSIFTXQLEN:
3540 if (ifr->ifr_qlen < 0)
3541 return -EINVAL;
3542 dev->tx_queue_len = ifr->ifr_qlen;
3543 return 0;
3544
3545 case SIOCSIFNAME:
3546 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3547 return dev_change_name(dev, ifr->ifr_newname);
3548
3549 /*
3550 * Unknown or private ioctl
3551 */
3552
3553 default:
3554 if ((cmd >= SIOCDEVPRIVATE &&
3555 cmd <= SIOCDEVPRIVATE + 15) ||
3556 cmd == SIOCBONDENSLAVE ||
3557 cmd == SIOCBONDRELEASE ||
3558 cmd == SIOCBONDSETHWADDR ||
3559 cmd == SIOCBONDSLAVEINFOQUERY ||
3560 cmd == SIOCBONDINFOQUERY ||
3561 cmd == SIOCBONDCHANGEACTIVE ||
3562 cmd == SIOCGMIIPHY ||
3563 cmd == SIOCGMIIREG ||
3564 cmd == SIOCSMIIREG ||
3565 cmd == SIOCBRADDIF ||
3566 cmd == SIOCBRDELIF ||
3567 cmd == SIOCWANDEV) {
3568 err = -EOPNOTSUPP;
3569 if (dev->do_ioctl) {
3570 if (netif_device_present(dev))
3571 err = dev->do_ioctl(dev, ifr,
3572 cmd);
3573 else
3574 err = -ENODEV;
3575 }
3576 } else
3577 err = -EINVAL;
3578
3579 }
3580 return err;
3581}
3582
3583/*
3584 * This function handles all "interface"-type I/O control requests. The actual
3585 * 'doing' part of this is dev_ifsioc above.
3586 */
3587
3588/**
3589 * dev_ioctl - network device ioctl
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07003590 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07003591 * @cmd: command to issue
3592 * @arg: pointer to a struct ifreq in user space
3593 *
3594 * Issue ioctl functions to devices. This is normally called by the
3595 * user space syscall interfaces but can sometimes be useful for
3596 * other purposes. The return value is the return from the syscall if
3597 * positive or a negative errno code on error.
3598 */
3599
Eric W. Biederman881d9662007-09-17 11:56:21 -07003600int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003601{
3602 struct ifreq ifr;
3603 int ret;
3604 char *colon;
3605
 3606 	/* One special case: SIOCGIFCONF takes an ifconf argument
 3607 	   and requires a shared lock, because it sleeps while writing
 3608 	   to user space.
3609 */
3610
3611 if (cmd == SIOCGIFCONF) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08003612 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07003613 ret = dev_ifconf(net, (char __user *) arg);
Stephen Hemminger6756ae42006-03-20 22:23:58 -08003614 rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003615 return ret;
3616 }
3617 if (cmd == SIOCGIFNAME)
Eric W. Biederman881d9662007-09-17 11:56:21 -07003618 return dev_ifname(net, (struct ifreq __user *)arg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003619
3620 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3621 return -EFAULT;
3622
3623 ifr.ifr_name[IFNAMSIZ-1] = 0;
3624
3625 colon = strchr(ifr.ifr_name, ':');
3626 if (colon)
3627 *colon = 0;
3628
3629 /*
3630 * See which interface the caller is talking about.
3631 */
3632
3633 switch (cmd) {
3634 /*
3635 * These ioctl calls:
3636 * - can be done by all.
3637 * - atomic and do not require locking.
3638 * - return a value
3639 */
3640 case SIOCGIFFLAGS:
3641 case SIOCGIFMETRIC:
3642 case SIOCGIFMTU:
3643 case SIOCGIFHWADDR:
3644 case SIOCGIFSLAVE:
3645 case SIOCGIFMAP:
3646 case SIOCGIFINDEX:
3647 case SIOCGIFTXQLEN:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003648 dev_load(net, ifr.ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003649 read_lock(&dev_base_lock);
Jeff Garzik14e3e072007-10-08 00:06:32 -07003650 ret = dev_ifsioc_locked(net, &ifr, cmd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003651 read_unlock(&dev_base_lock);
3652 if (!ret) {
3653 if (colon)
3654 *colon = ':';
3655 if (copy_to_user(arg, &ifr,
3656 sizeof(struct ifreq)))
3657 ret = -EFAULT;
3658 }
3659 return ret;
3660
3661 case SIOCETHTOOL:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003662 dev_load(net, ifr.ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003663 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07003664 ret = dev_ethtool(net, &ifr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003665 rtnl_unlock();
3666 if (!ret) {
3667 if (colon)
3668 *colon = ':';
3669 if (copy_to_user(arg, &ifr,
3670 sizeof(struct ifreq)))
3671 ret = -EFAULT;
3672 }
3673 return ret;
3674
3675 /*
3676 * These ioctl calls:
3677 * - require superuser power.
3678 * - require strict serialization.
3679 * - return a value
3680 */
3681 case SIOCGMIIPHY:
3682 case SIOCGMIIREG:
3683 case SIOCSIFNAME:
3684 if (!capable(CAP_NET_ADMIN))
3685 return -EPERM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003686 dev_load(net, ifr.ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003687 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07003688 ret = dev_ifsioc(net, &ifr, cmd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003689 rtnl_unlock();
3690 if (!ret) {
3691 if (colon)
3692 *colon = ':';
3693 if (copy_to_user(arg, &ifr,
3694 sizeof(struct ifreq)))
3695 ret = -EFAULT;
3696 }
3697 return ret;
3698
3699 /*
3700 * These ioctl calls:
3701 * - require superuser power.
3702 * - require strict serialization.
3703 * - do not return a value
3704 */
3705 case SIOCSIFFLAGS:
3706 case SIOCSIFMETRIC:
3707 case SIOCSIFMTU:
3708 case SIOCSIFMAP:
3709 case SIOCSIFHWADDR:
3710 case SIOCSIFSLAVE:
3711 case SIOCADDMULTI:
3712 case SIOCDELMULTI:
3713 case SIOCSIFHWBROADCAST:
3714 case SIOCSIFTXQLEN:
3715 case SIOCSMIIREG:
3716 case SIOCBONDENSLAVE:
3717 case SIOCBONDRELEASE:
3718 case SIOCBONDSETHWADDR:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003719 case SIOCBONDCHANGEACTIVE:
3720 case SIOCBRADDIF:
3721 case SIOCBRDELIF:
3722 if (!capable(CAP_NET_ADMIN))
3723 return -EPERM;
Thomas Grafcabcac02006-01-24 12:46:33 -08003724 /* fall through */
3725 case SIOCBONDSLAVEINFOQUERY:
3726 case SIOCBONDINFOQUERY:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003727 dev_load(net, ifr.ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003728 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07003729 ret = dev_ifsioc(net, &ifr, cmd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003730 rtnl_unlock();
3731 return ret;
3732
3733 case SIOCGIFMEM:
3734 /* Get the per device memory space. We can add this but
3735 * currently do not support it */
3736 case SIOCSIFMEM:
3737 /* Set the per device memory buffer space.
3738 * Not applicable in our case */
3739 case SIOCSIFLINK:
3740 return -EINVAL;
3741
3742 /*
3743 * Unknown or private ioctl.
3744 */
3745 default:
3746 if (cmd == SIOCWANDEV ||
3747 (cmd >= SIOCDEVPRIVATE &&
3748 cmd <= SIOCDEVPRIVATE + 15)) {
Eric W. Biederman881d9662007-09-17 11:56:21 -07003749 dev_load(net, ifr.ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003750 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07003751 ret = dev_ifsioc(net, &ifr, cmd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003752 rtnl_unlock();
3753 if (!ret && copy_to_user(arg, &ifr,
3754 sizeof(struct ifreq)))
3755 ret = -EFAULT;
3756 return ret;
3757 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003758 /* Take care of Wireless Extensions */
Johannes Berg295f4a12007-04-26 20:43:56 -07003759 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
Eric W. Biederman881d9662007-09-17 11:56:21 -07003760 return wext_handle_ioctl(net, &ifr, cmd, arg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003761 return -EINVAL;
3762 }
3763}
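/*
 * From user space the switch above is reached through an ioctl() on any
 * socket. A minimal user-space sketch (shown only as a comment, since it
 * does not belong in kernel code):
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("%s: mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);
 */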
3764
3765
3766/**
3767 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07003768 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07003769 *
3770 * Returns a suitable unique value for a new device interface
3771 * number. The caller must hold the rtnl semaphore or the
3772 * dev_base_lock to be sure it remains unique.
3773 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07003774static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003775{
3776 static int ifindex;
3777 for (;;) {
3778 if (++ifindex <= 0)
3779 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003780 if (!__dev_get_by_index(net, ifindex))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003781 return ifindex;
3782 }
3783}
3784
Linus Torvalds1da177e2005-04-16 15:20:36 -07003785/* Delayed registration/unregistration */
3786static DEFINE_SPINLOCK(net_todo_list_lock);
Denis Cheng3b5b34f2007-12-07 00:49:17 -08003787static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003788
Stephen Hemminger6f05f622007-03-08 20:46:03 -08003789static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003790{
3791 spin_lock(&net_todo_list_lock);
3792 list_add_tail(&dev->todo_list, &net_todo_list);
3793 spin_unlock(&net_todo_list_lock);
3794}
3795
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07003796static void rollback_registered(struct net_device *dev)
3797{
3798 BUG_ON(dev_boot_phase);
3799 ASSERT_RTNL();
3800
 3801 	/* Some devices call this without ever having registered, to unwind a failed initialization. */
3802 if (dev->reg_state == NETREG_UNINITIALIZED) {
3803 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3804 "was registered\n", dev->name, dev);
3805
3806 WARN_ON(1);
3807 return;
3808 }
3809
3810 BUG_ON(dev->reg_state != NETREG_REGISTERED);
3811
3812 /* If device is running, close it first. */
3813 dev_close(dev);
3814
3815 /* And unlink it from device chain. */
3816 unlist_netdevice(dev);
3817
3818 dev->reg_state = NETREG_UNREGISTERING;
3819
3820 synchronize_net();
3821
3822 /* Shutdown queueing discipline. */
3823 dev_shutdown(dev);
3824
3825
3826 /* Notify protocols, that we are about to destroy
3827 this device. They should clean all the things.
3828 */
3829 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3830
3831 /*
3832 * Flush the unicast and multicast chains
3833 */
3834 dev_addr_discard(dev);
3835
3836 if (dev->uninit)
3837 dev->uninit(dev);
3838
3839 /* Notifier chain MUST detach us from master device. */
3840 BUG_TRAP(!dev->master);
3841
3842 /* Remove entries from kobject tree */
3843 netdev_unregister_kobject(dev);
3844
3845 synchronize_net();
3846
3847 dev_put(dev);
3848}
3849
David S. Millere8a04642008-07-17 00:34:19 -07003850static void __netdev_init_queue_locks_one(struct net_device *dev,
3851 struct netdev_queue *dev_queue,
3852 void *_unused)
David S. Millerc773e842008-07-08 23:13:53 -07003853{
3854 spin_lock_init(&dev_queue->_xmit_lock);
3855 netdev_set_lockdep_class(&dev_queue->_xmit_lock, dev->type);
3856 dev_queue->xmit_lock_owner = -1;
3857}
3858
3859static void netdev_init_queue_locks(struct net_device *dev)
3860{
David S. Millere8a04642008-07-17 00:34:19 -07003861 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
3862 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
David S. Millerc773e842008-07-08 23:13:53 -07003863}
3864
Linus Torvalds1da177e2005-04-16 15:20:36 -07003865/**
3866 * register_netdevice - register a network device
3867 * @dev: device to register
3868 *
3869 * Take a completed network device structure and add it to the kernel
3870 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3871 * chain. 0 is returned on success. A negative errno code is returned
3872 * on a failure to set up the device, or if the name is a duplicate.
3873 *
3874 * Callers must hold the rtnl semaphore. You may want
3875 * register_netdev() instead of this.
3876 *
3877 * BUGS:
3878 * The locking appears insufficient to guarantee two parallel registers
3879 * will not get the same name.
3880 */
3881
3882int register_netdevice(struct net_device *dev)
3883{
3884 struct hlist_head *head;
3885 struct hlist_node *p;
3886 int ret;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003887 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003888
3889 BUG_ON(dev_boot_phase);
3890 ASSERT_RTNL();
3891
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07003892 might_sleep();
3893
Linus Torvalds1da177e2005-04-16 15:20:36 -07003894 /* When net_device's are persistent, this will be fatal. */
3895 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09003896 BUG_ON(!dev_net(dev));
3897 net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003898
David S. Millerf1f28aa2008-07-15 00:08:33 -07003899 spin_lock_init(&dev->addr_list_lock);
David S. Millerc773e842008-07-08 23:13:53 -07003900 netdev_init_queue_locks(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003901
Linus Torvalds1da177e2005-04-16 15:20:36 -07003902 dev->iflink = -1;
3903
3904 /* Init, if this function is available */
3905 if (dev->init) {
3906 ret = dev->init(dev);
3907 if (ret) {
3908 if (ret > 0)
3909 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08003910 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003911 }
3912 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003913
Linus Torvalds1da177e2005-04-16 15:20:36 -07003914 if (!dev_valid_name(dev->name)) {
3915 ret = -EINVAL;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07003916 goto err_uninit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003917 }
3918
Eric W. Biederman881d9662007-09-17 11:56:21 -07003919 dev->ifindex = dev_new_index(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003920 if (dev->iflink == -1)
3921 dev->iflink = dev->ifindex;
3922
3923 /* Check for existence of name */
Eric W. Biederman881d9662007-09-17 11:56:21 -07003924 head = dev_name_hash(net, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003925 hlist_for_each(p, head) {
3926 struct net_device *d
3927 = hlist_entry(p, struct net_device, name_hlist);
3928 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3929 ret = -EEXIST;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07003930 goto err_uninit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003931 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003932 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003933
Stephen Hemmingerd212f872007-06-27 00:47:37 -07003934 /* Fix illegal checksum combinations */
3935 if ((dev->features & NETIF_F_HW_CSUM) &&
3936 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3937 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
3938 dev->name);
3939 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
3940 }
3941
3942 if ((dev->features & NETIF_F_NO_CSUM) &&
3943 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3944 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
3945 dev->name);
3946 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
3947 }
3948
3949
Linus Torvalds1da177e2005-04-16 15:20:36 -07003950 /* Fix illegal SG+CSUM combinations. */
3951 if ((dev->features & NETIF_F_SG) &&
Herbert Xu8648b302006-06-17 22:06:05 -07003952 !(dev->features & NETIF_F_ALL_CSUM)) {
Stephen Hemminger5a8da022006-07-07 16:54:05 -07003953 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07003954 dev->name);
3955 dev->features &= ~NETIF_F_SG;
3956 }
3957
3958 /* TSO requires that SG is present as well. */
3959 if ((dev->features & NETIF_F_TSO) &&
3960 !(dev->features & NETIF_F_SG)) {
Stephen Hemminger5a8da022006-07-07 16:54:05 -07003961 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07003962 dev->name);
3963 dev->features &= ~NETIF_F_TSO;
3964 }
Ananda Rajue89e9cf2005-10-18 15:46:41 -07003965 if (dev->features & NETIF_F_UFO) {
3966 if (!(dev->features & NETIF_F_HW_CSUM)) {
3967 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3968 "NETIF_F_HW_CSUM feature.\n",
3969 dev->name);
3970 dev->features &= ~NETIF_F_UFO;
3971 }
3972 if (!(dev->features & NETIF_F_SG)) {
3973 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3974 "NETIF_F_SG feature.\n",
3975 dev->name);
3976 dev->features &= ~NETIF_F_UFO;
3977 }
3978 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003979
Daniel Lezcanoaaf8cdc2008-05-02 17:00:58 -07003980 netdev_initialize_kobject(dev);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07003981 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07003982 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07003983 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07003984 dev->reg_state = NETREG_REGISTERED;
3985
Linus Torvalds1da177e2005-04-16 15:20:36 -07003986 /*
3987 * Default initial state at registry is that the
3988 * device is present.
3989 */
3990
3991 set_bit(__LINK_STATE_PRESENT, &dev->state);
3992
Linus Torvalds1da177e2005-04-16 15:20:36 -07003993 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003994 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02003995 list_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003996
3997 /* Notify protocols, that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07003998 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07003999 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004000 if (ret) {
4001 rollback_registered(dev);
4002 dev->reg_state = NETREG_UNREGISTERED;
4003 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004004
4005out:
4006 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07004007
4008err_uninit:
4009 if (dev->uninit)
4010 dev->uninit(dev);
4011 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004012}
4013
4014/**
4015 * register_netdev - register a network device
4016 * @dev: device to register
4017 *
4018 * Take a completed network device structure and add it to the kernel
4019 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4020 * chain. 0 is returned on success. A negative errno code is returned
4021 * on a failure to set up the device, or if the name is a duplicate.
4022 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07004023 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07004024 * and expands the device name if you passed a format string to
4025 * alloc_netdev.
4026 */
4027int register_netdev(struct net_device *dev)
4028{
4029 int err;
4030
4031 rtnl_lock();
4032
4033 /*
4034 * If the name is a format string the caller wants us to do a
4035 * name allocation.
4036 */
4037 if (strchr(dev->name, '%')) {
4038 err = dev_alloc_name(dev, dev->name);
4039 if (err < 0)
4040 goto out;
4041 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004042
Linus Torvalds1da177e2005-04-16 15:20:36 -07004043 err = register_netdevice(dev);
4044out:
4045 rtnl_unlock();
4046 return err;
4047}
4048EXPORT_SYMBOL(register_netdev);
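/*
 * Illustrative sketch (hypothetical driver code): the usual pairing with
 * alloc_netdev(). The "%d" in the name asks register_netdev() to pick a
 * free unit number via dev_alloc_name(), as described above.
 */
static struct net_device *example_create(void)
{
	struct net_device *dev;

	dev = alloc_netdev(0, "example%d", ether_setup);
	if (!dev)
		return NULL;
	if (register_netdev(dev)) {
		free_netdev(dev);	/* safe: device never registered */
		return NULL;
	}
	return dev;
}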
4049
4050/*
4051 * netdev_wait_allrefs - wait until all references are gone.
4052 *
4053 * This is called when unregistering network devices.
4054 *
4055 * Any protocol or device that holds a reference should register
4056 * for netdevice notification, and cleanup and put back the
4057 * reference if they receive an UNREGISTER event.
4058 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004059 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004060 */
4061static void netdev_wait_allrefs(struct net_device *dev)
4062{
4063 unsigned long rebroadcast_time, warning_time;
4064
4065 rebroadcast_time = warning_time = jiffies;
4066 while (atomic_read(&dev->refcnt) != 0) {
4067 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004068 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004069
4070 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004071 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004072
4073 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4074 &dev->state)) {
4075 /* We must not have linkwatch events
4076 * pending on unregister. If this
4077 * happens, we simply run the queue
4078 * unscheduled, resulting in a noop
4079 * for this device.
4080 */
4081 linkwatch_run_queue();
4082 }
4083
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004084 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004085
4086 rebroadcast_time = jiffies;
4087 }
4088
4089 msleep(250);
4090
4091 if (time_after(jiffies, warning_time + 10 * HZ)) {
4092 printk(KERN_EMERG "unregister_netdevice: "
4093 "waiting for %s to become free. Usage "
4094 "count = %d\n",
4095 dev->name, atomic_read(&dev->refcnt));
4096 warning_time = jiffies;
4097 }
4098 }
4099}
4100
4101/* The sequence is:
4102 *
4103 * rtnl_lock();
4104 * ...
4105 * register_netdevice(x1);
4106 * register_netdevice(x2);
4107 * ...
4108 * unregister_netdevice(y1);
4109 * unregister_netdevice(y2);
4110 * ...
4111 * rtnl_unlock();
4112 * free_netdev(y1);
4113 * free_netdev(y2);
4114 *
4115 * We are invoked by rtnl_unlock() after it drops the semaphore.
4116 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07004117 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07004118 * without deadlocking with linkwatch via keventd.
4119 * 2) Since we run with the RTNL semaphore not held, we can sleep
4120 * safely in order to wait for the netdev refcnt to drop to zero.
4121 */
Arjan van de Ven4a3e2f72006-03-20 22:33:17 -08004122static DEFINE_MUTEX(net_todo_run_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004123void netdev_run_todo(void)
4124{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07004125 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004126
4127 /* Need to guard against multiple cpu's getting out of order. */
Arjan van de Ven4a3e2f72006-03-20 22:33:17 -08004128 mutex_lock(&net_todo_run_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004129
4130 /* Not safe to do outside the semaphore. We must not return
4131 * until all unregister events invoked by the local processor
4132 * have been completed (either by this todo run, or one on
4133 * another cpu).
4134 */
4135 if (list_empty(&net_todo_list))
4136 goto out;
4137
4138 /* Snapshot list, allow later requests */
4139 spin_lock(&net_todo_list_lock);
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07004140 list_replace_init(&net_todo_list, &list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004141 spin_unlock(&net_todo_list_lock);
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07004142
Linus Torvalds1da177e2005-04-16 15:20:36 -07004143 while (!list_empty(&list)) {
4144 struct net_device *dev
4145 = list_entry(list.next, struct net_device, todo_list);
4146 list_del(&dev->todo_list);
4147
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07004148 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004149 printk(KERN_ERR "network todo '%s' but state %d\n",
4150 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07004151 dump_stack();
4152 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004153 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07004154
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07004155 dev->reg_state = NETREG_UNREGISTERED;
4156
4157 netdev_wait_allrefs(dev);
4158
4159 /* paranoia */
4160 BUG_ON(atomic_read(&dev->refcnt));
4161 BUG_TRAP(!dev->ip_ptr);
4162 BUG_TRAP(!dev->ip6_ptr);
4163 BUG_TRAP(!dev->dn_ptr);
4164
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07004165 if (dev->destructor)
4166 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07004167
4168 /* Free network device */
4169 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004170 }
4171
4172out:
Arjan van de Ven4a3e2f72006-03-20 22:33:17 -08004173 mutex_unlock(&net_todo_run_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004174}
4175
Rusty Russell5a1b5892007-04-28 21:04:03 -07004176static struct net_device_stats *internal_stats(struct net_device *dev)
Rusty Russellc45d2862007-03-28 14:29:08 -07004177{
Rusty Russell5a1b5892007-04-28 21:04:03 -07004178 return &dev->stats;
Rusty Russellc45d2862007-03-28 14:29:08 -07004179}
4180
David S. Millerdc2b4842008-07-08 17:18:23 -07004181static void netdev_init_one_queue(struct net_device *dev,
David S. Millere8a04642008-07-17 00:34:19 -07004182 struct netdev_queue *queue,
4183 void *_unused)
David S. Millerdc2b4842008-07-08 17:18:23 -07004184{
David S. Millerdc2b4842008-07-08 17:18:23 -07004185 queue->dev = dev;
4186}
4187
David S. Millerbb949fb2008-07-08 16:55:56 -07004188static void netdev_init_queues(struct net_device *dev)
4189{
David S. Millere8a04642008-07-17 00:34:19 -07004190 netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4191 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
David S. Millerbb949fb2008-07-08 16:55:56 -07004192}
4193
Linus Torvalds1da177e2005-04-16 15:20:36 -07004194/**
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07004195 * alloc_netdev_mq - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004196 * @sizeof_priv: size of private data to allocate space for
4197 * @name: device name format string
4198 * @setup: callback to initialize device
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07004199 * @queue_count: the number of subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07004200 *
4201 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07004202 * and performs basic initialization. Also allocates subqueue structs
4203 * for each queue on the device at the end of the netdevice.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004204 */
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07004205struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4206 void (*setup)(struct net_device *), unsigned int queue_count)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004207{
David S. Millere8a04642008-07-17 00:34:19 -07004208 struct netdev_queue *tx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004209 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07004210 size_t alloc_size;
David S. Millere8a04642008-07-17 00:34:19 -07004211 void *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004212
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07004213 BUG_ON(strlen(name) >= sizeof(dev->name));
4214
David S. Millerfd2ea0a2008-07-17 01:56:23 -07004215 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07004216 if (sizeof_priv) {
4217 /* ensure 32-byte alignment of private area */
4218 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4219 alloc_size += sizeof_priv;
4220 }
4221 /* ensure 32-byte alignment of whole construct */
4222 alloc_size += NETDEV_ALIGN_CONST;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004223
Paolo 'Blaisorblade' Giarrusso31380de2006-04-06 22:38:28 -07004224 p = kzalloc(alloc_size, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004225 if (!p) {
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07004226 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004227 return NULL;
4228 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004229
Stephen Hemminger79439862008-07-21 13:28:44 -07004230 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
David S. Millere8a04642008-07-17 00:34:19 -07004231 if (!tx) {
4232 printk(KERN_ERR "alloc_netdev: Unable to allocate "
4233 "tx qdiscs.\n");
4234 kfree(p);
4235 return NULL;
4236 }
4237
Linus Torvalds1da177e2005-04-16 15:20:36 -07004238 dev = (struct net_device *)
4239 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4240 dev->padded = (char *)dev - (char *)p;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09004241 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004242
David S. Millere8a04642008-07-17 00:34:19 -07004243 dev->_tx = tx;
4244 dev->num_tx_queues = queue_count;
David S. Millerfd2ea0a2008-07-17 01:56:23 -07004245 dev->real_num_tx_queues = queue_count;
David S. Millere8a04642008-07-17 00:34:19 -07004246
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07004247 if (sizeof_priv) {
4248 dev->priv = ((char *)dev +
David S. Millerfd2ea0a2008-07-17 01:56:23 -07004249 ((sizeof(struct net_device) + NETDEV_ALIGN_CONST)
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07004250 & ~NETDEV_ALIGN_CONST));
4251 }
4252
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07004253 dev->gso_max_size = GSO_MAX_SIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004254
David S. Millerbb949fb2008-07-08 16:55:56 -07004255 netdev_init_queues(dev);
4256
Rusty Russell5a1b5892007-04-28 21:04:03 -07004257 dev->get_stats = internal_stats;
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004258 netpoll_netdev_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004259 setup(dev);
4260 strcpy(dev->name, name);
4261 return dev;
4262}
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07004263EXPORT_SYMBOL(alloc_netdev_mq);
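/*
 * Illustrative sketch (hypothetical driver code): a four-queue Ethernet
 * device with a private area; the private struct is an assumption.
 */
struct example_mq_priv {
	spinlock_t lock;
};

static struct net_device *example_mq_create(void)
{
	struct example_mq_priv *priv;
	struct net_device *dev;

	dev = alloc_netdev_mq(sizeof(struct example_mq_priv), "mq%d",
			      ether_setup, 4);
	if (!dev)
		return NULL;
	priv = netdev_priv(dev);
	spin_lock_init(&priv->lock);
	return dev;
}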
Linus Torvalds1da177e2005-04-16 15:20:36 -07004264
4265/**
4266 * free_netdev - free network device
4267 * @dev: device
4268 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004269 * This function does the last stage of destroying an allocated device
4270 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004271 * If this is the last reference then it will be freed.
4272 */
4273void free_netdev(struct net_device *dev)
4274{
Denis V. Lunevf3005d72008-04-16 02:02:18 -07004275 release_net(dev_net(dev));
4276
David S. Millere8a04642008-07-17 00:34:19 -07004277 kfree(dev->_tx);
4278
Stephen Hemminger3041a062006-05-26 13:25:24 -07004279 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004280 if (dev->reg_state == NETREG_UNINITIALIZED) {
4281 kfree((char *)dev - dev->padded);
4282 return;
4283 }
4284
4285 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4286 dev->reg_state = NETREG_RELEASED;
4287
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07004288 /* will free via device release */
4289 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004290}
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004291
Linus Torvalds1da177e2005-04-16 15:20:36 -07004292/* Synchronize with packet receive processing. */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004293void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004294{
4295 might_sleep();
Paul E. McKenneyfbd568a3e2005-05-01 08:59:04 -07004296 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004297}
4298
4299/**
4300 * unregister_netdevice - remove device from the kernel
4301 * @dev: device
4302 *
4303 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08004304 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004305 *
4306 * Callers must hold the rtnl semaphore. You may want
4307 * unregister_netdev() instead of this.
4308 */
4309
Stephen Hemminger22f8cde2007-02-07 00:09:58 -08004310void unregister_netdevice(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004311{
Herbert Xua6620712007-12-12 19:21:56 -08004312 ASSERT_RTNL();
4313
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004314 rollback_registered(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004315 /* Finish processing unregister after unlock */
4316 net_set_todo(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004317}
4318
4319/**
4320 * unregister_netdev - remove device from the kernel
4321 * @dev: device
4322 *
4323 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08004324 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004325 *
4326 * This is just a wrapper for unregister_netdevice that takes
4327 * the rtnl semaphore. In general you want to use this and not
4328 * unregister_netdevice.
4329 */
4330void unregister_netdev(struct net_device *dev)
4331{
4332 rtnl_lock();
4333 unregister_netdevice(dev);
4334 rtnl_unlock();
4335}
4336
4337EXPORT_SYMBOL(unregister_netdev);
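/*
 * Illustrative sketch (hypothetical driver code): the teardown order the
 * net-todo machinery above relies on. Unregister first, which queues the
 * device on the todo list and waits out the refcount; free only after
 * that returns.
 */
static void example_destroy(struct net_device *dev)
{
	unregister_netdev(dev);	/* takes the RTNL, runs the todo list */
	free_netdev(dev);	/* drops the reference; freed via kobject release */
}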
4338
Eric W. Biedermance286d32007-09-12 13:53:49 +02004339/**
4340 * dev_change_net_namespace - move device to a different network namespace
4341 * @dev: device
4342 * @net: network namespace
4343 * @pat: If not NULL, name pattern to try if the current device name
4344 * is already taken in the destination network namespace.
4345 *
4346 * This function shuts down a device interface and moves it
4347 * to a new network namespace. On success 0 is returned, on
4348 * a failure a negative errno code is returned.
4349 *
4350 * Callers must hold the rtnl semaphore.
4351 */
4352
4353int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4354{
4355 char buf[IFNAMSIZ];
4356 const char *destname;
4357 int err;
4358
4359 ASSERT_RTNL();
4360
4361 /* Don't allow namespace local devices to be moved. */
4362 err = -EINVAL;
4363 if (dev->features & NETIF_F_NETNS_LOCAL)
4364 goto out;
4365
4366	/* Ensure the device has been registered */
4367 err = -EINVAL;
4368 if (dev->reg_state != NETREG_REGISTERED)
4369 goto out;
4370
4371	/* Get out if there is nothing to do */
4372 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09004373 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02004374 goto out;
4375
4376 /* Pick the destination device name, and ensure
4377 * we can use it in the destination network namespace.
4378 */
4379 err = -EEXIST;
4380 destname = dev->name;
4381 if (__dev_get_by_name(net, destname)) {
4382 /* We get here if we can't use the current device name */
4383 if (!pat)
4384 goto out;
4385 if (!dev_valid_name(pat))
4386 goto out;
4387 if (strchr(pat, '%')) {
4388 if (__dev_alloc_name(net, pat, buf) < 0)
4389 goto out;
4390 destname = buf;
4391 } else
4392 destname = pat;
4393 if (__dev_get_by_name(net, destname))
4394 goto out;
4395 }
4396
4397 /*
4398	 * And now a mini version of register_netdevice and unregister_netdevice.
4399 */
4400
4401 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07004402 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02004403
4404 /* And unlink it from device chain */
4405 err = -ENODEV;
4406 unlist_netdevice(dev);
4407
4408 synchronize_net();
4409
4410 /* Shutdown queueing discipline. */
4411 dev_shutdown(dev);
4412
4413	/* Notify protocols that we are about to destroy
4414	   this device. They should clean up all of their state.
4415	 */
4416 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4417
4418 /*
4419 * Flush the unicast and multicast chains
4420 */
4421 dev_addr_discard(dev);
4422
4423 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09004424 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02004425
4426 /* Assign the new device name */
4427 if (destname != dev->name)
4428 strcpy(dev->name, destname);
4429
4430 /* If there is an ifindex conflict assign a new one */
4431 if (__dev_get_by_index(net, dev->ifindex)) {
4432 int iflink = (dev->iflink == dev->ifindex);
4433 dev->ifindex = dev_new_index(net);
4434 if (iflink)
4435 dev->iflink = dev->ifindex;
4436 }
4437
Eric W. Biederman8b41d182007-09-26 22:02:53 -07004438 /* Fixup kobjects */
Daniel Lezcanoaaf8cdc2008-05-02 17:00:58 -07004439 netdev_unregister_kobject(dev);
4440 err = netdev_register_kobject(dev);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07004441 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02004442
4443 /* Add the device back in the hashes */
4444 list_netdevice(dev);
4445
4446	/* Notify protocols that a new device appeared. */
4447 call_netdevice_notifiers(NETDEV_REGISTER, dev);
4448
4449 synchronize_net();
4450 err = 0;
4451out:
4452 return err;
4453}
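
/*
 * Caller sketch (hypothetical helper): move a device into @target_net,
 * falling back to a "dev%d" pattern if its current name is already
 * taken there.  The caller is assumed to hold a reference on
 * @target_net.
 */
static int example_move_dev(struct net_device *dev, struct net *target_net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target_net, "dev%d");
	rtnl_unlock();
	return err;
}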
4454
Linus Torvalds1da177e2005-04-16 15:20:36 -07004455static int dev_cpu_callback(struct notifier_block *nfb,
4456 unsigned long action,
4457 void *ocpu)
4458{
4459 struct sk_buff **list_skb;
David S. Miller37437bb2008-07-16 02:15:04 -07004460 struct Qdisc **list_net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004461 struct sk_buff *skb;
4462 unsigned int cpu, oldcpu = (unsigned long)ocpu;
4463 struct softnet_data *sd, *oldsd;
4464
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07004465 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004466 return NOTIFY_OK;
4467
4468 local_irq_disable();
4469 cpu = smp_processor_id();
4470 sd = &per_cpu(softnet_data, cpu);
4471 oldsd = &per_cpu(softnet_data, oldcpu);
4472
4473 /* Find end of our completion_queue. */
4474 list_skb = &sd->completion_queue;
4475 while (*list_skb)
4476 list_skb = &(*list_skb)->next;
4477 /* Append completion queue from offline CPU. */
4478 *list_skb = oldsd->completion_queue;
4479 oldsd->completion_queue = NULL;
4480
4481 /* Find end of our output_queue. */
4482 list_net = &sd->output_queue;
4483 while (*list_net)
4484 list_net = &(*list_net)->next_sched;
4485 /* Append output queue from offline CPU. */
4486 *list_net = oldsd->output_queue;
4487 oldsd->output_queue = NULL;
4488
4489 raise_softirq_irqoff(NET_TX_SOFTIRQ);
4490 local_irq_enable();
4491
4492 /* Process offline CPU's input_pkt_queue */
4493 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4494 netif_rx(skb);
4495
4496 return NOTIFY_OK;
4497}
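
/*
 * The queue splicing above relies on the pointer-to-pointer idiom:
 * advance the address of a "next" pointer to the terminating NULL, then
 * graft the other list there.  A generic sketch of the same pattern:
 */
static void example_append(struct sk_buff **tail, struct sk_buff *other)
{
	while (*tail)
		tail = &(*tail)->next;
	*tail = other;
}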
Linus Torvalds1da177e2005-04-16 15:20:36 -07004498
Chris Leechdb217332006-06-17 21:24:58 -07004499#ifdef CONFIG_NET_DMA
4500/**
Randy Dunlap0ed72ec2007-07-26 00:03:29 -07004501 * net_dma_rebalance - try to maintain one DMA channel per CPU
4502 * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4503 *
4504 * This is called when the number of channels allocated to the net_dma client
4505 * changes. The net_dma client tries to have one DMA channel per CPU.
Chris Leechdb217332006-06-17 21:24:58 -07004506 */
Dan Williamsd379b012007-07-09 11:56:42 -07004507
4508static void net_dma_rebalance(struct net_dma *net_dma)
Chris Leechdb217332006-06-17 21:24:58 -07004509{
Dan Williamsd379b012007-07-09 11:56:42 -07004510 unsigned int cpu, i, n, chan_idx;
Chris Leechdb217332006-06-17 21:24:58 -07004511 struct dma_chan *chan;
4512
Dan Williamsd379b012007-07-09 11:56:42 -07004513 if (cpus_empty(net_dma->channel_mask)) {
Chris Leechdb217332006-06-17 21:24:58 -07004514 for_each_online_cpu(cpu)
Alexey Dobriyan29bbd722006-08-02 15:02:31 -07004515 rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
Chris Leechdb217332006-06-17 21:24:58 -07004516 return;
4517 }
4518
4519 i = 0;
4520 cpu = first_cpu(cpu_online_map);
4521
Dan Williamsd379b012007-07-09 11:56:42 -07004522 for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
4523 chan = net_dma->channels[chan_idx];
4524
4525 n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4526 + (i < (num_online_cpus() %
4527 cpus_weight(net_dma->channel_mask)) ? 1 : 0));
Chris Leechdb217332006-06-17 21:24:58 -07004528
4529		while (n) {
Alexey Dobriyan29bbd722006-08-02 15:02:31 -07004530 per_cpu(softnet_data, cpu).net_dma = chan;
Chris Leechdb217332006-06-17 21:24:58 -07004531 cpu = next_cpu(cpu, cpu_online_map);
4532 n--;
4533 }
4534 i++;
4535 }
Chris Leechdb217332006-06-17 21:24:58 -07004536}
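
/*
 * Worked example of the split above: with 8 online CPUs and 3 channels,
 * 8 / 3 = 2 remainder 2, so channels 0 and 1 each serve 2 + 1 = 3 CPUs
 * and channel 2 serves 2 (3 + 3 + 2 = 8).
 */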
4537
4538/**
4539 * netdev_dma_event - event callback for the net_dma_client
4540 * @client: should always be net_dma_client
Randy Dunlapf4b8ea72006-06-22 16:00:11 -07004541 * @chan: DMA channel for the event
Randy Dunlap0ed72ec2007-07-26 00:03:29 -07004542 * @state: DMA state to be handled
Chris Leechdb217332006-06-17 21:24:58 -07004543 */
Dan Williamsd379b012007-07-09 11:56:42 -07004544static enum dma_state_client
4545netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4546 enum dma_state state)
Chris Leechdb217332006-06-17 21:24:58 -07004547{
Dan Williamsd379b012007-07-09 11:56:42 -07004548 int i, found = 0, pos = -1;
4549 struct net_dma *net_dma =
4550 container_of(client, struct net_dma, client);
4551 enum dma_state_client ack = DMA_DUP; /* default: take no action */
4552
4553 spin_lock(&net_dma->lock);
4554 switch (state) {
4555 case DMA_RESOURCE_AVAILABLE:
Mike Travis0c0b0ac2008-05-02 16:43:08 -07004556 for (i = 0; i < nr_cpu_ids; i++)
Dan Williamsd379b012007-07-09 11:56:42 -07004557 if (net_dma->channels[i] == chan) {
4558 found = 1;
4559 break;
4560 } else if (net_dma->channels[i] == NULL && pos < 0)
4561 pos = i;
4562
4563 if (!found && pos >= 0) {
4564 ack = DMA_ACK;
4565 net_dma->channels[pos] = chan;
4566 cpu_set(pos, net_dma->channel_mask);
4567 net_dma_rebalance(net_dma);
4568 }
Chris Leechdb217332006-06-17 21:24:58 -07004569 break;
4570 case DMA_RESOURCE_REMOVED:
Mike Travis0c0b0ac2008-05-02 16:43:08 -07004571 for (i = 0; i < nr_cpu_ids; i++)
Dan Williamsd379b012007-07-09 11:56:42 -07004572 if (net_dma->channels[i] == chan) {
4573 found = 1;
4574 pos = i;
4575 break;
4576 }
4577
4578 if (found) {
4579 ack = DMA_ACK;
4580 cpu_clear(pos, net_dma->channel_mask);
4581			net_dma->channels[pos] = NULL;
4582 net_dma_rebalance(net_dma);
4583 }
Chris Leechdb217332006-06-17 21:24:58 -07004584 break;
4585 default:
4586 break;
4587 }
Dan Williamsd379b012007-07-09 11:56:42 -07004588 spin_unlock(&net_dma->lock);
4589
4590 return ack;
Chris Leechdb217332006-06-17 21:24:58 -07004591}
4592
4593/**
4594 * netdev_dma_register - register the networking subsystem as a DMA client
4595 */
4596static int __init netdev_dma_register(void)
4597{
Mike Travis0c0b0ac2008-05-02 16:43:08 -07004598	net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct dma_chan *),
4599 GFP_KERNEL);
4600 if (unlikely(!net_dma.channels)) {
4601 printk(KERN_NOTICE
4602 "netdev_dma: no memory for net_dma.channels\n");
4603 return -ENOMEM;
4604 }
Dan Williamsd379b012007-07-09 11:56:42 -07004605 spin_lock_init(&net_dma.lock);
4606 dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
4607 dma_async_client_register(&net_dma.client);
4608 dma_async_client_chan_request(&net_dma.client);
Chris Leechdb217332006-06-17 21:24:58 -07004609 return 0;
4610}
4611
4612#else
4613static int __init netdev_dma_register(void) { return -ENODEV; }
4614#endif /* CONFIG_NET_DMA */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004615
Herbert Xu7f353bf2007-08-10 15:47:58 -07004616/**
4617 * netdev_compute_features - compute conjunction of two feature sets
4618 * @all: first feature set
4619 * @one: second feature set
4620 *
4621 * Computes a new feature set after adding a device with feature set
4622 * @one to the master device with current feature set @all. Returns
4623 * the new feature set.
4624 */
4625int netdev_compute_features(unsigned long all, unsigned long one)
4626{
4627 /* if device needs checksumming, downgrade to hw checksumming */
4628 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4629 all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4630
4631 /* if device can't do all checksum, downgrade to ipv4/ipv6 */
4632 if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4633 all ^= NETIF_F_HW_CSUM
4634 | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4635
4636 if (one & NETIF_F_GSO)
4637 one |= NETIF_F_GSO_SOFTWARE;
4638 one |= NETIF_F_GSO;
4639
4640 /* If even one device supports robust GSO, enable it for all. */
4641 if (one & NETIF_F_GSO_ROBUST)
4642 all |= NETIF_F_GSO_ROBUST;
4643
4644 all &= one | NETIF_F_LLTX;
4645
4646 if (!(all & NETIF_F_ALL_CSUM))
4647 all &= ~NETIF_F_SG;
4648 if (!(all & NETIF_F_SG))
4649 all &= ~NETIF_F_GSO_MASK;
4650
4651 return all;
4652}
4653EXPORT_SYMBOL(netdev_compute_features);
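
/*
 * Sketch of how a master device (bonding-style; the helper itself is
 * hypothetical) folds in its slaves: intersect each slave's feature set
 * into the master's running mask.
 */
static unsigned long example_master_features(struct net_device *master,
					     struct net_device **slaves, int n)
{
	unsigned long all = master->features;
	int i;

	for (i = 0; i < n; i++)
		all = netdev_compute_features(all, slaves[i]->features);
	return all;
}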
4654
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07004655static struct hlist_head *netdev_create_hash(void)
4656{
4657 int i;
4658 struct hlist_head *hash;
4659
4660 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4661 if (hash != NULL)
4662 for (i = 0; i < NETDEV_HASHENTRIES; i++)
4663 INIT_HLIST_HEAD(&hash[i]);
4664
4665 return hash;
4666}
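
/*
 * Sketch of how these heads are indexed (mirrors the name-hash helper
 * used earlier in this file): hash the interface name, then mask down
 * to one of the NETDEV_HASHENTRIES buckets.
 */
static struct hlist_head *example_name_bucket(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash & (NETDEV_HASHENTRIES - 1)];
}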
4667
Eric W. Biederman881d9662007-09-17 11:56:21 -07004668/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07004669static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07004670{
Eric W. Biederman881d9662007-09-17 11:56:21 -07004671 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07004672
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07004673 net->dev_name_head = netdev_create_hash();
4674 if (net->dev_name_head == NULL)
4675 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004676
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07004677 net->dev_index_head = netdev_create_hash();
4678 if (net->dev_index_head == NULL)
4679 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004680
4681 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07004682
4683err_idx:
4684 kfree(net->dev_name_head);
4685err_name:
4686 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004687}
4688
Pavel Emelyanov46650792007-10-08 20:38:39 -07004689static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07004690{
4691 kfree(net->dev_name_head);
4692 kfree(net->dev_index_head);
4693}
4694
Denis V. Lunev022cbae2007-11-13 03:23:50 -08004695static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07004696 .init = netdev_init,
4697 .exit = netdev_exit,
4698};
4699
Pavel Emelyanov46650792007-10-08 20:38:39 -07004700static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02004701{
4702 struct net_device *dev, *next;
4703 /*
4704	 * Push all migratable network devices back to the
4705 * initial network namespace
4706 */
4707 rtnl_lock();
4708 for_each_netdev_safe(net, dev, next) {
4709 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07004710 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02004711
4712		/* Ignore unmovable devices (e.g. loopback) */
4713 if (dev->features & NETIF_F_NETNS_LOCAL)
4714 continue;
4715
4716		/* Push remaining network devices to init_net */
Pavel Emelyanovaca51392008-05-08 01:24:25 -07004717 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
4718 err = dev_change_net_namespace(dev, &init_net, fb_name);
Eric W. Biedermance286d32007-09-12 13:53:49 +02004719 if (err) {
Pavel Emelyanovaca51392008-05-08 01:24:25 -07004720 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
Eric W. Biedermance286d32007-09-12 13:53:49 +02004721 __func__, dev->name, err);
Pavel Emelyanovaca51392008-05-08 01:24:25 -07004722 BUG();
Eric W. Biedermance286d32007-09-12 13:53:49 +02004723 }
4724 }
4725 rtnl_unlock();
4726}
4727
Denis V. Lunev022cbae2007-11-13 03:23:50 -08004728static struct pernet_operations __net_initdata default_device_ops = {
Eric W. Biedermance286d32007-09-12 13:53:49 +02004729 .exit = default_device_exit,
4730};
4731
Linus Torvalds1da177e2005-04-16 15:20:36 -07004732/*
4733 * Initialize the DEV module. At boot time this walks the device list and
4734 * unhooks any devices that fail to initialise (normally hardware not
4735 * present) and leaves us with a valid list of present and active devices.
4736 *
4737 */
4738
4739/*
4740 * This is called single threaded during boot, so no need
4741 * to take the rtnl semaphore.
4742 */
4743static int __init net_dev_init(void)
4744{
4745 int i, rc = -ENOMEM;
4746
4747 BUG_ON(!dev_boot_phase);
4748
Linus Torvalds1da177e2005-04-16 15:20:36 -07004749 if (dev_proc_init())
4750 goto out;
4751
Eric W. Biederman8b41d182007-09-26 22:02:53 -07004752 if (netdev_kobject_init())
Linus Torvalds1da177e2005-04-16 15:20:36 -07004753 goto out;
4754
4755 INIT_LIST_HEAD(&ptype_all);
Pavel Emelyanov82d8a862007-11-26 20:12:58 +08004756 for (i = 0; i < PTYPE_HASH_SIZE; i++)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004757 INIT_LIST_HEAD(&ptype_base[i]);
4758
Eric W. Biederman881d9662007-09-17 11:56:21 -07004759 if (register_pernet_subsys(&netdev_net_ops))
4760 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004761
Eric W. Biedermance286d32007-09-12 13:53:49 +02004762 if (register_pernet_device(&default_device_ops))
4763 goto out;
4764
Linus Torvalds1da177e2005-04-16 15:20:36 -07004765 /*
4766 * Initialise the packet receive queues.
4767 */
4768
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07004769 for_each_possible_cpu(i) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004770 struct softnet_data *queue;
4771
4772 queue = &per_cpu(softnet_data, i);
4773 skb_queue_head_init(&queue->input_pkt_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004774 queue->completion_queue = NULL;
4775 INIT_LIST_HEAD(&queue->poll_list);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07004776
4777 queue->backlog.poll = process_backlog;
4778 queue->backlog.weight = weight_p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004779 }
4780
Chris Leechdb217332006-06-17 21:24:58 -07004781 netdev_dma_register();
4782
Linus Torvalds1da177e2005-04-16 15:20:36 -07004783 dev_boot_phase = 0;
4784
Carlos R. Mafra962cf362008-05-15 11:15:37 -03004785 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
4786 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004787
4788 hotcpu_notifier(dev_cpu_callback, 0);
4789 dst_init();
4790 dev_mcast_init();
4791 rc = 0;
4792out:
4793 return rc;
4794}
4795
4796subsys_initcall(net_dev_init);
4797
4798EXPORT_SYMBOL(__dev_get_by_index);
4799EXPORT_SYMBOL(__dev_get_by_name);
4800EXPORT_SYMBOL(__dev_remove_pack);
Mitch Williamsc2373ee2005-11-09 10:34:45 -08004801EXPORT_SYMBOL(dev_valid_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004802EXPORT_SYMBOL(dev_add_pack);
4803EXPORT_SYMBOL(dev_alloc_name);
4804EXPORT_SYMBOL(dev_close);
4805EXPORT_SYMBOL(dev_get_by_flags);
4806EXPORT_SYMBOL(dev_get_by_index);
4807EXPORT_SYMBOL(dev_get_by_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004808EXPORT_SYMBOL(dev_open);
4809EXPORT_SYMBOL(dev_queue_xmit);
4810EXPORT_SYMBOL(dev_remove_pack);
4811EXPORT_SYMBOL(dev_set_allmulti);
4812EXPORT_SYMBOL(dev_set_promiscuity);
4813EXPORT_SYMBOL(dev_change_flags);
4814EXPORT_SYMBOL(dev_set_mtu);
4815EXPORT_SYMBOL(dev_set_mac_address);
4816EXPORT_SYMBOL(free_netdev);
4817EXPORT_SYMBOL(netdev_boot_setup_check);
4818EXPORT_SYMBOL(netdev_set_master);
4819EXPORT_SYMBOL(netdev_state_change);
4820EXPORT_SYMBOL(netif_receive_skb);
4821EXPORT_SYMBOL(netif_rx);
4822EXPORT_SYMBOL(register_gifconf);
4823EXPORT_SYMBOL(register_netdevice);
4824EXPORT_SYMBOL(register_netdevice_notifier);
4825EXPORT_SYMBOL(skb_checksum_help);
4826EXPORT_SYMBOL(synchronize_net);
4827EXPORT_SYMBOL(unregister_netdevice);
4828EXPORT_SYMBOL(unregister_netdevice_notifier);
4829EXPORT_SYMBOL(net_enable_timestamp);
4830EXPORT_SYMBOL(net_disable_timestamp);
4831EXPORT_SYMBOL(dev_get_flags);
4832
4833#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
4834EXPORT_SYMBOL(br_handle_frame_hook);
4835EXPORT_SYMBOL(br_fdb_get_hook);
4836EXPORT_SYMBOL(br_fdb_put_hook);
4837#endif
4838
4839#ifdef CONFIG_KMOD
4840EXPORT_SYMBOL(dev_load);
4841#endif
4842
4843EXPORT_PER_CPU_SYMBOL(softnet_data);