/*
 * linux/fs/buffer.c
 *
 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads.  SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required by older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

inline void
init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
	bh->b_end_io = handler;
	bh->b_private = private;
}

static int sync_buffer(void *word)
{
	struct block_device *bd;
	struct buffer_head *bh
		= container_of(word, struct buffer_head, b_state);

	smp_mb();
	bd = bh->b_bdev;
	if (bd)
		blk_run_address_space(bd->bd_inode->i_mapping);
	io_schedule();
	return 0;
}

void __lock_buffer(struct buffer_head *bh)
{
	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void unlock_buffer(struct buffer_head *bh)
{
	clear_bit_unlock(BH_Lock, &bh->b_state);
	smp_mb__after_clear_bit();
	wake_up_bit(&bh->b_state, BH_Lock);
}

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 */
void __wait_on_buffer(struct buffer_head *bh)
{
	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
}
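
/*
 * Illustrative sketch (not part of this file): the usual way a caller
 * drives a synchronous read with the locking primitives above, mirroring
 * __bread_slow() further down.  "bh" is assumed to be a mapped
 * buffer_head the caller already holds a reference on.
 *
 *	lock_buffer(bh);
 *	if (!buffer_uptodate(bh)) {
 *		get_bh(bh);
 *		bh->b_end_io = end_buffer_read_sync;
 *		submit_bh(READ, bh);
 *		wait_on_buffer(bh);
 *		if (!buffer_uptodate(bh))
 *			return -EIO;
 *	} else
 *		unlock_buffer(bh);
 */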

static void
__clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page_cache_release(page);
}

static void buffer_io_error(struct buffer_head *bh)
{
	char b[BDEVNAME_SIZE];

	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
			bdevname(bh->b_bdev, b),
			(unsigned long long)bh->b_blocknr);
}

/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 * unlock the buffer.  This is what ll_rw_block uses too.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
	put_bh(bh);
}

void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];

	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
					bdevname(bh->b_bdev, b));
		}
		set_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	int ret = 0;

	if (bdev)
		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
	return ret;
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = fsync_super(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}

/**
 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * This takes the block device bd_mount_sem to make sure no new mounts
 * happen on bdev until thaw_bdev() is called.
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;

	down(&bdev->bd_mount_sem);
	sb = get_super(bdev);
	if (sb && !(sb->s_flags & MS_RDONLY)) {
		sb->s_frozen = SB_FREEZE_WRITE;
		smp_wmb();

		__fsync_super(sb);

		sb->s_frozen = SB_FREEZE_TRANS;
		smp_wmb();

		sync_blockdev(sb->s_bdev);

		if (sb->s_op->write_super_lockfs)
			sb->s_op->write_super_lockfs(sb);
	}

	sync_blockdev(bdev);
	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev  -- unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:	associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
void thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	if (sb) {
		BUG_ON(sb->s_bdev != bdev);

		if (sb->s_op->unlockfs)
			sb->s_op->unlockfs(sb);
		sb->s_frozen = SB_UNFROZEN;
		smp_wmb();
		wake_up(&sb->s_wait_unfrozen);
		drop_super(sb);
	}

	up(&bdev->bd_mount_sem);
}
EXPORT_SYMBOL(thaw_bdev);
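
/*
 * Illustrative sketch (not part of this file): how a snapshot-style user
 * would typically bracket its work with the pair above.  take_snapshot()
 * is a hypothetical placeholder for whatever reads the frozen device.
 *
 *	struct super_block *sb = freeze_bdev(bdev);
 *	take_snapshot(bdev);
 *	thaw_bdev(bdev, sb);
 */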

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers.  To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * private_lock.
 *
 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 * may be quite high.  This code could TryLock the page, and if that
 * succeeds, there is no need to take private_lock. (But if
 * private_lock is contended then so is mapping->tree_lock).
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
	struct inode *bd_inode = bdev->bd_inode;
	struct address_space *bd_mapping = bd_inode->i_mapping;
	struct buffer_head *ret = NULL;
	pgoff_t index;
	struct buffer_head *bh;
	struct buffer_head *head;
	struct page *page;
	int all_mapped = 1;

	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
	page = find_get_page(bd_mapping, index);
	if (!page)
		goto out;

	spin_lock(&bd_mapping->private_lock);
	if (!page_has_buffers(page))
		goto out_unlock;
	head = page_buffers(page);
	bh = head;
	do {
		if (bh->b_blocknr == block) {
			ret = bh;
			get_bh(bh);
			goto out_unlock;
		}
		if (!buffer_mapped(bh))
			all_mapped = 0;
		bh = bh->b_this_page;
	} while (bh != head);

	/* we might be here because some of the buffers on this page are
	 * not mapped.  This is due to various races between
	 * file io on the block device and getblk.  It gets dealt with
	 * elsewhere, don't buffer_error if we had some unmapped buffers
	 */
	if (all_mapped) {
		printk("__find_get_block_slow() failed. "
			"block=%llu, b_blocknr=%llu\n",
			(unsigned long long)block,
			(unsigned long long)bh->b_blocknr);
		printk("b_state=0x%08lx, b_size=%zu\n",
			bh->b_state, bh->b_size);
		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
	}
out_unlock:
	spin_unlock(&bd_mapping->private_lock);
	page_cache_release(page);
out:
	return ret;
}

/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on.  Trashing dirty data always implies losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allowed to trash
   dirty buffers.  For example ioctl(BLKFLSBUF) expects dirty data to
   be preserved.  These buffers are simply skipped.

   We also skip buffers which are still in use.  For example this can
   happen if a userspace program is reading the block device.

   NOTE: if the user removed a removable-media disk while there was still
   dirty data not synced to disk (due to a bug in the device driver or to
   an error of the user), then by not destroying the dirty buffers we could
   generate corruption also on the next media inserted.  Thus a parameter is
   necessary to handle this case in the safest way possible (trying
   not to corrupt the newly inserted disk with data belonging to
   the old, now corrupted, disk).  Also for the ramdisk the natural thing
   to do in order to release the ramdisk memory is to destroy dirty buffers.

   These are two special cases.  Normal usage implies that the device driver
   issues a sync on the device (without waiting for I/O completion) and
   then an invalidate_buffers call that doesn't trash dirty buffers.

   For handling cache coherency with the blkdev pagecache the 'update' case
   has been introduced.  It is needed to re-read from disk any pinned
   buffer.  NOTE: re-reading from disk is destructive so we can do it only
   when we assume nobody is changing the buffercache under our I/O and when
   we think the disk contains more recent information than the buffercache.
   The update == 1 pass marks the buffers we need to update, the update == 2
   pass does the actual I/O. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0)
		return;

	invalidate_bh_lrus();
	invalidate_mapping_pages(mapping, 0, -1);
}

/*
 * Kick pdflush then try to free up some ZONE_NORMAL memory.
 */
static void free_more_memory(void)
{
	struct zone *zone;
	int nid;

	wakeup_pdflush(1024);
	yield();

	for_each_online_node(nid) {
		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
						gfp_zone(GFP_NOFS), NULL,
						&zone);
		if (zone)
			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
						GFP_NOFS);
	}
}

/*
 * I/O completion handler for block_read_full_page() - pages
 * which come unlocked at the end of I/O.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;
	int page_uptodate = 1;

	BUG_ON(!buffer_async_read(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		clear_buffer_uptodate(bh);
		if (printk_ratelimit())
			buffer_io_error(bh);
		SetPageError(page);
	}

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 */
	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	tmp = bh;
	do {
		if (!buffer_uptodate(tmp))
			page_uptodate = 0;
		if (buffer_async_read(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);

	/*
	 * If none of the buffers had errors and they are all
	 * uptodate then we can set the page uptodate.
	 */
	if (page_uptodate && !PageError(page))
		SetPageUptodate(page);
	unlock_page(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}

/*
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;

	BUG_ON(!buffer_async_write(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (printk_ratelimit()) {
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
					bdevname(bh->b_bdev, b));
		}
		set_bit(AS_EIO, &page->mapping->flags);
		set_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
		SetPageError(page);
	}

	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

	clear_buffer_async_write(bh);
	unlock_buffer(bh);
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (buffer_async_write(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	}
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	end_page_writeback(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}

/*
 * If a page's buffers are under async read-in (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O reads against
 * any of the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read;
	set_buffer_async_read(bh);
}

void mark_buffer_async_write(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_write;
	set_buffer_async_write(bh);
}
EXPORT_SYMBOL(mark_buffer_async_write);

/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for private_list is via the private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space
 * against which the buffers are listed.  So for a particular address_space,
 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 * mapping->private_list will always be protected by the backing blockdev's
 * ->private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->private_list via these
 * utility functions are free to use private_lock and private_list for
 * whatever they want.  The only requirement is that list_empty(private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
	list_del_init(&bh->b_assoc_buffers);
	WARN_ON(!bh->b_assoc_map);
	if (buffer_write_io_error(bh))
		set_bit(AS_EIO, &bh->b_assoc_map->flags);
	bh->b_assoc_map = NULL;
}

int inode_has_buffers(struct inode *inode)
{
	return !list_empty(&inode->i_data.private_list);
}

/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 * you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head *p;
	int err = 0;

	spin_lock(lock);
repeat:
	list_for_each_prev(p, list) {
		bh = BH_ENTRY(p);
		if (buffer_locked(bh)) {
			get_bh(bh);
			spin_unlock(lock);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				err = -EIO;
			brelse(bh);
			spin_lock(lock);
			goto repeat;
		}
	}
	spin_unlock(lock);
	return err;
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
	struct address_space *buffer_mapping = mapping->assoc_mapping;

	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
		return 0;

	return fsync_buffers_list(&buffer_mapping->private_lock,
					&mapping->private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
			sector_t bblock, unsigned blocksize)
{
	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
	if (bh) {
		if (buffer_dirty(bh))
			ll_rw_block(WRITE, 1, &bh);
		put_bh(bh);
	}
}

void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;
	struct address_space *buffer_mapping = bh->b_page->mapping;

	mark_buffer_dirty(bh);
	if (!mapping->assoc_mapping) {
		mapping->assoc_mapping = buffer_mapping;
	} else {
		BUG_ON(mapping->assoc_mapping != buffer_mapping);
	}
	if (!bh->b_assoc_map) {
		spin_lock(&buffer_mapping->private_lock);
		list_move_tail(&bh->b_assoc_buffers,
				&mapping->private_list);
		bh->b_assoc_map = mapping;
		spin_unlock(&buffer_mapping->private_lock);
	}
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);
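
/*
 * Illustrative sketch (not part of this file): the pairing a filesystem
 * such as ext2 relies on.  Metadata buffers are queued as they are
 * dirtied, and the fsync path later flushes that queue:
 *
 *	While updating metadata:
 *		mark_buffer_dirty_inode(bh, inode);
 *
 *	In the filesystem's fsync method:
 *		err = sync_mapping_buffers(inode->i_mapping);
 */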

/*
 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 * dirty.
 *
 * If warn is true, then emit a warning if the page is not uptodate and has
 * not been truncated.
 */
static int __set_page_dirty(struct page *page,
		struct address_space *mapping, int warn)
{
	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	if (TestSetPageDirty(page))
		return 0;

	spin_lock_irq(&mapping->tree_lock);
	if (page->mapping) {	/* Race with truncate? */
		WARN_ON_ONCE(warn && !PageUptodate(page));

		if (mapping_cap_account_dirty(mapping)) {
			__inc_zone_page_state(page, NR_FILE_DIRTY);
			__inc_bdi_stat(mapping->backing_dev_info,
					BDI_RECLAIMABLE);
			task_io_account_write(PAGE_CACHE_SIZE);
		}
		radix_tree_tag_set(&mapping->page_tree,
				page_index(page), PAGECACHE_TAG_DIRTY);
	}
	spin_unlock_irq(&mapping->tree_lock);
	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	return 1;
}

/*
 * Add a page to the dirty page list.
 *
 * It is a sad fact of life that this function is called from several places
 * deeply under spinlocking.  It may not sleep.
 *
 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers.  If the page does
 * not have buffers then when they are later attached they will all be set
 * dirty.
 *
 * The buffers are dirtied before the page is dirtied.  There's a small race
 * window in which a writepage caller may see the page cleanness but not the
 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 * before the buffers, a concurrent writepage caller could clear the page dirty
 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * page on the dirty page list.
 *
 * We use private_lock to lock against try_to_free_buffers while using the
 * page's buffer list.  Also use this to protect against clean buffers being
 * added to the page after it was set dirty.
 *
 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 * address_space though.
 */
int __set_page_dirty_buffers(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			set_buffer_dirty(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	spin_unlock(&mapping->private_lock);

	return __set_page_dirty(page, mapping, 1);
}
EXPORT_SYMBOL(__set_page_dirty_buffers);

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head tmp;
	struct address_space *mapping;
	int err = 0, err2;

	INIT_LIST_HEAD(&tmp);

	spin_lock(lock);
	while (!list_empty(list)) {
		bh = BH_ENTRY(list->next);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh) || buffer_locked(bh)) {
			list_add(&bh->b_assoc_buffers, &tmp);
			bh->b_assoc_map = mapping;
			if (buffer_dirty(bh)) {
				get_bh(bh);
				spin_unlock(lock);
				/*
				 * Ensure any pending I/O completes so that
				 * ll_rw_block() actually writes the current
				 * contents - it is a noop if I/O is still in
				 * flight on potentially older contents.
				 */
				ll_rw_block(SWRITE_SYNC, 1, &bh);
				brelse(bh);
				spin_lock(lock);
			}
		}
	}

	while (!list_empty(&tmp)) {
		bh = BH_ENTRY(tmp.prev);
		get_bh(bh);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh)) {
			list_add(&bh->b_assoc_buffers,
				 &mapping->private_list);
			bh->b_assoc_map = mapping;
		}
		spin_unlock(lock);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			err = -EIO;
		brelse(bh);
		spin_lock(lock);
	}

	spin_unlock(lock);
	err2 = osync_buffers_list(lock, list);
	if (err)
		return err;
	else
		return err2;
}

/*
 * Invalidate any and all dirty buffers on a given inode.  We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync().  Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 * assumes that all the buffers are against the blockdev.  Not true
 * for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
	if (inode_has_buffers(inode)) {
		struct address_space *mapping = &inode->i_data;
		struct list_head *list = &mapping->private_list;
		struct address_space *buffer_mapping = mapping->assoc_mapping;

		spin_lock(&buffer_mapping->private_lock);
		while (!list_empty(list))
			__remove_assoc_queue(BH_ENTRY(list->next));
		spin_unlock(&buffer_mapping->private_lock);
	}
}

/*
 * Remove any clean buffers from the inode's buffer list.  This is called
 * when we're trying to free the inode itself.  Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
 */
int remove_inode_buffers(struct inode *inode)
{
	int ret = 1;

	if (inode_has_buffers(inode)) {
		struct address_space *mapping = &inode->i_data;
		struct list_head *list = &mapping->private_list;
		struct address_space *buffer_mapping = mapping->assoc_mapping;

		spin_lock(&buffer_mapping->private_lock);
		while (!list_empty(list)) {
			struct buffer_head *bh = BH_ENTRY(list->next);
			if (buffer_dirty(bh)) {
				ret = 0;
				break;
			}
			__remove_assoc_queue(bh);
		}
		spin_unlock(&buffer_mapping->private_lock);
	}
	return ret;
}

/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer..  Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 *
 * The retry flag is used to differentiate async IO (paging, swapping)
 * which may not fail from ordinary buffer allocations.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
		int retry)
{
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) >= 0) {
		bh = alloc_buffer_head(GFP_NOFS);
		if (!bh)
			goto no_grow;

		bh->b_bdev = NULL;
		bh->b_this_page = head;
		bh->b_blocknr = -1;
		head = bh;

		bh->b_state = 0;
		atomic_set(&bh->b_count, 0);
		bh->b_private = NULL;
		bh->b_size = size;

		/* Link the buffer to its page */
		set_bh_page(bh, page, offset);

		init_buffer(bh, NULL, NULL);
	}
	return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
	if (head) {
		do {
			bh = head;
			head = head->b_this_page;
			free_buffer_head(bh);
		} while (head);
	}

	/*
	 * Return failure for non-async IO requests.  Async IO requests
	 * are not allowed to fail, so we have to wait until buffer heads
	 * become available.  But we don't want tasks sleeping with
	 * partially complete buffers, so all were released above.
	 */
	if (!retry)
		return NULL;

	/* We're _really_ low on memory. Now we just
	 * wait for old buffer heads to become free due to
	 * finishing IO.  Since this is an async request and
	 * the reserve list is empty, we're sure there are
	 * async buffer heads in use.
	 */
	free_more_memory();
	goto try_again;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);

static inline void
link_dev_buffers(struct page *page, struct buffer_head *head)
{
	struct buffer_head *bh, *tail;

	bh = head;
	do {
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;
	attach_page_buffers(page, head);
}

/*
 * Initialise the state of a blockdev page's buffers.
 */
static void
init_page_buffers(struct page *page, struct block_device *bdev,
			sector_t block, int size)
{
	struct buffer_head *head = page_buffers(page);
	struct buffer_head *bh = head;
	int uptodate = PageUptodate(page);

	do {
		if (!buffer_mapped(bh)) {
			init_buffer(bh, NULL, NULL);
			bh->b_bdev = bdev;
			bh->b_blocknr = block;
			if (uptodate)
				set_buffer_uptodate(bh);
			set_buffer_mapped(bh);
		}
		block++;
		bh = bh->b_this_page;
	} while (bh != head);
}

/*
 * Create the page-cache page that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 */
static struct page *
grow_dev_page(struct block_device *bdev, sector_t block,
		pgoff_t index, int size)
{
	struct inode *inode = bdev->bd_inode;
	struct page *page;
	struct buffer_head *bh;

	page = find_or_create_page(inode->i_mapping, index,
		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
	if (!page)
		return NULL;

	BUG_ON(!PageLocked(page));

	if (page_has_buffers(page)) {
		bh = page_buffers(page);
		if (bh->b_size == size) {
			init_page_buffers(page, bdev, block, size);
			return page;
		}
		if (!try_to_free_buffers(page))
			goto failed;
	}

	/*
	 * Allocate some buffers for this page
	 */
	bh = alloc_page_buffers(page, size, 0);
	if (!bh)
		goto failed;

	/*
	 * Link the page to the buffers and initialise them.  Take the
	 * lock to be atomic wrt __find_get_block(), which does not
	 * run under the page lock.
	 */
	spin_lock(&inode->i_mapping->private_lock);
	link_dev_buffers(page, bh);
	init_page_buffers(page, bdev, block, size);
	spin_unlock(&inode->i_mapping->private_lock);
	return page;

failed:
	BUG();
	unlock_page(page);
	page_cache_release(page);
	return NULL;
}

/*
 * Create buffers for the specified block device block's page.  If
 * that page was dirty, the buffers are set dirty also.
 */
static int
grow_buffers(struct block_device *bdev, sector_t block, int size)
{
	struct page *page;
	pgoff_t index;
	int sizebits;

	sizebits = -1;
	do {
		sizebits++;
	} while ((size << sizebits) < PAGE_SIZE);

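	/*
	 * Worked example (illustrative): with PAGE_SIZE == 4096 and
	 * size == 1024 the loop above ends with sizebits == 2, so the
	 * page at "index" below covers the four 1K blocks whose numbers
	 * share the same value of block >> 2.
	 */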
	index = block >> sizebits;

	/*
	 * Check for a block which wants to lie outside our maximum possible
	 * pagecache index.  (this comparison is done using sector_t types).
	 */
	if (unlikely(index != block >> sizebits)) {
		char b[BDEVNAME_SIZE];

		printk(KERN_ERR "%s: requested out-of-range block %llu for "
			"device %s\n",
			__func__, (unsigned long long)block,
			bdevname(bdev, b));
		return -EIO;
	}
	block = index << sizebits;
	/* Create a page with the proper size buffers.. */
	page = grow_dev_page(bdev, block, index, size);
	if (!page)
		return 0;
	unlock_page(page);
	page_cache_release(page);
	return 1;
}

static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block, int size)
{
	/* Size must be multiple of hard sectorsize */
	if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
			(size < 512 || size > PAGE_SIZE))) {
		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
					size);
		printk(KERN_ERR "hardsect size: %d\n",
					bdev_hardsect_size(bdev));

		dump_stack();
		return NULL;
	}

	for (;;) {
		struct buffer_head *bh;
		int ret;

		bh = __find_get_block(bdev, block, size);
		if (bh)
			return bh;

		ret = grow_buffers(bdev, block, size);
		if (ret < 0)
			return NULL;
		if (ret == 0)
			free_more_memory();
	}
}

/*
 * The relationship between dirty buffers and dirty pages:
 *
 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
 * the page is tagged dirty in its radix tree.
 *
 * At all times, the dirtiness of the buffers represents the dirtiness of
 * subsections of the page.  If the page has buffers, the page dirty bit is
 * merely a hint about the true dirty state.
 *
 * When a page is set dirty in its entirety, all its buffers are marked dirty
 * (if the page has buffers).
 *
 * When a buffer is marked dirty, its page is dirtied, but the page's other
 * buffers are not.
 *
 * Also.  When blockdev buffers are explicitly read with bread(), they
 * individually become uptodate.  But their backing page remains not
 * uptodate - even if all of its buffers are uptodate.  A subsequent
 * block_read_full_page() against that page will discover all the uptodate
 * buffers, will set the page uptodate and will perform no I/O.
 */

/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
 *
 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
 * backing page dirty, then tag the page as dirty in its address_space's radix
 * tree and then attach the address_space's inode to its superblock's dirty
 * inode list.
 *
 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
 * mapping->tree_lock and the global inode_lock.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
	WARN_ON_ONCE(!buffer_uptodate(bh));

	/*
	 * Very *carefully* optimize the it-is-already-dirty case.
	 *
	 * Don't let the final "is it dirty" escape to before we
	 * perhaps modified the buffer.
	 */
	if (buffer_dirty(bh)) {
		smp_mb();
		if (buffer_dirty(bh))
			return;
	}

	if (!test_set_buffer_dirty(bh))
		__set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
}

/*
 * Decrement a buffer_head's reference count.  If all buffers against a page
 * have zero reference count, are clean and unlocked, and if the page is clean
 * and unlocked then try_to_free_buffers() may strip the buffers from the page
 * in preparation for freeing it (sometimes, rarely, buffers are removed from
 * a page but it ends up not being freed, and buffers may later be reattached).
 */
void __brelse(struct buffer_head *buf)
{
	if (atomic_read(&buf->b_count)) {
		put_bh(buf);
		return;
	}
	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}

/*
 * bforget() is like brelse(), except it discards any
 * potentially dirty data.
 */
void __bforget(struct buffer_head *bh)
{
	clear_buffer_dirty(bh);
	if (bh->b_assoc_map) {
		struct address_space *buffer_mapping = bh->b_page->mapping;

		spin_lock(&buffer_mapping->private_lock);
		list_del_init(&bh->b_assoc_buffers);
		bh->b_assoc_map = NULL;
		spin_unlock(&buffer_mapping->private_lock);
	}
	__brelse(bh);
}

static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return bh;
	} else {
		get_bh(bh);
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh);
		wait_on_buffer(bh);
		if (buffer_uptodate(bh))
			return bh;
	}
	brelse(bh);
	return NULL;
}

/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

#define BH_LRU_SIZE	8

struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif

static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
	BUG_ON(irqs_disabled());
#endif
}

/*
 * The LRU management algorithm is dopey-but-simple.  Sorry.
 */
static void bh_lru_install(struct buffer_head *bh)
{
	struct buffer_head *evictee = NULL;
	struct bh_lru *lru;

	check_irqs_on();
	bh_lru_lock();
	lru = &__get_cpu_var(bh_lrus);
	if (lru->bhs[0] != bh) {
		struct buffer_head *bhs[BH_LRU_SIZE];
		int in;
		int out = 0;

		get_bh(bh);
		bhs[out++] = bh;
		for (in = 0; in < BH_LRU_SIZE; in++) {
			struct buffer_head *bh2 = lru->bhs[in];

			if (bh2 == bh) {
				__brelse(bh2);
			} else {
				if (out >= BH_LRU_SIZE) {
					BUG_ON(evictee != NULL);
					evictee = bh2;
				} else {
					bhs[out++] = bh2;
				}
			}
		}
		while (out < BH_LRU_SIZE)
			bhs[out++] = NULL;
		memcpy(lru->bhs, bhs, sizeof(bhs));
	}
	bh_lru_unlock();

	if (evictee)
		__brelse(evictee);
}

/*
 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *ret = NULL;
	struct bh_lru *lru;
	unsigned int i;

	check_irqs_on();
	bh_lru_lock();
	lru = &__get_cpu_var(bh_lrus);
	for (i = 0; i < BH_LRU_SIZE; i++) {
		struct buffer_head *bh = lru->bhs[i];

		if (bh && bh->b_bdev == bdev &&
				bh->b_blocknr == block && bh->b_size == size) {
			if (i) {
				while (i) {
					lru->bhs[i] = lru->bhs[i - 1];
					i--;
				}
				lru->bhs[0] = bh;
			}
			get_bh(bh);
			ret = bh;
			break;
		}
	}
	bh_lru_unlock();
	return ret;
}

/*
 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
 * it in the LRU and mark it as accessed.  If it is not present then return
 * NULL.
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

	if (bh == NULL) {
		bh = __find_get_block_slow(bdev, block);
		if (bh)
			bh_lru_install(bh);
	}
	if (bh)
		touch_buffer(bh);
	return bh;
}
EXPORT_SYMBOL(__find_get_block);

/*
 * __getblk will locate (and, if necessary, create) the buffer_head
 * which corresponds to the passed block_device, block and size.  The
 * returned buffer has its reference count incremented.
 *
 * __getblk() cannot fail - it just keeps trying.  If you pass it an
 * illegal block number, __getblk() will happily return a buffer_head
 * which represents the non-existent block.  Very weird.
 *
 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
 * attempt is failing.  FIXME, perhaps?
 */
struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = __find_get_block(bdev, block, size);

	might_sleep();
	if (bh == NULL)
		bh = __getblk_slow(bdev, block, size);
	return bh;
}
EXPORT_SYMBOL(__getblk);

/*
 * Do async read-ahead on a buffer..
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);
	if (likely(bh)) {
		ll_rw_block(READA, 1, &bh);
		brelse(bh);
	}
}
EXPORT_SYMBOL(__breadahead);

/**
 *  __bread() - reads a specified block and returns the bh
 *  @bdev: the block_device to read from
 *  @block: number of block
 *  @size: size (in bytes) to read
 *
 *  Reads a specified block, and returns buffer head that contains it.
 *  It returns NULL if the block was unreadable.
 */
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	if (likely(bh) && !buffer_uptodate(bh))
		bh = __bread_slow(bh);
	return bh;
}
EXPORT_SYMBOL(__bread);
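
/*
 * Illustrative sketch (not part of this file): a typical read-modify-write
 * of a single metadata block through this interface.  "bdev", "blocknr",
 * "blocksize" and "data" are assumed to be supplied by the caller.
 *
 *	struct buffer_head *bh = __bread(bdev, blocknr, blocksize);
 *	if (!bh)
 *		return -EIO;
 *	memcpy(bh->b_data, data, blocksize);
 *	mark_buffer_dirty(bh);
 *	brelse(bh);
 */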

/*
 * invalidate_bh_lrus() is called rarely - but not only at unmount.
 * This doesn't race because it runs in each cpu either in irq
 * or with preempt disabled.
 */
static void invalidate_bh_lru(void *arg)
{
	struct bh_lru *b = &get_cpu_var(bh_lrus);
	int i;

	for (i = 0; i < BH_LRU_SIZE; i++) {
		brelse(b->bhs[i]);
		b->bhs[i] = NULL;
	}
	put_cpu_var(bh_lrus);
}

void invalidate_bh_lrus(void)
{
	on_each_cpu(invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);

void set_bh_page(struct buffer_head *bh,
		struct page *page, unsigned long offset)
{
	bh->b_page = page;
	BUG_ON(offset >= PAGE_SIZE);
	if (PageHighMem(page))
		/*
		 * This catches illegal uses and preserves the offset:
		 */
		bh->b_data = (char *)(0 + offset);
	else
		bh->b_data = page_address(page) + offset;
}
EXPORT_SYMBOL(set_bh_page);

/*
 * Called when truncating a buffer on a page completely.
 */
static void discard_buffer(struct buffer_head *bh)
{
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	bh->b_bdev = NULL;
	clear_buffer_mapped(bh);
	clear_buffer_req(bh);
	clear_buffer_new(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
	unlock_buffer(bh);
}

/**
 * block_invalidatepage - invalidate part or all of a buffer-backed page
 *
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * block_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * block_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void block_invalidatepage(struct page *page, unsigned long offset)
{
	struct buffer_head *head, *bh, *next;
	unsigned int curr_off = 0;

	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		goto out;

	head = page_buffers(page);
	bh = head;
	do {
		unsigned int next_off = curr_off + bh->b_size;
		next = bh->b_this_page;

		/*
		 * is this block fully invalidated?
		 */
		if (offset <= curr_off)
			discard_buffer(bh);
		curr_off = next_off;
		bh = next;
	} while (bh != head);

	/*
	 * We release buffers only if the entire page is being invalidated.
	 * The get_block cached value has been unconditionally invalidated,
	 * so real IO is not possible anymore.
	 */
	if (offset == 0)
		try_to_release_page(page, 0);
out:
	return;
}
EXPORT_SYMBOL(block_invalidatepage);
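
/*
 * Illustrative sketch (not part of this file): a buffer-backed filesystem
 * can point its address_space_operations at this helper directly, e.g.
 *
 *	static const struct address_space_operations foo_aops = {
 *		.invalidatepage	= block_invalidatepage,
 *	};
 *
 * "foo_aops" is a hypothetical name; the other aops fields are omitted.
 */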
1550
1551/*
1552 * We attach and possibly dirty the buffers atomically wrt
1553 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1554 * is already excluded via the page lock.
1555 */
1556void create_empty_buffers(struct page *page,
1557 unsigned long blocksize, unsigned long b_state)
1558{
1559 struct buffer_head *bh, *head, *tail;
1560
1561 head = alloc_page_buffers(page, blocksize, 1);
1562 bh = head;
1563 do {
1564 bh->b_state |= b_state;
1565 tail = bh;
1566 bh = bh->b_this_page;
1567 } while (bh);
1568 tail->b_this_page = head;
1569
1570 spin_lock(&page->mapping->private_lock);
1571 if (PageUptodate(page) || PageDirty(page)) {
1572 bh = head;
1573 do {
1574 if (PageDirty(page))
1575 set_buffer_dirty(bh);
1576 if (PageUptodate(page))
1577 set_buffer_uptodate(bh);
1578 bh = bh->b_this_page;
1579 } while (bh != head);
1580 }
1581 attach_page_buffers(page, head);
1582 spin_unlock(&page->mapping->private_lock);
1583}
1584EXPORT_SYMBOL(create_empty_buffers);
1585
1586/*
1587 * We are taking a block for data and we don't want any output from any
1588 * buffer-cache aliases starting from return from that function and
1589 * until the moment when something will explicitly mark the buffer
1590 * dirty (hopefully that will not happen until we free that block ;-)
1591 * We don't even need to mark it not-uptodate - nobody can expect
1592 * anything from a newly allocated buffer anyway. We used to use
1593 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1594 * don't want to mark the alias unmapped, for example - it would confuse
1595 * anyone who might pick it with bread() afterwards...
1596 *
1597 * Also.. Note that bforget() doesn't lock the buffer. So there can
1598 * be writeout I/O going on against recently-freed buffers. We don't
1599 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1600 * only if we really need to. That happens here.
1601 */
1602void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1603{
1604 struct buffer_head *old_bh;
1605
1606 might_sleep();
1607
Coywolf Qi Hunt385fd4c2005-11-07 00:59:39 -08001608 old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001609 if (old_bh) {
1610 clear_buffer_dirty(old_bh);
1611 wait_on_buffer(old_bh);
1612 clear_buffer_req(old_bh);
1613 __brelse(old_bh);
1614 }
1615}
1616EXPORT_SYMBOL(unmap_underlying_metadata);
1617
1618/*
1619 * NOTE! All mapped/uptodate combinations are valid:
1620 *
1621 * Mapped Uptodate Meaning
1622 *
1623 * No No "unknown" - must do get_block()
1624 * No Yes "hole" - zero-filled
1625 * Yes No "allocated" - allocated on disk, not read in
1626 * Yes Yes "valid" - allocated and up-to-date in memory.
1627 *
1628 * "Dirty" is valid only with the last case (mapped+uptodate).
1629 */
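
/*
 * Illustrative sketch: the table above expressed with the bh test helpers.
 * A "hole" needs no I/O - its data is all zeroes - while "allocated" means
 * the block exists on disk but has not been read into memory yet.
 */
static inline int bh_is_hole_sketch(const struct buffer_head *bh)
{
	return !buffer_mapped(bh) && buffer_uptodate(bh);
}

static inline int bh_needs_read_sketch(const struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_uptodate(bh);
}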
1630
1631/*
1632 * While block_write_full_page is writing back the dirty buffers under
1633 * the page lock, whoever dirtied the buffers may decide to clean them
1634 * again at any time. We handle that by only looking at the buffer
1635 * state inside lock_buffer().
1636 *
1637 * If block_write_full_page() is called for regular writeback
1638 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1639 * locked buffer. This can only happen if someone has written the buffer
1640 * directly, with submit_bh(). At the address_space level PageWriteback
1641 * prevents this contention from occurring.
1642 */
1643static int __block_write_full_page(struct inode *inode, struct page *page,
1644 get_block_t *get_block, struct writeback_control *wbc)
1645{
1646 int err;
1647 sector_t block;
1648 sector_t last_block;
Andrew Mortonf0fbd5f2005-05-05 16:15:48 -07001649 struct buffer_head *bh, *head;
Badari Pulavartyb0cf2322006-03-26 01:38:00 -08001650 const unsigned blocksize = 1 << inode->i_blkbits;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001651 int nr_underway = 0;
1652
1653 BUG_ON(!PageLocked(page));
1654
1655 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1656
1657 if (!page_has_buffers(page)) {
Badari Pulavartyb0cf2322006-03-26 01:38:00 -08001658 create_empty_buffers(page, blocksize,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001659 (1 << BH_Dirty)|(1 << BH_Uptodate));
1660 }
1661
1662 /*
1663 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1664 * here, and the (potentially unmapped) buffers may become dirty at
1665 * any time. If a buffer becomes dirty here after we've inspected it
1666 * then we just miss that fact, and the page stays dirty.
1667 *
1668 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1669 * handle that here by just cleaning them.
1670 */
1671
Andrew Morton54b21a72006-01-08 01:03:05 -08001672 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001673 head = page_buffers(page);
1674 bh = head;
1675
1676 /*
1677 * Get all the dirty buffers mapped to disk addresses and
1678 * handle any aliases from the underlying blockdev's mapping.
1679 */
1680 do {
1681 if (block > last_block) {
1682 /*
1683 * mapped buffers outside i_size will occur, because
1684 * this page can be outside i_size when there is a
1685 * truncate in progress.
1686 */
1687 /*
1688 * The buffer was zeroed by block_write_full_page()
1689 */
1690 clear_buffer_dirty(bh);
1691 set_buffer_uptodate(bh);
Alex Tomas29a814d2008-07-11 19:27:31 -04001692 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1693 buffer_dirty(bh)) {
Badari Pulavartyb0cf2322006-03-26 01:38:00 -08001694 WARN_ON(bh->b_size != blocksize);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001695 err = get_block(inode, block, bh, 1);
1696 if (err)
1697 goto recover;
Alex Tomas29a814d2008-07-11 19:27:31 -04001698 clear_buffer_delay(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001699 if (buffer_new(bh)) {
1700 /* blockdev mappings never come here */
1701 clear_buffer_new(bh);
1702 unmap_underlying_metadata(bh->b_bdev,
1703 bh->b_blocknr);
1704 }
1705 }
1706 bh = bh->b_this_page;
1707 block++;
1708 } while (bh != head);
1709
1710 do {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001711 if (!buffer_mapped(bh))
1712 continue;
1713 /*
1714 * If it's a fully non-blocking write attempt and we cannot
1715 * lock the buffer then redirty the page. Note that this can
1716 * potentially cause a busy-wait loop from pdflush and kswapd
1717 * activity, but those code paths have their own higher-level
1718 * throttling.
1719 */
1720 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1721 lock_buffer(bh);
Nick Pigginca5de402008-08-02 12:02:13 +02001722 } else if (!trylock_buffer(bh)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001723 redirty_page_for_writepage(wbc, page);
1724 continue;
1725 }
1726 if (test_clear_buffer_dirty(bh)) {
1727 mark_buffer_async_write(bh);
1728 } else {
1729 unlock_buffer(bh);
1730 }
1731 } while ((bh = bh->b_this_page) != head);
1732
1733 /*
1734 * The page and its buffers are protected by PageWriteback(), so we can
1735 * drop the bh refcounts early.
1736 */
1737 BUG_ON(PageWriteback(page));
1738 set_page_writeback(page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001739
1740 do {
1741 struct buffer_head *next = bh->b_this_page;
1742 if (buffer_async_write(bh)) {
1743 submit_bh(WRITE, bh);
1744 nr_underway++;
1745 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001746 bh = next;
1747 } while (bh != head);
Andrew Morton05937ba2005-05-05 16:15:47 -07001748 unlock_page(page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001749
1750 err = 0;
1751done:
1752 if (nr_underway == 0) {
1753 /*
1754 * The page was marked dirty, but the buffers were
1755 * clean. Someone wrote them back by hand with
1756 * ll_rw_block/submit_bh. A rare case.
1757 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001758 end_page_writeback(page);
Nick Piggin3d67f2d2007-05-06 14:49:05 -07001759
Linus Torvalds1da177e2005-04-16 15:20:36 -07001760 /*
1761 * The page and buffer_heads can be released at any time from
1762 * here on.
1763 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001764 }
1765 return err;
1766
1767recover:
1768 /*
1769 * ENOSPC, or some other error. We may already have added some
1770 * blocks to the file, so we need to write these out to avoid
1771 * exposing stale data.
1772 * The page is currently locked and not marked for writeback.
1773 */
1774 bh = head;
1775 /* Recovery: lock and submit the mapped buffers */
1776 do {
Alex Tomas29a814d2008-07-11 19:27:31 -04001777 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1778 !buffer_delay(bh)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001779 lock_buffer(bh);
1780 mark_buffer_async_write(bh);
1781 } else {
1782 /*
1783 * The buffer may have been set dirty during
1784 * attachment to a dirty page.
1785 */
1786 clear_buffer_dirty(bh);
1787 }
1788 } while ((bh = bh->b_this_page) != head);
1789 SetPageError(page);
1790 BUG_ON(PageWriteback(page));
Andrew Morton7e4c3692007-05-08 00:23:27 -07001791 mapping_set_error(page->mapping, err);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001792 set_page_writeback(page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001793 do {
1794 struct buffer_head *next = bh->b_this_page;
1795 if (buffer_async_write(bh)) {
1796 clear_buffer_dirty(bh);
1797 submit_bh(WRITE, bh);
1798 nr_underway++;
1799 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001800 bh = next;
1801 } while (bh != head);
Nick Pigginffda9d32007-02-20 13:57:54 -08001802 unlock_page(page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001803 goto done;
1804}
1805
Nick Pigginafddba42007-10-16 01:25:01 -07001806/*
1807 * If a page has any new buffers, zero them out here, and mark them uptodate
1808 * and dirty so they'll be written out (in order to prevent uninitialised
1809 * block data from leaking). And clear the new bit.
1810 */
1811void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1812{
1813 unsigned int block_start, block_end;
1814 struct buffer_head *head, *bh;
1815
1816 BUG_ON(!PageLocked(page));
1817 if (!page_has_buffers(page))
1818 return;
1819
1820 bh = head = page_buffers(page);
1821 block_start = 0;
1822 do {
1823 block_end = block_start + bh->b_size;
1824
1825 if (buffer_new(bh)) {
1826 if (block_end > from && block_start < to) {
1827 if (!PageUptodate(page)) {
1828 unsigned start, size;
1829
1830 start = max(from, block_start);
1831 size = min(to, block_end) - start;
1832
Christoph Lametereebd2aa2008-02-04 22:28:29 -08001833 zero_user(page, start, size);
Nick Pigginafddba42007-10-16 01:25:01 -07001834 set_buffer_uptodate(bh);
1835 }
1836
1837 clear_buffer_new(bh);
1838 mark_buffer_dirty(bh);
1839 }
1840 }
1841
1842 block_start = block_end;
1843 bh = bh->b_this_page;
1844 } while (bh != head);
1845}
1846EXPORT_SYMBOL(page_zero_new_buffers);
1847
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848static int __block_prepare_write(struct inode *inode, struct page *page,
1849 unsigned from, unsigned to, get_block_t *get_block)
1850{
1851 unsigned block_start, block_end;
1852 sector_t block;
1853 int err = 0;
1854 unsigned blocksize, bbits;
1855 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1856
1857 BUG_ON(!PageLocked(page));
1858 BUG_ON(from > PAGE_CACHE_SIZE);
1859 BUG_ON(to > PAGE_CACHE_SIZE);
1860 BUG_ON(from > to);
1861
1862 blocksize = 1 << inode->i_blkbits;
1863 if (!page_has_buffers(page))
1864 create_empty_buffers(page, blocksize, 0);
1865 head = page_buffers(page);
1866
1867 bbits = inode->i_blkbits;
1868 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1869
1870 for(bh = head, block_start = 0; bh != head || !block_start;
1871 block++, block_start=block_end, bh = bh->b_this_page) {
1872 block_end = block_start + blocksize;
1873 if (block_end <= from || block_start >= to) {
1874 if (PageUptodate(page)) {
1875 if (!buffer_uptodate(bh))
1876 set_buffer_uptodate(bh);
1877 }
1878 continue;
1879 }
1880 if (buffer_new(bh))
1881 clear_buffer_new(bh);
1882 if (!buffer_mapped(bh)) {
Badari Pulavartyb0cf2322006-03-26 01:38:00 -08001883 WARN_ON(bh->b_size != blocksize);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001884 err = get_block(inode, block, bh, 1);
1885 if (err)
Nick Pigginf3ddbdc2005-05-05 16:15:45 -07001886 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001887 if (buffer_new(bh)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001888 unmap_underlying_metadata(bh->b_bdev,
1889 bh->b_blocknr);
1890 if (PageUptodate(page)) {
Nick Piggin637aff42007-10-16 01:25:00 -07001891 clear_buffer_new(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001892 set_buffer_uptodate(bh);
Nick Piggin637aff42007-10-16 01:25:00 -07001893 mark_buffer_dirty(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001894 continue;
1895 }
Christoph Lametereebd2aa2008-02-04 22:28:29 -08001896 if (block_end > to || block_start < from)
1897 zero_user_segments(page,
1898 to, block_end,
1899 block_start, from);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001900 continue;
1901 }
1902 }
1903 if (PageUptodate(page)) {
1904 if (!buffer_uptodate(bh))
1905 set_buffer_uptodate(bh);
1906 continue;
1907 }
1908 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
David Chinner33a266d2007-02-12 00:51:41 -08001909 !buffer_unwritten(bh) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001910 (block_start < from || block_end > to)) {
1911 ll_rw_block(READ, 1, &bh);
1912 *wait_bh++=bh;
1913 }
1914 }
1915 /*
1916 * If we issued read requests - let them complete.
1917 */
1918 while(wait_bh > wait) {
1919 wait_on_buffer(*--wait_bh);
1920 if (!buffer_uptodate(*wait_bh))
Nick Pigginf3ddbdc2005-05-05 16:15:45 -07001921 err = -EIO;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001922 }
Nick Pigginafddba42007-10-16 01:25:01 -07001923 if (unlikely(err))
1924 page_zero_new_buffers(page, from, to);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001925 return err;
1926}
1927
1928static int __block_commit_write(struct inode *inode, struct page *page,
1929 unsigned from, unsigned to)
1930{
1931 unsigned block_start, block_end;
1932 int partial = 0;
1933 unsigned blocksize;
1934 struct buffer_head *bh, *head;
1935
1936 blocksize = 1 << inode->i_blkbits;
1937
1938 for(bh = head = page_buffers(page), block_start = 0;
1939 bh != head || !block_start;
1940 block_start=block_end, bh = bh->b_this_page) {
1941 block_end = block_start + blocksize;
1942 if (block_end <= from || block_start >= to) {
1943 if (!buffer_uptodate(bh))
1944 partial = 1;
1945 } else {
1946 set_buffer_uptodate(bh);
1947 mark_buffer_dirty(bh);
1948 }
Nick Pigginafddba42007-10-16 01:25:01 -07001949 clear_buffer_new(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001950 }
1951
1952 /*
1953 * If this is a partial write which happened to make all buffers
1954 * uptodate then we can optimize away a bogus readpage() for
1955 * the next read(). Here we 'discover' whether the page went
1956 * uptodate as a result of this (potentially partial) write.
1957 */
1958 if (!partial)
1959 SetPageUptodate(page);
1960 return 0;
1961}
1962
1963/*
Nick Pigginafddba42007-10-16 01:25:01 -07001964 * block_write_begin takes care of the basic task of block allocation and
1965 * bringing partial write blocks uptodate first.
1966 *
1967 * If *pagep is not NULL, then block_write_begin uses the locked page
1968 * at *pagep rather than allocating its own. In this case, the page will
1969 * not be unlocked or deallocated on failure.
1970 */
1971int block_write_begin(struct file *file, struct address_space *mapping,
1972 loff_t pos, unsigned len, unsigned flags,
1973 struct page **pagep, void **fsdata,
1974 get_block_t *get_block)
1975{
1976 struct inode *inode = mapping->host;
1977 int status = 0;
1978 struct page *page;
1979 pgoff_t index;
1980 unsigned start, end;
1981 int ownpage = 0;
1982
1983 index = pos >> PAGE_CACHE_SHIFT;
1984 start = pos & (PAGE_CACHE_SIZE - 1);
1985 end = start + len;
1986
1987 page = *pagep;
1988 if (page == NULL) {
1989 ownpage = 1;
1990 page = __grab_cache_page(mapping, index);
1991 if (!page) {
1992 status = -ENOMEM;
1993 goto out;
1994 }
1995 *pagep = page;
1996 } else
1997 BUG_ON(!PageLocked(page));
1998
1999 status = __block_prepare_write(inode, page, start, end, get_block);
2000 if (unlikely(status)) {
2001 ClearPageUptodate(page);
2002
2003 if (ownpage) {
2004 unlock_page(page);
2005 page_cache_release(page);
2006 *pagep = NULL;
2007
2008 /*
2009 * prepare_write() may have instantiated a few blocks
2010 * outside i_size. Trim these off again. Don't need
2011 * i_size_read because we hold i_mutex.
2012 */
2013 if (pos + len > inode->i_size)
2014 vmtruncate(inode, inode->i_size);
2015 }
2016 goto out;
2017 }
2018
2019out:
2020 return status;
2021}
2022EXPORT_SYMBOL(block_write_begin);
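
/*
 * Illustrative sketch (not part of buffer.c): the usual way a filesystem
 * plugs its block mapper into the generic write path is a thin ->write_begin
 * wrapper around block_write_begin().  Setting *pagep to NULL asks the helper
 * to grab and lock the page itself.  "examplefs" and examplefs_get_block()
 * are hypothetical; the get_block definition itself is omitted.
 */
int examplefs_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create);

static int examplefs_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	*pagep = NULL;
	return block_write_begin(file, mapping, pos, len, flags,
				 pagep, fsdata, examplefs_get_block);
}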
2023
2024int block_write_end(struct file *file, struct address_space *mapping,
2025 loff_t pos, unsigned len, unsigned copied,
2026 struct page *page, void *fsdata)
2027{
2028 struct inode *inode = mapping->host;
2029 unsigned start;
2030
2031 start = pos & (PAGE_CACHE_SIZE - 1);
2032
2033 if (unlikely(copied < len)) {
2034 /*
2035 * The buffers that were written will now be uptodate, so we
2036 * don't have to worry about a readpage reading them and
2037 * overwriting a partial write. However if we have encountered
2038 * a short write and only partially written into a buffer, it
2039 * will not be marked uptodate, so a readpage might come in and
2040 * destroy our partial write.
2041 *
2042 * Do the simplest thing, and just treat any short write to a
2043 * non uptodate page as a zero-length write, and force the
2044 * caller to redo the whole thing.
2045 */
2046 if (!PageUptodate(page))
2047 copied = 0;
2048
2049 page_zero_new_buffers(page, start+copied, start+len);
2050 }
2051 flush_dcache_page(page);
2052
2053 /* This could be a short (even 0-length) commit */
2054 __block_commit_write(inode, page, start, start+copied);
2055
2056 return copied;
2057}
2058EXPORT_SYMBOL(block_write_end);
2059
2060int generic_write_end(struct file *file, struct address_space *mapping,
2061 loff_t pos, unsigned len, unsigned copied,
2062 struct page *page, void *fsdata)
2063{
2064 struct inode *inode = mapping->host;
Jan Karac7d206b2008-07-11 19:27:31 -04002065 int i_size_changed = 0;
Nick Pigginafddba42007-10-16 01:25:01 -07002066
2067 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2068
2069 /*
2070 * No need to use i_size_read() here, the i_size
2071 * cannot change under us because we hold i_mutex.
2072 *
2073 * But it's important to update i_size while still holding page lock:
2074 * page writeout could otherwise come in and zero beyond i_size.
2075 */
2076 if (pos+copied > inode->i_size) {
2077 i_size_write(inode, pos+copied);
Jan Karac7d206b2008-07-11 19:27:31 -04002078 i_size_changed = 1;
Nick Pigginafddba42007-10-16 01:25:01 -07002079 }
2080
2081 unlock_page(page);
2082 page_cache_release(page);
2083
Jan Karac7d206b2008-07-11 19:27:31 -04002084 /*
2085 * Don't mark the inode dirty under page lock. First, it unnecessarily
2086 * makes the holding time of page lock longer. Second, it forces lock
2087 * ordering of page lock and transaction start for journaling
2088 * filesystems.
2089 */
2090 if (i_size_changed)
2091 mark_inode_dirty(inode);
2092
Nick Pigginafddba42007-10-16 01:25:01 -07002093 return copied;
2094}
2095EXPORT_SYMBOL(generic_write_end);
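
/*
 * Illustrative sketch, continuing the hypothetical "examplefs": with a
 * ->write_begin wrapper like the one sketched after block_write_begin(),
 * the rest of the buffered write path can point straight at the generic
 * helpers in this file.
 */
static const struct address_space_operations examplefs_aops_sketch = {
	.write_begin	= examplefs_write_begin,
	.write_end	= generic_write_end,
	.sync_page	= block_sync_page,
};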
2096
2097/*
Hisashi Hifumi8ab22b92008-07-28 15:46:36 -07002098 * block_is_partially_uptodate checks whether buffers within a page are
2099 * uptodate or not.
2100 *
2101 * Returns true if all buffers which correspond to a file portion
2102 * we want to read are uptodate.
2103 */
2104int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2105 unsigned long from)
2106{
2107 struct inode *inode = page->mapping->host;
2108 unsigned block_start, block_end, blocksize;
2109 unsigned to;
2110 struct buffer_head *bh, *head;
2111 int ret = 1;
2112
2113 if (!page_has_buffers(page))
2114 return 0;
2115
2116 blocksize = 1 << inode->i_blkbits;
2117 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2118 to = from + to;
2119 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2120 return 0;
2121
2122 head = page_buffers(page);
2123 bh = head;
2124 block_start = 0;
2125 do {
2126 block_end = block_start + blocksize;
2127 if (block_end > from && block_start < to) {
2128 if (!buffer_uptodate(bh)) {
2129 ret = 0;
2130 break;
2131 }
2132 if (block_end >= to)
2133 break;
2134 }
2135 block_start = block_end;
2136 bh = bh->b_this_page;
2137 } while (bh != head);
2138
2139 return ret;
2140}
2141EXPORT_SYMBOL(block_is_partially_uptodate);
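
/*
 * Illustrative sketch: block_is_partially_uptodate() already has the
 * ->is_partially_uptodate signature, so wiring it up is a single initializer
 * in the filesystem's address_space_operations (here imagined as part of the
 * hypothetical "examplefs" aops sketched earlier):
 *
 *	.is_partially_uptodate	= block_is_partially_uptodate,
 *
 * With that in place a read can be satisfied from a partially uptodate page
 * without waiting for the whole page to be brought uptodate.
 */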
2142
2143/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002144 * Generic "read page" function for block devices that have the normal
2145 * get_block functionality. This is most of the block device filesystems.
2146 * Reads the page asynchronously --- the unlock_buffer() and
2147 * set/clear_buffer_uptodate() functions propagate buffer state into the
2148 * page struct once IO has completed.
2149 */
2150int block_read_full_page(struct page *page, get_block_t *get_block)
2151{
2152 struct inode *inode = page->mapping->host;
2153 sector_t iblock, lblock;
2154 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2155 unsigned int blocksize;
2156 int nr, i;
2157 int fully_mapped = 1;
2158
Matt Mackallcd7619d2005-05-01 08:59:01 -07002159 BUG_ON(!PageLocked(page));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002160 blocksize = 1 << inode->i_blkbits;
2161 if (!page_has_buffers(page))
2162 create_empty_buffers(page, blocksize, 0);
2163 head = page_buffers(page);
2164
2165 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2166 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2167 bh = head;
2168 nr = 0;
2169 i = 0;
2170
2171 do {
2172 if (buffer_uptodate(bh))
2173 continue;
2174
2175 if (!buffer_mapped(bh)) {
Andrew Mortonc64610b2005-05-16 21:53:49 -07002176 int err = 0;
2177
Linus Torvalds1da177e2005-04-16 15:20:36 -07002178 fully_mapped = 0;
2179 if (iblock < lblock) {
Badari Pulavartyb0cf2322006-03-26 01:38:00 -08002180 WARN_ON(bh->b_size != blocksize);
Andrew Mortonc64610b2005-05-16 21:53:49 -07002181 err = get_block(inode, iblock, bh, 0);
2182 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002183 SetPageError(page);
2184 }
2185 if (!buffer_mapped(bh)) {
Christoph Lametereebd2aa2008-02-04 22:28:29 -08002186 zero_user(page, i * blocksize, blocksize);
Andrew Mortonc64610b2005-05-16 21:53:49 -07002187 if (!err)
2188 set_buffer_uptodate(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002189 continue;
2190 }
2191 /*
2192 * get_block() might have updated the buffer
2193 * synchronously
2194 */
2195 if (buffer_uptodate(bh))
2196 continue;
2197 }
2198 arr[nr++] = bh;
2199 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2200
2201 if (fully_mapped)
2202 SetPageMappedToDisk(page);
2203
2204 if (!nr) {
2205 /*
2206 * All buffers are uptodate - we can set the page uptodate
2207 * as well. But not if get_block() returned an error.
2208 */
2209 if (!PageError(page))
2210 SetPageUptodate(page);
2211 unlock_page(page);
2212 return 0;
2213 }
2214
2215 /* Stage two: lock the buffers */
2216 for (i = 0; i < nr; i++) {
2217 bh = arr[i];
2218 lock_buffer(bh);
2219 mark_buffer_async_read(bh);
2220 }
2221
2222 /*
2223 * Stage 3: start the IO. Check for uptodateness
2224 * inside the buffer lock in case another process reading
2225 * the underlying blockdev brought it uptodate (the sct fix).
2226 */
2227 for (i = 0; i < nr; i++) {
2228 bh = arr[i];
2229 if (buffer_uptodate(bh))
2230 end_buffer_async_read(bh, 1);
2231 else
2232 submit_bh(READ, bh);
2233 }
2234 return 0;
2235}
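
/*
 * Illustrative sketch (hypothetical "examplefs"): ->readpage is normally just
 * block_read_full_page() with the filesystem's get_block plugged in;
 * examplefs_get_block() is the assumed block mapper declared earlier.
 */
static int examplefs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, examplefs_get_block);
}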
2236
2237/* utility function for filesystems that need to do work on expanding
Nick Piggin89e10782007-10-16 01:25:07 -07002238 * truncates. Uses filesystem pagecache writes to allow the filesystem to
Linus Torvalds1da177e2005-04-16 15:20:36 -07002239 * deal with the hole.
2240 */
Nick Piggin89e10782007-10-16 01:25:07 -07002241int generic_cont_expand_simple(struct inode *inode, loff_t size)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002242{
2243 struct address_space *mapping = inode->i_mapping;
2244 struct page *page;
Nick Piggin89e10782007-10-16 01:25:07 -07002245 void *fsdata;
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002246 unsigned long limit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002247 int err;
2248
2249 err = -EFBIG;
2250 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2251 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2252 send_sig(SIGXFSZ, current, 0);
2253 goto out;
2254 }
2255 if (size > inode->i_sb->s_maxbytes)
2256 goto out;
2257
Nick Piggin89e10782007-10-16 01:25:07 -07002258 err = pagecache_write_begin(NULL, mapping, size, 0,
2259 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2260 &page, &fsdata);
2261 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002262 goto out;
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002263
Nick Piggin89e10782007-10-16 01:25:07 -07002264 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2265 BUG_ON(err > 0);
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002266
Linus Torvalds1da177e2005-04-16 15:20:36 -07002267out:
2268 return err;
2269}
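
/*
 * Illustrative sketch (hypothetical "examplefs"): a typical caller is the
 * filesystem's ->setattr method, growing the file through the page cache
 * before committing the new attributes.  This is an assumed usage pattern,
 * not code lifted from any in-tree filesystem.
 */
static int examplefs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int err;

	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) {
		err = generic_cont_expand_simple(inode, attr->ia_size);
		if (err)
			return err;
	}
	return inode_setattr(inode, attr);
}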
2270
Adrian Bunkf1e3af72008-04-29 00:59:01 -07002271static int cont_expand_zero(struct file *file, struct address_space *mapping,
2272 loff_t pos, loff_t *bytes)
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002273{
Nick Piggin89e10782007-10-16 01:25:07 -07002274 struct inode *inode = mapping->host;
2275 unsigned blocksize = 1 << inode->i_blkbits;
2276 struct page *page;
2277 void *fsdata;
2278 pgoff_t index, curidx;
2279 loff_t curpos;
2280 unsigned zerofrom, offset, len;
2281 int err = 0;
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002282
Nick Piggin89e10782007-10-16 01:25:07 -07002283 index = pos >> PAGE_CACHE_SHIFT;
2284 offset = pos & ~PAGE_CACHE_MASK;
2285
2286 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2287 zerofrom = curpos & ~PAGE_CACHE_MASK;
2288 if (zerofrom & (blocksize-1)) {
2289 *bytes |= (blocksize-1);
2290 (*bytes)++;
2291 }
2292 len = PAGE_CACHE_SIZE - zerofrom;
2293
2294 err = pagecache_write_begin(file, mapping, curpos, len,
2295 AOP_FLAG_UNINTERRUPTIBLE,
2296 &page, &fsdata);
2297 if (err)
2298 goto out;
Christoph Lametereebd2aa2008-02-04 22:28:29 -08002299 zero_user(page, zerofrom, len);
Nick Piggin89e10782007-10-16 01:25:07 -07002300 err = pagecache_write_end(file, mapping, curpos, len, len,
2301 page, fsdata);
2302 if (err < 0)
2303 goto out;
2304 BUG_ON(err != len);
2305 err = 0;
OGAWA Hirofumi061e9742008-04-28 02:16:28 -07002306
2307 balance_dirty_pages_ratelimited(mapping);
Nick Piggin89e10782007-10-16 01:25:07 -07002308 }
2309
2310 /* page covers the boundary, find the boundary offset */
2311 if (index == curidx) {
2312 zerofrom = curpos & ~PAGE_CACHE_MASK;
2313 /* if we are expanding the file, the last block will be filled */
2314 if (offset <= zerofrom) {
2315 goto out;
2316 }
2317 if (zerofrom & (blocksize-1)) {
2318 *bytes |= (blocksize-1);
2319 (*bytes)++;
2320 }
2321 len = offset - zerofrom;
2322
2323 err = pagecache_write_begin(file, mapping, curpos, len,
2324 AOP_FLAG_UNINTERRUPTIBLE,
2325 &page, &fsdata);
2326 if (err)
2327 goto out;
Christoph Lametereebd2aa2008-02-04 22:28:29 -08002328 zero_user(page, zerofrom, len);
Nick Piggin89e10782007-10-16 01:25:07 -07002329 err = pagecache_write_end(file, mapping, curpos, len, len,
2330 page, fsdata);
2331 if (err < 0)
2332 goto out;
2333 BUG_ON(err != len);
2334 err = 0;
2335 }
2336out:
2337 return err;
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002338}
2339
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340/*
2341 * For moronic filesystems that do not allow holes in files.
2342 * We may have to extend the file.
2343 */
Nick Piggin89e10782007-10-16 01:25:07 -07002344int cont_write_begin(struct file *file, struct address_space *mapping,
2345 loff_t pos, unsigned len, unsigned flags,
2346 struct page **pagep, void **fsdata,
2347 get_block_t *get_block, loff_t *bytes)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002348{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002349 struct inode *inode = mapping->host;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002350 unsigned blocksize = 1 << inode->i_blkbits;
Nick Piggin89e10782007-10-16 01:25:07 -07002351 unsigned zerofrom;
2352 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002353
Nick Piggin89e10782007-10-16 01:25:07 -07002354 err = cont_expand_zero(file, mapping, pos, bytes);
2355 if (err)
2356 goto out;
2357
2358 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2359 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2360 *bytes |= (blocksize-1);
2361 (*bytes)++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002362 }
2363
Nick Piggin89e10782007-10-16 01:25:07 -07002364 *pagep = NULL;
2365 err = block_write_begin(file, mapping, pos, len,
2366 flags, pagep, fsdata, get_block);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002367out:
Nick Piggin89e10782007-10-16 01:25:07 -07002368 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002369}
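
/*
 * Illustrative sketch (hypothetical "examplefs"): a filesystem without holes
 * keeps an "allocated and zeroed up to here" watermark in its in-core inode
 * and passes its address as @bytes.  The inode-info layout and the field
 * name are assumptions made for the example.
 */
struct examplefs_inode_info {
	loff_t		i_allocated_bytes;	/* hypothetical watermark */
	struct inode	vfs_inode;
};

static inline struct examplefs_inode_info *EXAMPLEFS_I(struct inode *inode)
{
	return container_of(inode, struct examplefs_inode_info, vfs_inode);
}

static int examplefs_cont_write_begin(struct file *file,
				      struct address_space *mapping,
				      loff_t pos, unsigned len, unsigned flags,
				      struct page **pagep, void **fsdata)
{
	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				examplefs_get_block,
				&EXAMPLEFS_I(mapping->host)->i_allocated_bytes);
}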
2370
2371int block_prepare_write(struct page *page, unsigned from, unsigned to,
2372 get_block_t *get_block)
2373{
2374 struct inode *inode = page->mapping->host;
2375 int err = __block_prepare_write(inode, page, from, to, get_block);
2376 if (err)
2377 ClearPageUptodate(page);
2378 return err;
2379}
2380
2381int block_commit_write(struct page *page, unsigned from, unsigned to)
2382{
2383 struct inode *inode = page->mapping->host;
2384 __block_commit_write(inode,page,from,to);
2385 return 0;
2386}
2387
David Chinner54171692007-07-19 17:39:55 +10002388/*
2389 * block_page_mkwrite() is not allowed to change the file size as it gets
2390 * called from a page fault handler when a page is first dirtied. Hence we must
2391 * be careful to check for EOF conditions here. We set the page up correctly
2392 * for a written page which means we get ENOSPC checking when writing into
2393 * holes and correct delalloc and unwritten extent mapping on filesystems that
2394 * support these features.
2395 *
2396 * We are not allowed to take the i_mutex here so we have to play games to
2397 * protect against truncate races as the page could now be beyond EOF. Because
2398 * vmtruncate() writes the inode size before removing pages, once we have the
2399 * page lock we can determine safely if the page is beyond EOF. If it is not
2400 * beyond EOF, then the page is guaranteed safe against truncation until we
2401 * unlock the page.
2402 */
2403int
2404block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2405 get_block_t get_block)
2406{
2407 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2408 unsigned long end;
2409 loff_t size;
2410 int ret = -EINVAL;
2411
2412 lock_page(page);
2413 size = i_size_read(inode);
2414 if ((page->mapping != inode->i_mapping) ||
Nick Piggin18336332007-07-20 00:31:45 -07002415 (page_offset(page) > size)) {
David Chinner54171692007-07-19 17:39:55 +10002416 /* page got truncated out from underneath us */
2417 goto out_unlock;
2418 }
2419
2420 /* page is wholly or partially inside EOF */
2421 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2422 end = size & ~PAGE_CACHE_MASK;
2423 else
2424 end = PAGE_CACHE_SIZE;
2425
2426 ret = block_prepare_write(page, 0, end, get_block);
2427 if (!ret)
2428 ret = block_commit_write(page, 0, end);
2429
2430out_unlock:
2431 unlock_page(page);
2432 return ret;
2433}
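
/*
 * Illustrative sketch (hypothetical "examplefs"): wiring this helper into a
 * file's mmap path.  At this point in time ->page_mkwrite receives the vma
 * and the page directly, so a one-line wrapper is enough.
 */
static int examplefs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	return block_page_mkwrite(vma, page, examplefs_get_block);
}

static struct vm_operations_struct examplefs_file_vm_ops_sketch = {
	.fault		= filemap_fault,
	.page_mkwrite	= examplefs_page_mkwrite,
};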
Linus Torvalds1da177e2005-04-16 15:20:36 -07002434
2435/*
Nick Piggin03158cd2007-10-16 01:25:25 -07002436 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
Linus Torvalds1da177e2005-04-16 15:20:36 -07002437 * immediately, while under the page lock. So it needs a special end_io
2438 * handler which does not touch the bh after unlocking it.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002439 */
2440static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2441{
Dmitry Monakhov68671f32007-10-16 01:24:47 -07002442 __end_buffer_read_notouch(bh, uptodate);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002443}
2444
2445/*
Nick Piggin03158cd2007-10-16 01:25:25 -07002446 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2447 * the page (converting it to circular linked list and taking care of page
2448 * dirty races).
2449 */
2450static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2451{
2452 struct buffer_head *bh;
2453
2454 BUG_ON(!PageLocked(page));
2455
2456 spin_lock(&page->mapping->private_lock);
2457 bh = head;
2458 do {
2459 if (PageDirty(page))
2460 set_buffer_dirty(bh);
2461 if (!bh->b_this_page)
2462 bh->b_this_page = head;
2463 bh = bh->b_this_page;
2464 } while (bh != head);
2465 attach_page_buffers(page, head);
2466 spin_unlock(&page->mapping->private_lock);
2467}
2468
2469/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002470 * On entry, the page is fully not uptodate.
2471 * On exit, the page is fully uptodate in the areas outside (from,to).
2472 */
Nick Piggin03158cd2007-10-16 01:25:25 -07002473int nobh_write_begin(struct file *file, struct address_space *mapping,
2474 loff_t pos, unsigned len, unsigned flags,
2475 struct page **pagep, void **fsdata,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002476 get_block_t *get_block)
2477{
Nick Piggin03158cd2007-10-16 01:25:25 -07002478 struct inode *inode = mapping->host;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002479 const unsigned blkbits = inode->i_blkbits;
2480 const unsigned blocksize = 1 << blkbits;
Nick Piggina4b06722007-10-16 01:24:48 -07002481 struct buffer_head *head, *bh;
Nick Piggin03158cd2007-10-16 01:25:25 -07002482 struct page *page;
2483 pgoff_t index;
2484 unsigned from, to;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002485 unsigned block_in_page;
Nick Piggina4b06722007-10-16 01:24:48 -07002486 unsigned block_start, block_end;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002487 sector_t block_in_file;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002488 int nr_reads = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002489 int ret = 0;
2490 int is_mapped_to_disk = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491
Nick Piggin03158cd2007-10-16 01:25:25 -07002492 index = pos >> PAGE_CACHE_SHIFT;
2493 from = pos & (PAGE_CACHE_SIZE - 1);
2494 to = from + len;
2495
2496 page = __grab_cache_page(mapping, index);
2497 if (!page)
2498 return -ENOMEM;
2499 *pagep = page;
2500 *fsdata = NULL;
2501
2502 if (page_has_buffers(page)) {
2503 unlock_page(page);
2504 page_cache_release(page);
2505 *pagep = NULL;
2506 return block_write_begin(file, mapping, pos, len, flags, pagep,
2507 fsdata, get_block);
2508 }
Nick Piggina4b06722007-10-16 01:24:48 -07002509
Linus Torvalds1da177e2005-04-16 15:20:36 -07002510 if (PageMappedToDisk(page))
2511 return 0;
2512
Nick Piggina4b06722007-10-16 01:24:48 -07002513 /*
2514 * Allocate buffers so that we can keep track of state, and potentially
2515 * attach them to the page if an error occurs. In the common case of
2516 * no error, they will just be freed again without ever being attached
2517 * to the page (which is all OK, because we're under the page lock).
2518 *
2519 * Be careful: the buffer linked list is a NULL terminated one, rather
2520 * than the circular one we're used to.
2521 */
2522 head = alloc_page_buffers(page, blocksize, 0);
Nick Piggin03158cd2007-10-16 01:25:25 -07002523 if (!head) {
2524 ret = -ENOMEM;
2525 goto out_release;
2526 }
Nick Piggina4b06722007-10-16 01:24:48 -07002527
Linus Torvalds1da177e2005-04-16 15:20:36 -07002528 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002529
2530 /*
2531 * We loop across all blocks in the page, whether or not they are
2532 * part of the affected region. This is so we can discover if the
2533 * page is fully mapped-to-disk.
2534 */
Nick Piggina4b06722007-10-16 01:24:48 -07002535 for (block_start = 0, block_in_page = 0, bh = head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002536 block_start < PAGE_CACHE_SIZE;
Nick Piggina4b06722007-10-16 01:24:48 -07002537 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002538 int create;
2539
Nick Piggina4b06722007-10-16 01:24:48 -07002540 block_end = block_start + blocksize;
2541 bh->b_state = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002542 create = 1;
2543 if (block_start >= to)
2544 create = 0;
2545 ret = get_block(inode, block_in_file + block_in_page,
Nick Piggina4b06722007-10-16 01:24:48 -07002546 bh, create);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002547 if (ret)
2548 goto failed;
Nick Piggina4b06722007-10-16 01:24:48 -07002549 if (!buffer_mapped(bh))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002550 is_mapped_to_disk = 0;
Nick Piggina4b06722007-10-16 01:24:48 -07002551 if (buffer_new(bh))
2552 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2553 if (PageUptodate(page)) {
2554 set_buffer_uptodate(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002555 continue;
Nick Piggina4b06722007-10-16 01:24:48 -07002556 }
2557 if (buffer_new(bh) || !buffer_mapped(bh)) {
Christoph Lametereebd2aa2008-02-04 22:28:29 -08002558 zero_user_segments(page, block_start, from,
2559 to, block_end);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002560 continue;
2561 }
Nick Piggina4b06722007-10-16 01:24:48 -07002562 if (buffer_uptodate(bh))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002563 continue; /* reiserfs does this */
2564 if (block_start < from || block_end > to) {
Nick Piggina4b06722007-10-16 01:24:48 -07002565 lock_buffer(bh);
2566 bh->b_end_io = end_buffer_read_nobh;
2567 submit_bh(READ, bh);
2568 nr_reads++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002569 }
2570 }
2571
2572 if (nr_reads) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002573 /*
2574 * The page is locked, so these buffers are protected from
2575 * any VM or truncate activity. Hence we don't need to care
2576 * for the buffer_head refcounts.
2577 */
Nick Piggina4b06722007-10-16 01:24:48 -07002578 for (bh = head; bh; bh = bh->b_this_page) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002579 wait_on_buffer(bh);
2580 if (!buffer_uptodate(bh))
2581 ret = -EIO;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002582 }
2583 if (ret)
2584 goto failed;
2585 }
2586
2587 if (is_mapped_to_disk)
2588 SetPageMappedToDisk(page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002589
Nick Piggin03158cd2007-10-16 01:25:25 -07002590 *fsdata = head; /* to be released by nobh_write_end */
Nick Piggina4b06722007-10-16 01:24:48 -07002591
Linus Torvalds1da177e2005-04-16 15:20:36 -07002592 return 0;
2593
2594failed:
Nick Piggin03158cd2007-10-16 01:25:25 -07002595 BUG_ON(!ret);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002596 /*
Nick Piggina4b06722007-10-16 01:24:48 -07002597 * Error recovery is a bit difficult. We need to zero out blocks that
2598 * were newly allocated, and dirty them to ensure they get written out.
2599 * Buffers need to be attached to the page at this point, otherwise
2600 * the handling of potential IO errors during writeout would be hard
2601 * (could try doing synchronous writeout, but what if that fails too?)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602 */
Nick Piggin03158cd2007-10-16 01:25:25 -07002603 attach_nobh_buffers(page, head);
2604 page_zero_new_buffers(page, from, to);
Nick Piggina4b06722007-10-16 01:24:48 -07002605
Nick Piggin03158cd2007-10-16 01:25:25 -07002606out_release:
2607 unlock_page(page);
2608 page_cache_release(page);
2609 *pagep = NULL;
Nick Piggina4b06722007-10-16 01:24:48 -07002610
Nick Piggin03158cd2007-10-16 01:25:25 -07002611 if (pos + len > inode->i_size)
2612 vmtruncate(inode, inode->i_size);
Nick Piggina4b06722007-10-16 01:24:48 -07002613
Linus Torvalds1da177e2005-04-16 15:20:36 -07002614 return ret;
2615}
Nick Piggin03158cd2007-10-16 01:25:25 -07002616EXPORT_SYMBOL(nobh_write_begin);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002617
Nick Piggin03158cd2007-10-16 01:25:25 -07002618int nobh_write_end(struct file *file, struct address_space *mapping,
2619 loff_t pos, unsigned len, unsigned copied,
2620 struct page *page, void *fsdata)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002621{
2622 struct inode *inode = page->mapping->host;
Nick Pigginefdc3132007-10-21 06:57:41 +02002623 struct buffer_head *head = fsdata;
Nick Piggin03158cd2007-10-16 01:25:25 -07002624 struct buffer_head *bh;
Dmitri Monakhov5b41e742008-03-28 14:15:52 -07002625 BUG_ON(fsdata != NULL && page_has_buffers(page));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002626
Dmitri Monakhov5b41e742008-03-28 14:15:52 -07002627 if (unlikely(copied < len) && !page_has_buffers(page))
2628 attach_nobh_buffers(page, head);
2629 if (page_has_buffers(page))
2630 return generic_write_end(file, mapping, pos, len,
2631 copied, page, fsdata);
Nick Piggina4b06722007-10-16 01:24:48 -07002632
Nick Piggin22c8ca72007-02-20 13:58:09 -08002633 SetPageUptodate(page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002634 set_page_dirty(page);
Nick Piggin03158cd2007-10-16 01:25:25 -07002635 if (pos+copied > inode->i_size) {
2636 i_size_write(inode, pos+copied);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002637 mark_inode_dirty(inode);
2638 }
Nick Piggin03158cd2007-10-16 01:25:25 -07002639
2640 unlock_page(page);
2641 page_cache_release(page);
2642
Nick Piggin03158cd2007-10-16 01:25:25 -07002643 while (head) {
2644 bh = head;
2645 head = head->b_this_page;
2646 free_buffer_head(bh);
2647 }
2648
2649 return copied;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002650}
Nick Piggin03158cd2007-10-16 01:25:25 -07002651EXPORT_SYMBOL(nobh_write_end);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002652
2653/*
2654 * nobh_writepage() - based on block_write_full_page() except
2655 * that it tries to operate without attaching bufferheads to
2656 * the page.
2657 */
2658int nobh_writepage(struct page *page, get_block_t *get_block,
2659 struct writeback_control *wbc)
2660{
2661 struct inode * const inode = page->mapping->host;
2662 loff_t i_size = i_size_read(inode);
2663 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2664 unsigned offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002665 int ret;
2666
2667 /* Is the page fully inside i_size? */
2668 if (page->index < end_index)
2669 goto out;
2670
2671 /* Is the page fully outside i_size? (truncate in progress) */
2672 offset = i_size & (PAGE_CACHE_SIZE-1);
2673 if (page->index >= end_index+1 || !offset) {
2674 /*
2675 * The page may have dirty, unmapped buffers. For example,
2676 * they may have been added in ext3_writepage(). Make them
2677 * freeable here, so the page does not leak.
2678 */
2679#if 0
2680 /* Not really sure about this - do we need this ? */
2681 if (page->mapping->a_ops->invalidatepage)
2682 page->mapping->a_ops->invalidatepage(page, offset);
2683#endif
2684 unlock_page(page);
2685 return 0; /* don't care */
2686 }
2687
2688 /*
2689 * The page straddles i_size. It must be zeroed out on each and every
2690 * writepage invocation because it may be mmapped. "A file is mapped
2691 * in multiples of the page size. For a file that is not a multiple of
2692 * the page size, the remaining memory is zeroed when mapped, and
2693 * writes to that region are not written out to the file."
2694 */
Christoph Lametereebd2aa2008-02-04 22:28:29 -08002695 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002696out:
2697 ret = mpage_writepage(page, get_block, wbc);
2698 if (ret == -EAGAIN)
2699 ret = __block_write_full_page(inode, page, get_block, wbc);
2700 return ret;
2701}
2702EXPORT_SYMBOL(nobh_writepage);
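
/*
 * Illustrative sketch (hypothetical "examplefs"): a filesystem that opts into
 * the nobh variants pairs nobh_write_begin()/nobh_write_end() and routes
 * ->writepage through nobh_writepage(), while ->readpage stays on the
 * ordinary buffer-head path sketched earlier.
 */
static int examplefs_nobh_write_begin(struct file *file,
				      struct address_space *mapping,
				      loff_t pos, unsigned len, unsigned flags,
				      struct page **pagep, void **fsdata)
{
	return nobh_write_begin(file, mapping, pos, len, flags,
				pagep, fsdata, examplefs_get_block);
}

static int examplefs_nobh_writepage(struct page *page,
				    struct writeback_control *wbc)
{
	return nobh_writepage(page, examplefs_get_block, wbc);
}

static const struct address_space_operations examplefs_nobh_aops_sketch = {
	.readpage	= examplefs_readpage,
	.write_begin	= examplefs_nobh_write_begin,
	.write_end	= nobh_write_end,
	.writepage	= examplefs_nobh_writepage,
};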
2703
Nick Piggin03158cd2007-10-16 01:25:25 -07002704int nobh_truncate_page(struct address_space *mapping,
2705 loff_t from, get_block_t *get_block)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002706{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002707 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2708 unsigned offset = from & (PAGE_CACHE_SIZE-1);
Nick Piggin03158cd2007-10-16 01:25:25 -07002709 unsigned blocksize;
2710 sector_t iblock;
2711 unsigned length, pos;
2712 struct inode *inode = mapping->host;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002713 struct page *page;
Nick Piggin03158cd2007-10-16 01:25:25 -07002714 struct buffer_head map_bh;
2715 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002716
Nick Piggin03158cd2007-10-16 01:25:25 -07002717 blocksize = 1 << inode->i_blkbits;
2718 length = offset & (blocksize - 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002719
Nick Piggin03158cd2007-10-16 01:25:25 -07002720 /* Block boundary? Nothing to do */
2721 if (!length)
2722 return 0;
2723
2724 length = blocksize - length;
2725 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2726
Linus Torvalds1da177e2005-04-16 15:20:36 -07002727 page = grab_cache_page(mapping, index);
Nick Piggin03158cd2007-10-16 01:25:25 -07002728 err = -ENOMEM;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002729 if (!page)
2730 goto out;
2731
Nick Piggin03158cd2007-10-16 01:25:25 -07002732 if (page_has_buffers(page)) {
2733has_buffers:
2734 unlock_page(page);
2735 page_cache_release(page);
2736 return block_truncate_page(mapping, from, get_block);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002737 }
Nick Piggin03158cd2007-10-16 01:25:25 -07002738
2739 /* Find the buffer that contains "offset" */
2740 pos = blocksize;
2741 while (offset >= pos) {
2742 iblock++;
2743 pos += blocksize;
2744 }
2745
2746 err = get_block(inode, iblock, &map_bh, 0);
2747 if (err)
2748 goto unlock;
2749 /* unmapped? It's a hole - nothing to do */
2750 if (!buffer_mapped(&map_bh))
2751 goto unlock;
2752
2753 /* Ok, it's mapped. Make sure it's up-to-date */
2754 if (!PageUptodate(page)) {
2755 err = mapping->a_ops->readpage(NULL, page);
2756 if (err) {
2757 page_cache_release(page);
2758 goto out;
2759 }
2760 lock_page(page);
2761 if (!PageUptodate(page)) {
2762 err = -EIO;
2763 goto unlock;
2764 }
2765 if (page_has_buffers(page))
2766 goto has_buffers;
2767 }
Christoph Lametereebd2aa2008-02-04 22:28:29 -08002768 zero_user(page, offset, length);
Nick Piggin03158cd2007-10-16 01:25:25 -07002769 set_page_dirty(page);
2770 err = 0;
2771
2772unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002773 unlock_page(page);
2774 page_cache_release(page);
2775out:
Nick Piggin03158cd2007-10-16 01:25:25 -07002776 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002777}
2778EXPORT_SYMBOL(nobh_truncate_page);
2779
2780int block_truncate_page(struct address_space *mapping,
2781 loff_t from, get_block_t *get_block)
2782{
2783 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2784 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2785 unsigned blocksize;
Andrew Morton54b21a72006-01-08 01:03:05 -08002786 sector_t iblock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002787 unsigned length, pos;
2788 struct inode *inode = mapping->host;
2789 struct page *page;
2790 struct buffer_head *bh;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002791 int err;
2792
2793 blocksize = 1 << inode->i_blkbits;
2794 length = offset & (blocksize - 1);
2795
2796 /* Block boundary? Nothing to do */
2797 if (!length)
2798 return 0;
2799
2800 length = blocksize - length;
Andrew Morton54b21a72006-01-08 01:03:05 -08002801 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002802
2803 page = grab_cache_page(mapping, index);
2804 err = -ENOMEM;
2805 if (!page)
2806 goto out;
2807
2808 if (!page_has_buffers(page))
2809 create_empty_buffers(page, blocksize, 0);
2810
2811 /* Find the buffer that contains "offset" */
2812 bh = page_buffers(page);
2813 pos = blocksize;
2814 while (offset >= pos) {
2815 bh = bh->b_this_page;
2816 iblock++;
2817 pos += blocksize;
2818 }
2819
2820 err = 0;
2821 if (!buffer_mapped(bh)) {
Badari Pulavartyb0cf2322006-03-26 01:38:00 -08002822 WARN_ON(bh->b_size != blocksize);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002823 err = get_block(inode, iblock, bh, 0);
2824 if (err)
2825 goto unlock;
2826 /* unmapped? It's a hole - nothing to do */
2827 if (!buffer_mapped(bh))
2828 goto unlock;
2829 }
2830
2831 /* Ok, it's mapped. Make sure it's up-to-date */
2832 if (PageUptodate(page))
2833 set_buffer_uptodate(bh);
2834
David Chinner33a266d2007-02-12 00:51:41 -08002835 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002836 err = -EIO;
2837 ll_rw_block(READ, 1, &bh);
2838 wait_on_buffer(bh);
2839 /* Uhhuh. Read error. Complain and punt. */
2840 if (!buffer_uptodate(bh))
2841 goto unlock;
2842 }
2843
Christoph Lametereebd2aa2008-02-04 22:28:29 -08002844 zero_user(page, offset, length);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002845 mark_buffer_dirty(bh);
2846 err = 0;
2847
2848unlock:
2849 unlock_page(page);
2850 page_cache_release(page);
2851out:
2852 return err;
2853}
2854
2855/*
2856 * The generic ->writepage function for buffer-backed address_spaces
2857 */
2858int block_write_full_page(struct page *page, get_block_t *get_block,
2859 struct writeback_control *wbc)
2860{
2861 struct inode * const inode = page->mapping->host;
2862 loff_t i_size = i_size_read(inode);
2863 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2864 unsigned offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002865
2866 /* Is the page fully inside i_size? */
2867 if (page->index < end_index)
2868 return __block_write_full_page(inode, page, get_block, wbc);
2869
2870 /* Is the page fully outside i_size? (truncate in progress) */
2871 offset = i_size & (PAGE_CACHE_SIZE-1);
2872 if (page->index >= end_index+1 || !offset) {
2873 /*
2874 * The page may have dirty, unmapped buffers. For example,
2875 * they may have been added in ext3_writepage(). Make them
2876 * freeable here, so the page does not leak.
2877 */
Jan Karaaaa40592005-10-30 15:00:16 -08002878 do_invalidatepage(page, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002879 unlock_page(page);
2880 return 0; /* don't care */
2881 }
2882
2883 /*
2884 * The page straddles i_size. It must be zeroed out on each and every
2885 * writepage invocation because it may be mmapped. "A file is mapped
2886 * in multiples of the page size. For a file that is not a multiple of
2887 * the page size, the remaining memory is zeroed when mapped, and
2888 * writes to that region are not written out to the file."
2889 */
Christoph Lametereebd2aa2008-02-04 22:28:29 -08002890 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002891 return __block_write_full_page(inode, page, get_block, wbc);
2892}
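
/*
 * Illustrative sketch (hypothetical "examplefs"): the ordinary buffer-head
 * ->writepage is just this helper with the filesystem's get_block plugged in.
 */
static int examplefs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, examplefs_get_block, wbc);
}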
2893
2894sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2895 get_block_t *get_block)
2896{
2897 struct buffer_head tmp;
2898 struct inode *inode = mapping->host;
2899 tmp.b_state = 0;
2900 tmp.b_blocknr = 0;
Badari Pulavartyb0cf2322006-03-26 01:38:00 -08002901 tmp.b_size = 1 << inode->i_blkbits;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002902 get_block(inode, block, &tmp, 0);
2903 return tmp.b_blocknr;
2904}
2905
NeilBrown6712ecf2007-09-27 12:47:43 +02002906static void end_bio_bh_io_sync(struct bio *bio, int err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002907{
2908 struct buffer_head *bh = bio->bi_private;
2909
Linus Torvalds1da177e2005-04-16 15:20:36 -07002910 if (err == -EOPNOTSUPP) {
2911 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2912 set_bit(BH_Eopnotsupp, &bh->b_state);
2913 }
2914
2915 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2916 bio_put(bio);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002917}
2918
2919int submit_bh(int rw, struct buffer_head * bh)
2920{
2921 struct bio *bio;
2922 int ret = 0;
2923
2924 BUG_ON(!buffer_locked(bh));
2925 BUG_ON(!buffer_mapped(bh));
2926 BUG_ON(!bh->b_end_io);
2927
Jens Axboe48fd4f92008-08-22 10:00:36 +02002928 /*
2929 * Mask in barrier bit for a write (could be either a WRITE or a
2930 * WRITE_SYNC).
2931 */
2932 if (buffer_ordered(bh) && (rw & WRITE))
2933 rw |= WRITE_BARRIER;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002934
2935 /*
Jens Axboe48fd4f92008-08-22 10:00:36 +02002936 * Only clear out a write error when rewriting
Linus Torvalds1da177e2005-04-16 15:20:36 -07002937 */
Jens Axboe48fd4f92008-08-22 10:00:36 +02002938 if (test_set_buffer_req(bh) && (rw & WRITE))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002939 clear_buffer_write_io_error(bh);
2940
2941 /*
2942 * from here on down, it's all bio -- do the initial mapping,
2943 * submit_bio -> generic_make_request may further map this bio around
2944 */
2945 bio = bio_alloc(GFP_NOIO, 1);
2946
2947 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2948 bio->bi_bdev = bh->b_bdev;
2949 bio->bi_io_vec[0].bv_page = bh->b_page;
2950 bio->bi_io_vec[0].bv_len = bh->b_size;
2951 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2952
2953 bio->bi_vcnt = 1;
2954 bio->bi_idx = 0;
2955 bio->bi_size = bh->b_size;
2956
2957 bio->bi_end_io = end_bio_bh_io_sync;
2958 bio->bi_private = bh;
2959
2960 bio_get(bio);
2961 submit_bio(rw, bio);
2962
2963 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2964 ret = -EOPNOTSUPP;
2965
2966 bio_put(bio);
2967 return ret;
2968}
2969
2970/**
2971 * ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Karaa7662232005-09-06 15:19:10 -07002972 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002973 * @nr: number of &struct buffer_heads in the array
2974 * @bhs: array of pointers to &struct buffer_head
2975 *
Jan Karaa7662232005-09-06 15:19:10 -07002976 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2977 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2978 * %SWRITE is like %WRITE except that it makes sure the *current* data in the
2979 * buffers is sent to disk. The fourth, %READA, is described in the documentation
2980 * for generic_make_request() which ll_rw_block() calls.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002981 *
2982 * This function drops any buffer that it cannot get a lock on (with the
Jan Karaa7662232005-09-06 15:19:10 -07002983 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2984 * clean when doing a write request, and any buffer that appears to be
2985 * up-to-date when doing read request. Further it marks as clean buffers that
2986 * are processed for writing (the buffer cache won't assume that they are
2987 * actually clean until the buffer gets unlocked).
Linus Torvalds1da177e2005-04-16 15:20:36 -07002988 *
2989 * ll_rw_block sets b_end_io to a simple completion handler that marks
2990 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2991 * any waiters.
2992 *
2993 * All of the buffers must be for the same device, and must also be a
2994 * multiple of the current approved size for the device.
2995 */
2996void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2997{
2998 int i;
2999
3000 for (i = 0; i < nr; i++) {
3001 struct buffer_head *bh = bhs[i];
3002
Jens Axboe18ce3752008-07-01 09:07:34 +02003003 if (rw == SWRITE || rw == SWRITE_SYNC)
Jan Karaa7662232005-09-06 15:19:10 -07003004 lock_buffer(bh);
Nick Pigginca5de402008-08-02 12:02:13 +02003005 else if (!trylock_buffer(bh))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003006 continue;
3007
Jens Axboe18ce3752008-07-01 09:07:34 +02003008 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003009 if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org76c30732005-04-16 15:24:07 -07003010 bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumie60e5c52006-02-03 03:04:43 -08003011 get_bh(bh);
Jens Axboe18ce3752008-07-01 09:07:34 +02003012 if (rw == SWRITE_SYNC)
3013 submit_bh(WRITE_SYNC, bh);
3014 else
3015 submit_bh(WRITE, bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003016 continue;
3017 }
3018 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003019 if (!buffer_uptodate(bh)) {
akpm@osdl.org76c30732005-04-16 15:24:07 -07003020 bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumie60e5c52006-02-03 03:04:43 -08003021 get_bh(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003022 submit_bh(rw, bh);
3023 continue;
3024 }
3025 }
3026 unlock_buffer(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003027 }
3028}
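
/*
 * Illustrative sketch: the classic "start the read, then wait" pattern that
 * the documentation above describes.  __bread()/sb_bread() already package
 * this up; the sketch only spells out the steps.
 */
static struct buffer_head *example_read_block_sketch(struct block_device *bdev,
						     sector_t block,
						     unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	if (bh && !buffer_uptodate(bh)) {
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			brelse(bh);
			return NULL;
		}
	}
	return bh;
}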
3029
3030/*
3031 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3032 * and then start new I/O and then wait upon it. The caller must have a ref on
3033 * the buffer_head.
3034 */
3035int sync_dirty_buffer(struct buffer_head *bh)
3036{
3037 int ret = 0;
3038
3039 WARN_ON(atomic_read(&bh->b_count) < 1);
3040 lock_buffer(bh);
3041 if (test_clear_buffer_dirty(bh)) {
3042 get_bh(bh);
3043 bh->b_end_io = end_buffer_write_sync;
Jens Axboe18ce3752008-07-01 09:07:34 +02003044 ret = submit_bh(WRITE_SYNC, bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003045 wait_on_buffer(bh);
3046 if (buffer_eopnotsupp(bh)) {
3047 clear_buffer_eopnotsupp(bh);
3048 ret = -EOPNOTSUPP;
3049 }
3050 if (!ret && !buffer_uptodate(bh))
3051 ret = -EIO;
3052 } else {
3053 unlock_buffer(bh);
3054 }
3055 return ret;
3056}
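
/*
 * Illustrative sketch: the usual "modify a metadata block and write it back
 * synchronously" sequence this helper exists for.  The block number and the
 * update itself are placeholders.
 */
static int example_update_block_sketch(struct super_block *sb, sector_t block)
{
	struct buffer_head *bh = sb_bread(sb, block);
	int err;

	if (!bh)
		return -EIO;
	/* ... modify bh->b_data here ... */
	mark_buffer_dirty(bh);
	err = sync_dirty_buffer(bh);
	brelse(bh);
	return err;
}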

/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and releases them if so.
 *
 * Exclusion against try_to_free_buffers may be obtained by either
 * locking the page or by holding its mapping's private_lock.
 *
 * If the page is dirty but all the buffers are clean then we need to
 * be sure to mark the page clean as well.  This is because the page
 * may be against a block device, and a later reattachment of buffers
 * to a dirty page will set *all* buffers dirty, which would corrupt
 * filesystem data on the same device.
 *
 * The same applies to regular filesystem pages: if all the buffers are
 * clean then we set the page clean and proceed.  To do that, we require
 * total exclusion from __set_page_dirty_buffers().  That is obtained with
 * private_lock.
 *
 * try_to_free_buffers() is non-blocking.
 */
static inline int buffer_busy(struct buffer_head *bh)
{
	return atomic_read(&bh->b_count) |
		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}

static int
drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
{
	struct buffer_head *head = page_buffers(page);
	struct buffer_head *bh;

	bh = head;
	do {
		if (buffer_write_io_error(bh) && page->mapping)
			set_bit(AS_EIO, &page->mapping->flags);
		if (buffer_busy(bh))
			goto failed;
		bh = bh->b_this_page;
	} while (bh != head);

	do {
		struct buffer_head *next = bh->b_this_page;

		if (bh->b_assoc_map)
			__remove_assoc_queue(bh);
		bh = next;
	} while (bh != head);
	*buffers_to_free = head;
	__clear_page_buffers(page);
	return 1;
failed:
	return 0;
}

int try_to_free_buffers(struct page *page)
{
	struct address_space * const mapping = page->mapping;
	struct buffer_head *buffers_to_free = NULL;
	int ret = 0;

	BUG_ON(!PageLocked(page));
	if (PageWriteback(page))
		return 0;

	if (mapping == NULL) {		/* can this still happen? */
		ret = drop_buffers(page, &buffers_to_free);
		goto out;
	}

	spin_lock(&mapping->private_lock);
	ret = drop_buffers(page, &buffers_to_free);

	/*
	 * If the filesystem writes its buffers by hand (e.g. ext3)
	 * then we can have clean buffers against a dirty page.  We
	 * clean the page here; otherwise the VM will never notice
	 * that the filesystem did any IO at all.
	 *
	 * Also, during truncate, discard_buffer will have marked all
	 * the page's buffers clean.  We discover that here and clean
	 * the page also.
	 *
	 * private_lock must be held over this entire operation in order
	 * to synchronise against __set_page_dirty_buffers and prevent the
	 * dirty bit from being lost.
	 */
	if (ret)
		cancel_dirty_page(page, PAGE_CACHE_SIZE);
	spin_unlock(&mapping->private_lock);
out:
	if (buffers_to_free) {
		struct buffer_head *bh = buffers_to_free;

		do {
			struct buffer_head *next = bh->b_this_page;
			free_buffer_head(bh);
			bh = next;
		} while (bh != buffers_to_free);
	}
	return ret;
}
EXPORT_SYMBOL(try_to_free_buffers);
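
/*
 * Editor's note: an illustrative sketch, not part of buffer.c, of the usual
 * caller -- a filesystem's ->releasepage() method, which the VM invokes with
 * the page locked, satisfying the exclusion rule documented above.  A real
 * filesystem (ext3, for instance) would first make sure none of the buffers
 * are still pinned by its journal; that check is elided here.
 */
#if 0
static int example_releasepage(struct page *page, gfp_t gfp_mask)
{
	/* The page lock is held by the caller, so this cannot race with us. */
	return try_to_free_buffers(page);
}
#endif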

void block_sync_page(struct page *page)
{
	struct address_space *mapping;

	smp_mb();
	mapping = page_mapping(page);
	if (mapping)
		blk_run_backing_dev(mapping->backing_dev_info, page);
}

/*
 * There are no bdflush tunables left.  But distributions are
 * still running obsolete flush daemons, so we terminate them here.
 *
 * Use of bdflush() is deprecated and will be removed in a future kernel.
 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
 */
asmlinkage long sys_bdflush(int func, long data)
{
	static int msg_count;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (msg_count < 5) {
		msg_count++;
		printk(KERN_INFO
			"warning: process `%s' used the obsolete bdflush"
			" system call\n", current->comm);
		printk(KERN_INFO "Fix your initscripts?\n");
	}

	if (func == 1)
		do_exit(0);
	return 0;
}

/*
 * Buffer-head allocation
 */
static struct kmem_cache *bh_cachep;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
static int max_buffer_heads;

int buffer_heads_over_limit;

struct bh_accounting {
	int nr;			/* Number of live bh's */
	int ratelimit;		/* Limit cacheline bouncing */
};

static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};

static void recalc_bh_state(void)
{
	int i;
	int tot = 0;

	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
		return;
	__get_cpu_var(bh_accounting).ratelimit = 0;
	for_each_online_cpu(i)
		tot += per_cpu(bh_accounting, i).nr;
	buffer_heads_over_limit = (tot > max_buffer_heads);
}

struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
	if (ret) {
		INIT_LIST_HEAD(&ret->b_assoc_buffers);
		get_cpu_var(bh_accounting).nr++;
		recalc_bh_state();
		put_cpu_var(bh_accounting);
	}
	return ret;
}
EXPORT_SYMBOL(alloc_buffer_head);

void free_buffer_head(struct buffer_head *bh)
{
	BUG_ON(!list_empty(&bh->b_assoc_buffers));
	kmem_cache_free(bh_cachep, bh);
	get_cpu_var(bh_accounting).nr--;
	recalc_bh_state();
	put_cpu_var(bh_accounting);
}
EXPORT_SYMBOL(free_buffer_head);

static void buffer_exit_cpu(int cpu)
{
	int i;
	struct bh_lru *b = &per_cpu(bh_lrus, cpu);

	for (i = 0; i < BH_LRU_SIZE; i++) {
		brelse(b->bhs[i]);
		b->bhs[i] = NULL;
	}
	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
	per_cpu(bh_accounting, cpu).nr = 0;
	put_cpu_var(bh_accounting);
}

static int buffer_cpu_notify(struct notifier_block *self,
				unsigned long action, void *hcpu)
{
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
		buffer_exit_cpu((unsigned long)hcpu);
	return NOTIFY_OK;
}
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Returns 1 if the buffer is up-to-date; otherwise returns 0
 * with the buffer locked.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		if (!buffer_uptodate(bh))
			return 0;
		unlock_buffer(bh);
	}
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
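
/*
 * Editor's note: illustrative sketch, not part of buffer.c.  A return of 0
 * means the buffer is now locked; the caller must either submit a read
 * (typically via bh_submit_read() below) or unlock the buffer itself, as on
 * the abort path in this hypothetical helper.
 */
#if 0
static int example_is_uptodate(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 1;		/* up to date; buffer not locked */

	/* Not up to date: we now own the buffer lock and must drop it. */
	unlock_buffer(bh);
	return 0;
}
#endif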

/**
 * bh_submit_read - Submit a locked buffer for reading
 * @bh: struct buffer_head
 *
 * Returns zero on success and -EIO on error.
 */
int bh_submit_read(struct buffer_head *bh)
{
	BUG_ON(!buffer_locked(bh));

	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}

	get_bh(bh);
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return 0;
	return -EIO;
}
EXPORT_SYMBOL(bh_submit_read);
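
/*
 * Editor's note: an illustrative sketch, not part of buffer.c, showing how
 * the two helpers above combine into a simple synchronous block read.  The
 * helper name is hypothetical and error handling is minimal.
 */
#if 0
static struct buffer_head *example_read_block(struct block_device *bdev,
					      sector_t block, int size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	if (!bh)
		return NULL;
	if (bh_uptodate_or_lock(bh))
		return bh;			/* already cached and valid */
	if (bh_submit_read(bh)) {		/* consumes the lock, waits */
		brelse(bh);
		return NULL;
	}
	return bh;
}
#endif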

static void
init_buffer_head(void *data)
{
	struct buffer_head *bh = data;

	memset(bh, 0, sizeof(*bh));
	INIT_LIST_HEAD(&bh->b_assoc_buffers);
}

void __init buffer_init(void)
{
	int nrpages;

	bh_cachep = kmem_cache_create("buffer_head",
			sizeof(struct buffer_head), 0,
				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
				SLAB_MEM_SPREAD),
				init_buffer_head);

	/*
	 * Limit the bh occupancy to 10% of ZONE_NORMAL
	 */
	nrpages = (nr_free_buffer_pages() * 10) / 100;
	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
	hotcpu_notifier(buffer_cpu_notify, 0);
}

EXPORT_SYMBOL(__bforget);
EXPORT_SYMBOL(__brelse);
EXPORT_SYMBOL(__wait_on_buffer);
EXPORT_SYMBOL(block_commit_write);
EXPORT_SYMBOL(block_prepare_write);
EXPORT_SYMBOL(block_page_mkwrite);
EXPORT_SYMBOL(block_read_full_page);
EXPORT_SYMBOL(block_sync_page);
EXPORT_SYMBOL(block_truncate_page);
EXPORT_SYMBOL(block_write_full_page);
EXPORT_SYMBOL(cont_write_begin);
EXPORT_SYMBOL(end_buffer_read_sync);
EXPORT_SYMBOL(end_buffer_write_sync);
EXPORT_SYMBOL(file_fsync);
EXPORT_SYMBOL(fsync_bdev);
EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_cont_expand_simple);
EXPORT_SYMBOL(init_buffer);
EXPORT_SYMBOL(invalidate_bdev);
EXPORT_SYMBOL(ll_rw_block);
EXPORT_SYMBOL(mark_buffer_dirty);
EXPORT_SYMBOL(submit_bh);
EXPORT_SYMBOL(sync_dirty_buffer);
EXPORT_SYMBOL(unlock_buffer);