blob: 6d77ce9f54e52031ef23d50643bcce772ed20c65 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
6
7/*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 *
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 *
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
15 *
16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */
20
21#include <linux/config.h>
22#include <linux/kernel.h>
23#include <linux/syscalls.h>
24#include <linux/fs.h>
25#include <linux/mm.h>
26#include <linux/percpu.h>
27#include <linux/slab.h>
28#include <linux/smp_lock.h>
Randy Dunlap16f7e0f2006-01-11 12:17:46 -080029#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070030#include <linux/blkdev.h>
31#include <linux/file.h>
32#include <linux/quotaops.h>
33#include <linux/highmem.h>
34#include <linux/module.h>
35#include <linux/writeback.h>
36#include <linux/hash.h>
37#include <linux/suspend.h>
38#include <linux/buffer_head.h>
39#include <linux/bio.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/bitops.h>
43#include <linux/mpage.h>
Ingo Molnarfb1c8f92005-09-10 00:25:56 -070044#include <linux/bit_spinlock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070045
46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47static void invalidate_bh_lrus(void);
48
49#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
50
51inline void
52init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
53{
54 bh->b_end_io = handler;
55 bh->b_private = private;
56}
57
58static int sync_buffer(void *word)
59{
60 struct block_device *bd;
61 struct buffer_head *bh
62 = container_of(word, struct buffer_head, b_state);
63
64 smp_mb();
65 bd = bh->b_bdev;
66 if (bd)
67 blk_run_address_space(bd->bd_inode->i_mapping);
68 io_schedule();
69 return 0;
70}
71
72void fastcall __lock_buffer(struct buffer_head *bh)
73{
74 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
75 TASK_UNINTERRUPTIBLE);
76}
77EXPORT_SYMBOL(__lock_buffer);
78
79void fastcall unlock_buffer(struct buffer_head *bh)
80{
81 clear_buffer_locked(bh);
82 smp_mb__after_clear_bit();
83 wake_up_bit(&bh->b_state, BH_Lock);
84}
85
86/*
87 * Block until a buffer comes unlocked. This doesn't stop it
88 * from becoming locked again - you have to lock it yourself
89 * if you want to preserve its state.
90 */
91void __wait_on_buffer(struct buffer_head * bh)
92{
93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
94}
95
96static void
97__clear_page_buffers(struct page *page)
98{
99 ClearPagePrivate(page);
Hugh Dickins4c21e2f2005-10-29 18:16:40 -0700100 set_page_private(page, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700101 page_cache_release(page);
102}
103
104static void buffer_io_error(struct buffer_head *bh)
105{
106 char b[BDEVNAME_SIZE];
107
108 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
109 bdevname(bh->b_bdev, b),
110 (unsigned long long)bh->b_blocknr);
111}
112
113/*
114 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
115 * unlock the buffer. This is what ll_rw_block uses too.
116 */
117void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
118{
119 if (uptodate) {
120 set_buffer_uptodate(bh);
121 } else {
122 /* This happens, due to failed READA attempts. */
123 clear_buffer_uptodate(bh);
124 }
125 unlock_buffer(bh);
126 put_bh(bh);
127}
128
129void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
130{
131 char b[BDEVNAME_SIZE];
132
133 if (uptodate) {
134 set_buffer_uptodate(bh);
135 } else {
136 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
137 buffer_io_error(bh);
138 printk(KERN_WARNING "lost page write due to "
139 "I/O error on %s\n",
140 bdevname(bh->b_bdev, b));
141 }
142 set_buffer_write_io_error(bh);
143 clear_buffer_uptodate(bh);
144 }
145 unlock_buffer(bh);
146 put_bh(bh);
147}
148
149/*
150 * Write out and wait upon all the dirty data associated with a block
151 * device via its mapping. Does not take the superblock lock.
152 */
153int sync_blockdev(struct block_device *bdev)
154{
155 int ret = 0;
156
OGAWA Hirofumi28fd1292006-01-08 01:02:14 -0800157 if (bdev)
158 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700159 return ret;
160}
161EXPORT_SYMBOL(sync_blockdev);
162
163/*
164 * Write out and wait upon all dirty data associated with this
165 * superblock. Filesystem data as well as the underlying block
166 * device. Takes the superblock lock.
167 */
168int fsync_super(struct super_block *sb)
169{
170 sync_inodes_sb(sb, 0);
171 DQUOT_SYNC(sb);
172 lock_super(sb);
173 if (sb->s_dirt && sb->s_op->write_super)
174 sb->s_op->write_super(sb);
175 unlock_super(sb);
176 if (sb->s_op->sync_fs)
177 sb->s_op->sync_fs(sb, 1);
178 sync_blockdev(sb->s_bdev);
179 sync_inodes_sb(sb, 1);
180
181 return sync_blockdev(sb->s_bdev);
182}
183
184/*
185 * Write out and wait upon all dirty data associated with this
186 * device. Filesystem data as well as the underlying block
187 * device. Takes the superblock lock.
188 */
189int fsync_bdev(struct block_device *bdev)
190{
191 struct super_block *sb = get_super(bdev);
192 if (sb) {
193 int res = fsync_super(sb);
194 drop_super(sb);
195 return res;
196 }
197 return sync_blockdev(bdev);
198}
199
200/**
201 * freeze_bdev -- lock a filesystem and force it into a consistent state
202 * @bdev: blockdevice to lock
203 *
Arjan van de Venc039e312006-03-23 03:00:28 -0800204 * This takes the block device bd_mount_mutex to make sure no new mounts
Linus Torvalds1da177e2005-04-16 15:20:36 -0700205 * happen on bdev until thaw_bdev() is called.
206 * If a superblock is found on this device, we take the s_umount semaphore
207 * on it to make sure nobody unmounts until the snapshot creation is done.
208 */
209struct super_block *freeze_bdev(struct block_device *bdev)
210{
211 struct super_block *sb;
212
Arjan van de Venc039e312006-03-23 03:00:28 -0800213 mutex_lock(&bdev->bd_mount_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700214 sb = get_super(bdev);
215 if (sb && !(sb->s_flags & MS_RDONLY)) {
216 sb->s_frozen = SB_FREEZE_WRITE;
akpm@osdl.orgd59dd462005-05-01 08:58:47 -0700217 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700218
219 sync_inodes_sb(sb, 0);
220 DQUOT_SYNC(sb);
221
222 lock_super(sb);
223 if (sb->s_dirt && sb->s_op->write_super)
224 sb->s_op->write_super(sb);
225 unlock_super(sb);
226
227 if (sb->s_op->sync_fs)
228 sb->s_op->sync_fs(sb, 1);
229
230 sync_blockdev(sb->s_bdev);
231 sync_inodes_sb(sb, 1);
232
233 sb->s_frozen = SB_FREEZE_TRANS;
akpm@osdl.orgd59dd462005-05-01 08:58:47 -0700234 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700235
236 sync_blockdev(sb->s_bdev);
237
238 if (sb->s_op->write_super_lockfs)
239 sb->s_op->write_super_lockfs(sb);
240 }
241
242 sync_blockdev(bdev);
243 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
244}
245EXPORT_SYMBOL(freeze_bdev);
246
247/**
248 * thaw_bdev -- unlock filesystem
249 * @bdev: blockdevice to unlock
250 * @sb: associated superblock
251 *
252 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
253 */
254void thaw_bdev(struct block_device *bdev, struct super_block *sb)
255{
256 if (sb) {
257 BUG_ON(sb->s_bdev != bdev);
258
259 if (sb->s_op->unlockfs)
260 sb->s_op->unlockfs(sb);
261 sb->s_frozen = SB_UNFROZEN;
akpm@osdl.orgd59dd462005-05-01 08:58:47 -0700262 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263 wake_up(&sb->s_wait_unfrozen);
264 drop_super(sb);
265 }
266
Arjan van de Venc039e312006-03-23 03:00:28 -0800267 mutex_unlock(&bdev->bd_mount_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268}
269EXPORT_SYMBOL(thaw_bdev);
270
271/*
272 * sync everything. Start out by waking pdflush, because that writes back
273 * all queues in parallel.
274 */
275static void do_sync(unsigned long wait)
276{
Pekka J Enberg687a21c2005-06-28 20:44:55 -0700277 wakeup_pdflush(0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278 sync_inodes(0); /* All mappings, inodes and their blockdevs */
279 DQUOT_SYNC(NULL);
280 sync_supers(); /* Write the superblocks */
281 sync_filesystems(0); /* Start syncing the filesystems */
282 sync_filesystems(wait); /* Waitingly sync the filesystems */
283 sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
284 if (!wait)
285 printk("Emergency Sync complete\n");
286 if (unlikely(laptop_mode))
287 laptop_sync_completion();
288}
289
290asmlinkage long sys_sync(void)
291{
292 do_sync(1);
293 return 0;
294}
295
296void emergency_sync(void)
297{
298 pdflush_operation(do_sync, 0);
299}
300
301/*
302 * Generic function to fsync a file.
303 *
304 * filp may be NULL if called via the msync of a vma.
305 */
306
307int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
308{
309 struct inode * inode = dentry->d_inode;
310 struct super_block * sb;
311 int ret, err;
312
313 /* sync the inode to buffers */
314 ret = write_inode_now(inode, 0);
315
316 /* sync the superblock to buffers */
317 sb = inode->i_sb;
318 lock_super(sb);
319 if (sb->s_op->write_super)
320 sb->s_op->write_super(sb);
321 unlock_super(sb);
322
323 /* .. finally sync the buffers to disk */
324 err = sync_blockdev(sb->s_bdev);
325 if (!ret)
326 ret = err;
327 return ret;
328}
329
Andrew Morton18e79b42006-03-24 03:18:14 -0800330long do_fsync(struct file *file, int datasync)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700331{
Andrew Morton18e79b42006-03-24 03:18:14 -0800332 int ret;
333 int err;
334 struct address_space *mapping = file->f_mapping;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335
Linus Torvalds1da177e2005-04-16 15:20:36 -0700336 if (!file->f_op || !file->f_op->fsync) {
337 /* Why? We can still call filemap_fdatawrite */
Andrew Morton18e79b42006-03-24 03:18:14 -0800338 ret = -EINVAL;
339 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700340 }
341
342 current->flags |= PF_SYNCWRITE;
343 ret = filemap_fdatawrite(mapping);
344
345 /*
Andrew Morton18e79b42006-03-24 03:18:14 -0800346 * We need to protect against concurrent writers, which could cause
347 * livelocks in fsync_buffers_list().
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348 */
Jes Sorensen1b1dcc12006-01-09 15:59:24 -0800349 mutex_lock(&mapping->host->i_mutex);
Oleg Nesterovdfb388b2005-06-23 00:10:02 -0700350 err = file->f_op->fsync(file, file->f_dentry, datasync);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351 if (!ret)
352 ret = err;
Jes Sorensen1b1dcc12006-01-09 15:59:24 -0800353 mutex_unlock(&mapping->host->i_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700354 err = filemap_fdatawait(mapping);
355 if (!ret)
356 ret = err;
357 current->flags &= ~PF_SYNCWRITE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700358out:
359 return ret;
360}
361
Andrew Morton18e79b42006-03-24 03:18:14 -0800362static long __do_fsync(unsigned int fd, int datasync)
363{
364 struct file *file;
365 int ret = -EBADF;
366
367 file = fget(fd);
368 if (file) {
369 ret = do_fsync(file, datasync);
370 fput(file);
371 }
372 return ret;
373}
374
Oleg Nesterovdfb388b2005-06-23 00:10:02 -0700375asmlinkage long sys_fsync(unsigned int fd)
376{
Andrew Morton18e79b42006-03-24 03:18:14 -0800377 return __do_fsync(fd, 0);
Oleg Nesterovdfb388b2005-06-23 00:10:02 -0700378}
379
Linus Torvalds1da177e2005-04-16 15:20:36 -0700380asmlinkage long sys_fdatasync(unsigned int fd)
381{
Andrew Morton18e79b42006-03-24 03:18:14 -0800382 return __do_fsync(fd, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700383}
384
385/*
386 * Various filesystems appear to want __find_get_block to be non-blocking.
387 * But it's the page lock which protects the buffers. To get around this,
388 * we get exclusion from try_to_free_buffers with the blockdev mapping's
389 * private_lock.
390 *
391 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
392 * may be quite high. This code could TryLock the page, and if that
393 * succeeds, there is no need to take private_lock. (But if
394 * private_lock is contended then so is mapping->tree_lock).
395 */
396static struct buffer_head *
Coywolf Qi Hunt385fd4c2005-11-07 00:59:39 -0800397__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700398{
399 struct inode *bd_inode = bdev->bd_inode;
400 struct address_space *bd_mapping = bd_inode->i_mapping;
401 struct buffer_head *ret = NULL;
402 pgoff_t index;
403 struct buffer_head *bh;
404 struct buffer_head *head;
405 struct page *page;
406 int all_mapped = 1;
407
408 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
409 page = find_get_page(bd_mapping, index);
410 if (!page)
411 goto out;
412
413 spin_lock(&bd_mapping->private_lock);
414 if (!page_has_buffers(page))
415 goto out_unlock;
416 head = page_buffers(page);
417 bh = head;
418 do {
419 if (bh->b_blocknr == block) {
420 ret = bh;
421 get_bh(bh);
422 goto out_unlock;
423 }
424 if (!buffer_mapped(bh))
425 all_mapped = 0;
426 bh = bh->b_this_page;
427 } while (bh != head);
428
429 /* we might be here because some of the buffers on this page are
430 * not mapped. This is due to various races between
431 * file io on the block device and getblk. It gets dealt with
432 * elsewhere, don't buffer_error if we had some unmapped buffers
433 */
434 if (all_mapped) {
435 printk("__find_get_block_slow() failed. "
436 "block=%llu, b_blocknr=%llu\n",
437 (unsigned long long)block, (unsigned long long)bh->b_blocknr);
438 printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
439 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
440 }
441out_unlock:
442 spin_unlock(&bd_mapping->private_lock);
443 page_cache_release(page);
444out:
445 return ret;
446}
447
448/* If invalidate_buffers() will trash dirty buffers, it means some kind
449 of fs corruption is going on. Trashing dirty data always imply losing
450 information that was supposed to be just stored on the physical layer
451 by the user.
452
453 Thus invalidate_buffers in general usage is not allwowed to trash
454 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
455 be preserved. These buffers are simply skipped.
456
457 We also skip buffers which are still in use. For example this can
458 happen if a userspace program is reading the block device.
459
460 NOTE: In the case where the user removed a removable-media-disk even if
461 there's still dirty data not synced on disk (due a bug in the device driver
462 or due an error of the user), by not destroying the dirty buffers we could
463 generate corruption also on the next media inserted, thus a parameter is
464 necessary to handle this case in the most safe way possible (trying
465 to not corrupt also the new disk inserted with the data belonging to
466 the old now corrupted disk). Also for the ramdisk the natural thing
467 to do in order to release the ramdisk memory is to destroy dirty buffers.
468
469 These are two special cases. Normal usage imply the device driver
470 to issue a sync on the device (without waiting I/O completion) and
471 then an invalidate_buffers call that doesn't trash dirty buffers.
472
473 For handling cache coherency with the blkdev pagecache the 'update' case
474 is been introduced. It is needed to re-read from disk any pinned
475 buffer. NOTE: re-reading from disk is destructive so we can do it only
476 when we assume nobody is changing the buffercache under our I/O and when
477 we think the disk contains more recent information than the buffercache.
478 The update == 1 pass marks the buffers we need to update, the update == 2
479 pass does the actual I/O. */
480void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
481{
482 invalidate_bh_lrus();
483 /*
484 * FIXME: what about destroy_dirty_buffers?
485 * We really want to use invalidate_inode_pages2() for
486 * that, but not until that's cleaned up.
487 */
488 invalidate_inode_pages(bdev->bd_inode->i_mapping);
489}
490
491/*
492 * Kick pdflush then try to free up some ZONE_NORMAL memory.
493 */
494static void free_more_memory(void)
495{
496 struct zone **zones;
497 pg_data_t *pgdat;
498
Pekka J Enberg687a21c2005-06-28 20:44:55 -0700499 wakeup_pdflush(1024);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700500 yield();
501
502 for_each_pgdat(pgdat) {
Al Viroaf4ca452005-10-21 02:55:38 -0400503 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700504 if (*zones)
Darren Hart1ad539b2005-06-21 17:14:53 -0700505 try_to_free_pages(zones, GFP_NOFS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700506 }
507}
508
509/*
510 * I/O completion handler for block_read_full_page() - pages
511 * which come unlocked at the end of I/O.
512 */
513static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
514{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700515 unsigned long flags;
Nick Piggina3972202005-07-07 17:56:56 -0700516 struct buffer_head *first;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700517 struct buffer_head *tmp;
518 struct page *page;
519 int page_uptodate = 1;
520
521 BUG_ON(!buffer_async_read(bh));
522
523 page = bh->b_page;
524 if (uptodate) {
525 set_buffer_uptodate(bh);
526 } else {
527 clear_buffer_uptodate(bh);
528 if (printk_ratelimit())
529 buffer_io_error(bh);
530 SetPageError(page);
531 }
532
533 /*
534 * Be _very_ careful from here on. Bad things can happen if
535 * two buffer heads end IO at almost the same time and both
536 * decide that the page is now completely done.
537 */
Nick Piggina3972202005-07-07 17:56:56 -0700538 first = page_buffers(page);
539 local_irq_save(flags);
540 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700541 clear_buffer_async_read(bh);
542 unlock_buffer(bh);
543 tmp = bh;
544 do {
545 if (!buffer_uptodate(tmp))
546 page_uptodate = 0;
547 if (buffer_async_read(tmp)) {
548 BUG_ON(!buffer_locked(tmp));
549 goto still_busy;
550 }
551 tmp = tmp->b_this_page;
552 } while (tmp != bh);
Nick Piggina3972202005-07-07 17:56:56 -0700553 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
554 local_irq_restore(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700555
556 /*
557 * If none of the buffers had errors and they are all
558 * uptodate then we can set the page uptodate.
559 */
560 if (page_uptodate && !PageError(page))
561 SetPageUptodate(page);
562 unlock_page(page);
563 return;
564
565still_busy:
Nick Piggina3972202005-07-07 17:56:56 -0700566 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
567 local_irq_restore(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700568 return;
569}
570
571/*
572 * Completion handler for block_write_full_page() - pages which are unlocked
573 * during I/O, and which have PageWriteback cleared upon I/O completion.
574 */
575void end_buffer_async_write(struct buffer_head *bh, int uptodate)
576{
577 char b[BDEVNAME_SIZE];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700578 unsigned long flags;
Nick Piggina3972202005-07-07 17:56:56 -0700579 struct buffer_head *first;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700580 struct buffer_head *tmp;
581 struct page *page;
582
583 BUG_ON(!buffer_async_write(bh));
584
585 page = bh->b_page;
586 if (uptodate) {
587 set_buffer_uptodate(bh);
588 } else {
589 if (printk_ratelimit()) {
590 buffer_io_error(bh);
591 printk(KERN_WARNING "lost page write due to "
592 "I/O error on %s\n",
593 bdevname(bh->b_bdev, b));
594 }
595 set_bit(AS_EIO, &page->mapping->flags);
596 clear_buffer_uptodate(bh);
597 SetPageError(page);
598 }
599
Nick Piggina3972202005-07-07 17:56:56 -0700600 first = page_buffers(page);
601 local_irq_save(flags);
602 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
603
Linus Torvalds1da177e2005-04-16 15:20:36 -0700604 clear_buffer_async_write(bh);
605 unlock_buffer(bh);
606 tmp = bh->b_this_page;
607 while (tmp != bh) {
608 if (buffer_async_write(tmp)) {
609 BUG_ON(!buffer_locked(tmp));
610 goto still_busy;
611 }
612 tmp = tmp->b_this_page;
613 }
Nick Piggina3972202005-07-07 17:56:56 -0700614 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
615 local_irq_restore(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700616 end_page_writeback(page);
617 return;
618
619still_busy:
Nick Piggina3972202005-07-07 17:56:56 -0700620 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
621 local_irq_restore(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700622 return;
623}
624
625/*
626 * If a page's buffers are under async readin (end_buffer_async_read
627 * completion) then there is a possibility that another thread of
628 * control could lock one of the buffers after it has completed
629 * but while some of the other buffers have not completed. This
630 * locked buffer would confuse end_buffer_async_read() into not unlocking
631 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
632 * that this buffer is not under async I/O.
633 *
634 * The page comes unlocked when it has no locked buffer_async buffers
635 * left.
636 *
637 * PageLocked prevents anyone starting new async I/O reads any of
638 * the buffers.
639 *
640 * PageWriteback is used to prevent simultaneous writeout of the same
641 * page.
642 *
643 * PageLocked prevents anyone from starting writeback of a page which is
644 * under read I/O (PageWriteback is only ever set against a locked page).
645 */
646static void mark_buffer_async_read(struct buffer_head *bh)
647{
648 bh->b_end_io = end_buffer_async_read;
649 set_buffer_async_read(bh);
650}
651
652void mark_buffer_async_write(struct buffer_head *bh)
653{
654 bh->b_end_io = end_buffer_async_write;
655 set_buffer_async_write(bh);
656}
657EXPORT_SYMBOL(mark_buffer_async_write);
658
659
660/*
661 * fs/buffer.c contains helper functions for buffer-backed address space's
662 * fsync functions. A common requirement for buffer-based filesystems is
663 * that certain data from the backing blockdev needs to be written out for
664 * a successful fsync(). For example, ext2 indirect blocks need to be
665 * written back and waited upon before fsync() returns.
666 *
667 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
668 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
669 * management of a list of dependent buffers at ->i_mapping->private_list.
670 *
671 * Locking is a little subtle: try_to_free_buffers() will remove buffers
672 * from their controlling inode's queue when they are being freed. But
673 * try_to_free_buffers() will be operating against the *blockdev* mapping
674 * at the time, not against the S_ISREG file which depends on those buffers.
675 * So the locking for private_list is via the private_lock in the address_space
676 * which backs the buffers. Which is different from the address_space
677 * against which the buffers are listed. So for a particular address_space,
678 * mapping->private_lock does *not* protect mapping->private_list! In fact,
679 * mapping->private_list will always be protected by the backing blockdev's
680 * ->private_lock.
681 *
682 * Which introduces a requirement: all buffers on an address_space's
683 * ->private_list must be from the same address_space: the blockdev's.
684 *
685 * address_spaces which do not place buffers at ->private_list via these
686 * utility functions are free to use private_lock and private_list for
687 * whatever they want. The only requirement is that list_empty(private_list)
688 * be true at clear_inode() time.
689 *
690 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
691 * filesystems should do that. invalidate_inode_buffers() should just go
692 * BUG_ON(!list_empty).
693 *
694 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
695 * take an address_space, not an inode. And it should be called
696 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
697 * queued up.
698 *
699 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
700 * list if it is already on a list. Because if the buffer is on a list,
701 * it *must* already be on the right one. If not, the filesystem is being
702 * silly. This will save a ton of locking. But first we have to ensure
703 * that buffers are taken *off* the old inode's list when they are freed
704 * (presumably in truncate). That requires careful auditing of all
705 * filesystems (do it inside bforget()). It could also be done by bringing
706 * b_inode back.
707 */
708
709/*
710 * The buffer's backing address_space's private_lock must be held
711 */
712static inline void __remove_assoc_queue(struct buffer_head *bh)
713{
714 list_del_init(&bh->b_assoc_buffers);
715}
716
717int inode_has_buffers(struct inode *inode)
718{
719 return !list_empty(&inode->i_data.private_list);
720}
721
722/*
723 * osync is designed to support O_SYNC io. It waits synchronously for
724 * all already-submitted IO to complete, but does not queue any new
725 * writes to the disk.
726 *
727 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
728 * you dirty the buffers, and then use osync_inode_buffers to wait for
729 * completion. Any other dirty buffers which are not yet queued for
730 * write will not be flushed to disk by the osync.
731 */
732static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
733{
734 struct buffer_head *bh;
735 struct list_head *p;
736 int err = 0;
737
738 spin_lock(lock);
739repeat:
740 list_for_each_prev(p, list) {
741 bh = BH_ENTRY(p);
742 if (buffer_locked(bh)) {
743 get_bh(bh);
744 spin_unlock(lock);
745 wait_on_buffer(bh);
746 if (!buffer_uptodate(bh))
747 err = -EIO;
748 brelse(bh);
749 spin_lock(lock);
750 goto repeat;
751 }
752 }
753 spin_unlock(lock);
754 return err;
755}
756
757/**
758 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
759 * buffers
Martin Waitz67be2dd2005-05-01 08:59:26 -0700760 * @mapping: the mapping which wants those buffers written
Linus Torvalds1da177e2005-04-16 15:20:36 -0700761 *
762 * Starts I/O against the buffers at mapping->private_list, and waits upon
763 * that I/O.
764 *
Martin Waitz67be2dd2005-05-01 08:59:26 -0700765 * Basically, this is a convenience function for fsync().
766 * @mapping is a file or directory which needs those buffers to be written for
767 * a successful fsync().
Linus Torvalds1da177e2005-04-16 15:20:36 -0700768 */
769int sync_mapping_buffers(struct address_space *mapping)
770{
771 struct address_space *buffer_mapping = mapping->assoc_mapping;
772
773 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
774 return 0;
775
776 return fsync_buffers_list(&buffer_mapping->private_lock,
777 &mapping->private_list);
778}
779EXPORT_SYMBOL(sync_mapping_buffers);
780
781/*
782 * Called when we've recently written block `bblock', and it is known that
783 * `bblock' was for a buffer_boundary() buffer. This means that the block at
784 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
785 * dirty, schedule it for IO. So that indirects merge nicely with their data.
786 */
787void write_boundary_block(struct block_device *bdev,
788 sector_t bblock, unsigned blocksize)
789{
790 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
791 if (bh) {
792 if (buffer_dirty(bh))
793 ll_rw_block(WRITE, 1, &bh);
794 put_bh(bh);
795 }
796}
797
798void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
799{
800 struct address_space *mapping = inode->i_mapping;
801 struct address_space *buffer_mapping = bh->b_page->mapping;
802
803 mark_buffer_dirty(bh);
804 if (!mapping->assoc_mapping) {
805 mapping->assoc_mapping = buffer_mapping;
806 } else {
807 if (mapping->assoc_mapping != buffer_mapping)
808 BUG();
809 }
810 if (list_empty(&bh->b_assoc_buffers)) {
811 spin_lock(&buffer_mapping->private_lock);
812 list_move_tail(&bh->b_assoc_buffers,
813 &mapping->private_list);
814 spin_unlock(&buffer_mapping->private_lock);
815 }
816}
817EXPORT_SYMBOL(mark_buffer_dirty_inode);
818
819/*
820 * Add a page to the dirty page list.
821 *
822 * It is a sad fact of life that this function is called from several places
823 * deeply under spinlocking. It may not sleep.
824 *
825 * If the page has buffers, the uptodate buffers are set dirty, to preserve
826 * dirty-state coherency between the page and the buffers. It the page does
827 * not have buffers then when they are later attached they will all be set
828 * dirty.
829 *
830 * The buffers are dirtied before the page is dirtied. There's a small race
831 * window in which a writepage caller may see the page cleanness but not the
832 * buffer dirtiness. That's fine. If this code were to set the page dirty
833 * before the buffers, a concurrent writepage caller could clear the page dirty
834 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
835 * page on the dirty page list.
836 *
837 * We use private_lock to lock against try_to_free_buffers while using the
838 * page's buffer list. Also use this to protect against clean buffers being
839 * added to the page after it was set dirty.
840 *
841 * FIXME: may need to call ->reservepage here as well. That's rather up to the
842 * address_space though.
843 */
844int __set_page_dirty_buffers(struct page *page)
845{
846 struct address_space * const mapping = page->mapping;
847
848 spin_lock(&mapping->private_lock);
849 if (page_has_buffers(page)) {
850 struct buffer_head *head = page_buffers(page);
851 struct buffer_head *bh = head;
852
853 do {
854 set_buffer_dirty(bh);
855 bh = bh->b_this_page;
856 } while (bh != head);
857 }
858 spin_unlock(&mapping->private_lock);
859
860 if (!TestSetPageDirty(page)) {
861 write_lock_irq(&mapping->tree_lock);
862 if (page->mapping) { /* Race with truncate? */
863 if (mapping_cap_account_dirty(mapping))
864 inc_page_state(nr_dirty);
865 radix_tree_tag_set(&mapping->page_tree,
866 page_index(page),
867 PAGECACHE_TAG_DIRTY);
868 }
869 write_unlock_irq(&mapping->tree_lock);
870 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
Andrew Morton4741c9f2006-03-24 03:18:11 -0800871 return 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700872 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700873 return 0;
874}
875EXPORT_SYMBOL(__set_page_dirty_buffers);
876
877/*
878 * Write out and wait upon a list of buffers.
879 *
880 * We have conflicting pressures: we want to make sure that all
881 * initially dirty buffers get waited on, but that any subsequently
882 * dirtied buffers don't. After all, we don't want fsync to last
883 * forever if somebody is actively writing to the file.
884 *
885 * Do this in two main stages: first we copy dirty buffers to a
886 * temporary inode list, queueing the writes as we go. Then we clean
887 * up, waiting for those writes to complete.
888 *
889 * During this second stage, any subsequent updates to the file may end
890 * up refiling the buffer on the original inode's dirty list again, so
891 * there is a chance we will end up with a buffer queued for write but
892 * not yet completed on that list. So, as a final cleanup we go through
893 * the osync code to catch these locked, dirty buffers without requeuing
894 * any newly dirty buffers for write.
895 */
896static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
897{
898 struct buffer_head *bh;
899 struct list_head tmp;
900 int err = 0, err2;
901
902 INIT_LIST_HEAD(&tmp);
903
904 spin_lock(lock);
905 while (!list_empty(list)) {
906 bh = BH_ENTRY(list->next);
907 list_del_init(&bh->b_assoc_buffers);
908 if (buffer_dirty(bh) || buffer_locked(bh)) {
909 list_add(&bh->b_assoc_buffers, &tmp);
910 if (buffer_dirty(bh)) {
911 get_bh(bh);
912 spin_unlock(lock);
913 /*
914 * Ensure any pending I/O completes so that
915 * ll_rw_block() actually writes the current
916 * contents - it is a noop if I/O is still in
917 * flight on potentially older contents.
918 */
Jan Karaa7662232005-09-06 15:19:10 -0700919 ll_rw_block(SWRITE, 1, &bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700920 brelse(bh);
921 spin_lock(lock);
922 }
923 }
924 }
925
926 while (!list_empty(&tmp)) {
927 bh = BH_ENTRY(tmp.prev);
928 __remove_assoc_queue(bh);
929 get_bh(bh);
930 spin_unlock(lock);
931 wait_on_buffer(bh);
932 if (!buffer_uptodate(bh))
933 err = -EIO;
934 brelse(bh);
935 spin_lock(lock);
936 }
937
938 spin_unlock(lock);
939 err2 = osync_buffers_list(lock, list);
940 if (err)
941 return err;
942 else
943 return err2;
944}
945
946/*
947 * Invalidate any and all dirty buffers on a given inode. We are
948 * probably unmounting the fs, but that doesn't mean we have already
949 * done a sync(). Just drop the buffers from the inode list.
950 *
951 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
952 * assumes that all the buffers are against the blockdev. Not true
953 * for reiserfs.
954 */
955void invalidate_inode_buffers(struct inode *inode)
956{
957 if (inode_has_buffers(inode)) {
958 struct address_space *mapping = &inode->i_data;
959 struct list_head *list = &mapping->private_list;
960 struct address_space *buffer_mapping = mapping->assoc_mapping;
961
962 spin_lock(&buffer_mapping->private_lock);
963 while (!list_empty(list))
964 __remove_assoc_queue(BH_ENTRY(list->next));
965 spin_unlock(&buffer_mapping->private_lock);
966 }
967}
968
969/*
970 * Remove any clean buffers from the inode's buffer list. This is called
971 * when we're trying to free the inode itself. Those buffers can pin it.
972 *
973 * Returns true if all buffers were removed.
974 */
975int remove_inode_buffers(struct inode *inode)
976{
977 int ret = 1;
978
979 if (inode_has_buffers(inode)) {
980 struct address_space *mapping = &inode->i_data;
981 struct list_head *list = &mapping->private_list;
982 struct address_space *buffer_mapping = mapping->assoc_mapping;
983
984 spin_lock(&buffer_mapping->private_lock);
985 while (!list_empty(list)) {
986 struct buffer_head *bh = BH_ENTRY(list->next);
987 if (buffer_dirty(bh)) {
988 ret = 0;
989 break;
990 }
991 __remove_assoc_queue(bh);
992 }
993 spin_unlock(&buffer_mapping->private_lock);
994 }
995 return ret;
996}
997
998/*
999 * Create the appropriate buffers when given a page for data area and
1000 * the size of each buffer.. Use the bh->b_this_page linked list to
1001 * follow the buffers created. Return NULL if unable to create more
1002 * buffers.
1003 *
1004 * The retry flag is used to differentiate async IO (paging, swapping)
1005 * which may not fail from ordinary buffer allocations.
1006 */
1007struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
1008 int retry)
1009{
1010 struct buffer_head *bh, *head;
1011 long offset;
1012
1013try_again:
1014 head = NULL;
1015 offset = PAGE_SIZE;
1016 while ((offset -= size) >= 0) {
1017 bh = alloc_buffer_head(GFP_NOFS);
1018 if (!bh)
1019 goto no_grow;
1020
1021 bh->b_bdev = NULL;
1022 bh->b_this_page = head;
1023 bh->b_blocknr = -1;
1024 head = bh;
1025
1026 bh->b_state = 0;
1027 atomic_set(&bh->b_count, 0);
Chris Masonfc5cd582006-02-01 03:06:48 -08001028 bh->b_private = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001029 bh->b_size = size;
1030
1031 /* Link the buffer to its page */
1032 set_bh_page(bh, page, offset);
1033
Nathan Scott01ffe332006-01-17 09:02:07 +11001034 init_buffer(bh, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001035 }
1036 return head;
1037/*
1038 * In case anything failed, we just free everything we got.
1039 */
1040no_grow:
1041 if (head) {
1042 do {
1043 bh = head;
1044 head = head->b_this_page;
1045 free_buffer_head(bh);
1046 } while (head);
1047 }
1048
1049 /*
1050 * Return failure for non-async IO requests. Async IO requests
1051 * are not allowed to fail, so we have to wait until buffer heads
1052 * become available. But we don't want tasks sleeping with
1053 * partially complete buffers, so all were released above.
1054 */
1055 if (!retry)
1056 return NULL;
1057
1058 /* We're _really_ low on memory. Now we just
1059 * wait for old buffer heads to become free due to
1060 * finishing IO. Since this is an async request and
1061 * the reserve list is empty, we're sure there are
1062 * async buffer heads in use.
1063 */
1064 free_more_memory();
1065 goto try_again;
1066}
1067EXPORT_SYMBOL_GPL(alloc_page_buffers);
1068
1069static inline void
1070link_dev_buffers(struct page *page, struct buffer_head *head)
1071{
1072 struct buffer_head *bh, *tail;
1073
1074 bh = head;
1075 do {
1076 tail = bh;
1077 bh = bh->b_this_page;
1078 } while (bh);
1079 tail->b_this_page = head;
1080 attach_page_buffers(page, head);
1081}
1082
1083/*
1084 * Initialise the state of a blockdev page's buffers.
1085 */
1086static void
1087init_page_buffers(struct page *page, struct block_device *bdev,
1088 sector_t block, int size)
1089{
1090 struct buffer_head *head = page_buffers(page);
1091 struct buffer_head *bh = head;
1092 int uptodate = PageUptodate(page);
1093
1094 do {
1095 if (!buffer_mapped(bh)) {
1096 init_buffer(bh, NULL, NULL);
1097 bh->b_bdev = bdev;
1098 bh->b_blocknr = block;
1099 if (uptodate)
1100 set_buffer_uptodate(bh);
1101 set_buffer_mapped(bh);
1102 }
1103 block++;
1104 bh = bh->b_this_page;
1105 } while (bh != head);
1106}
1107
1108/*
1109 * Create the page-cache page that contains the requested block.
1110 *
1111 * This is user purely for blockdev mappings.
1112 */
1113static struct page *
1114grow_dev_page(struct block_device *bdev, sector_t block,
1115 pgoff_t index, int size)
1116{
1117 struct inode *inode = bdev->bd_inode;
1118 struct page *page;
1119 struct buffer_head *bh;
1120
1121 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1122 if (!page)
1123 return NULL;
1124
1125 if (!PageLocked(page))
1126 BUG();
1127
1128 if (page_has_buffers(page)) {
1129 bh = page_buffers(page);
1130 if (bh->b_size == size) {
1131 init_page_buffers(page, bdev, block, size);
1132 return page;
1133 }
1134 if (!try_to_free_buffers(page))
1135 goto failed;
1136 }
1137
1138 /*
1139 * Allocate some buffers for this page
1140 */
1141 bh = alloc_page_buffers(page, size, 0);
1142 if (!bh)
1143 goto failed;
1144
1145 /*
1146 * Link the page to the buffers and initialise them. Take the
1147 * lock to be atomic wrt __find_get_block(), which does not
1148 * run under the page lock.
1149 */
1150 spin_lock(&inode->i_mapping->private_lock);
1151 link_dev_buffers(page, bh);
1152 init_page_buffers(page, bdev, block, size);
1153 spin_unlock(&inode->i_mapping->private_lock);
1154 return page;
1155
1156failed:
1157 BUG();
1158 unlock_page(page);
1159 page_cache_release(page);
1160 return NULL;
1161}
1162
1163/*
1164 * Create buffers for the specified block device block's page. If
1165 * that page was dirty, the buffers are set dirty also.
1166 *
1167 * Except that's a bug. Attaching dirty buffers to a dirty
1168 * blockdev's page can result in filesystem corruption, because
1169 * some of those buffers may be aliases of filesystem data.
1170 * grow_dev_page() will go BUG() if this happens.
1171 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08001172static int
Linus Torvalds1da177e2005-04-16 15:20:36 -07001173grow_buffers(struct block_device *bdev, sector_t block, int size)
1174{
1175 struct page *page;
1176 pgoff_t index;
1177 int sizebits;
1178
1179 sizebits = -1;
1180 do {
1181 sizebits++;
1182 } while ((size << sizebits) < PAGE_SIZE);
1183
1184 index = block >> sizebits;
1185 block = index << sizebits;
1186
1187 /* Create a page with the proper size buffers.. */
1188 page = grow_dev_page(bdev, block, index, size);
1189 if (!page)
1190 return 0;
1191 unlock_page(page);
1192 page_cache_release(page);
1193 return 1;
1194}
1195
Adrian Bunk75c96f82005-05-05 16:16:09 -07001196static struct buffer_head *
Linus Torvalds1da177e2005-04-16 15:20:36 -07001197__getblk_slow(struct block_device *bdev, sector_t block, int size)
1198{
1199 /* Size must be multiple of hard sectorsize */
1200 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1201 (size < 512 || size > PAGE_SIZE))) {
1202 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1203 size);
1204 printk(KERN_ERR "hardsect size: %d\n",
1205 bdev_hardsect_size(bdev));
1206
1207 dump_stack();
1208 return NULL;
1209 }
1210
1211 for (;;) {
1212 struct buffer_head * bh;
1213
1214 bh = __find_get_block(bdev, block, size);
1215 if (bh)
1216 return bh;
1217
1218 if (!grow_buffers(bdev, block, size))
1219 free_more_memory();
1220 }
1221}
1222
1223/*
1224 * The relationship between dirty buffers and dirty pages:
1225 *
1226 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1227 * the page is tagged dirty in its radix tree.
1228 *
1229 * At all times, the dirtiness of the buffers represents the dirtiness of
1230 * subsections of the page. If the page has buffers, the page dirty bit is
1231 * merely a hint about the true dirty state.
1232 *
1233 * When a page is set dirty in its entirety, all its buffers are marked dirty
1234 * (if the page has buffers).
1235 *
1236 * When a buffer is marked dirty, its page is dirtied, but the page's other
1237 * buffers are not.
1238 *
1239 * Also. When blockdev buffers are explicitly read with bread(), they
1240 * individually become uptodate. But their backing page remains not
1241 * uptodate - even if all of its buffers are uptodate. A subsequent
1242 * block_read_full_page() against that page will discover all the uptodate
1243 * buffers, will set the page uptodate and will perform no I/O.
1244 */
1245
1246/**
1247 * mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz67be2dd2005-05-01 08:59:26 -07001248 * @bh: the buffer_head to mark dirty
Linus Torvalds1da177e2005-04-16 15:20:36 -07001249 *
1250 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1251 * backing page dirty, then tag the page as dirty in its address_space's radix
1252 * tree and then attach the address_space's inode to its superblock's dirty
1253 * inode list.
1254 *
1255 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1256 * mapping->tree_lock and the global inode_lock.
1257 */
1258void fastcall mark_buffer_dirty(struct buffer_head *bh)
1259{
1260 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1261 __set_page_dirty_nobuffers(bh->b_page);
1262}
1263
1264/*
1265 * Decrement a buffer_head's reference count. If all buffers against a page
1266 * have zero reference count, are clean and unlocked, and if the page is clean
1267 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1268 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1269 * a page but it ends up not being freed, and buffers may later be reattached).
1270 */
1271void __brelse(struct buffer_head * buf)
1272{
1273 if (atomic_read(&buf->b_count)) {
1274 put_bh(buf);
1275 return;
1276 }
1277 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1278 WARN_ON(1);
1279}
1280
1281/*
1282 * bforget() is like brelse(), except it discards any
1283 * potentially dirty data.
1284 */
1285void __bforget(struct buffer_head *bh)
1286{
1287 clear_buffer_dirty(bh);
1288 if (!list_empty(&bh->b_assoc_buffers)) {
1289 struct address_space *buffer_mapping = bh->b_page->mapping;
1290
1291 spin_lock(&buffer_mapping->private_lock);
1292 list_del_init(&bh->b_assoc_buffers);
1293 spin_unlock(&buffer_mapping->private_lock);
1294 }
1295 __brelse(bh);
1296}
1297
1298static struct buffer_head *__bread_slow(struct buffer_head *bh)
1299{
1300 lock_buffer(bh);
1301 if (buffer_uptodate(bh)) {
1302 unlock_buffer(bh);
1303 return bh;
1304 } else {
1305 get_bh(bh);
1306 bh->b_end_io = end_buffer_read_sync;
1307 submit_bh(READ, bh);
1308 wait_on_buffer(bh);
1309 if (buffer_uptodate(bh))
1310 return bh;
1311 }
1312 brelse(bh);
1313 return NULL;
1314}
1315
1316/*
1317 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1318 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1319 * refcount elevated by one when they're in an LRU. A buffer can only appear
1320 * once in a particular CPU's LRU. A single buffer can be present in multiple
1321 * CPU's LRUs at the same time.
1322 *
1323 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1324 * sb_find_get_block().
1325 *
1326 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1327 * a local interrupt disable for that.
1328 */
1329
1330#define BH_LRU_SIZE 8
1331
1332struct bh_lru {
1333 struct buffer_head *bhs[BH_LRU_SIZE];
1334};
1335
1336static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1337
1338#ifdef CONFIG_SMP
1339#define bh_lru_lock() local_irq_disable()
1340#define bh_lru_unlock() local_irq_enable()
1341#else
1342#define bh_lru_lock() preempt_disable()
1343#define bh_lru_unlock() preempt_enable()
1344#endif
1345
1346static inline void check_irqs_on(void)
1347{
1348#ifdef irqs_disabled
1349 BUG_ON(irqs_disabled());
1350#endif
1351}
1352
1353/*
1354 * The LRU management algorithm is dopey-but-simple. Sorry.
1355 */
1356static void bh_lru_install(struct buffer_head *bh)
1357{
1358 struct buffer_head *evictee = NULL;
1359 struct bh_lru *lru;
1360
1361 check_irqs_on();
1362 bh_lru_lock();
1363 lru = &__get_cpu_var(bh_lrus);
1364 if (lru->bhs[0] != bh) {
1365 struct buffer_head *bhs[BH_LRU_SIZE];
1366 int in;
1367 int out = 0;
1368
1369 get_bh(bh);
1370 bhs[out++] = bh;
1371 for (in = 0; in < BH_LRU_SIZE; in++) {
1372 struct buffer_head *bh2 = lru->bhs[in];
1373
1374 if (bh2 == bh) {
1375 __brelse(bh2);
1376 } else {
1377 if (out >= BH_LRU_SIZE) {
1378 BUG_ON(evictee != NULL);
1379 evictee = bh2;
1380 } else {
1381 bhs[out++] = bh2;
1382 }
1383 }
1384 }
1385 while (out < BH_LRU_SIZE)
1386 bhs[out++] = NULL;
1387 memcpy(lru->bhs, bhs, sizeof(bhs));
1388 }
1389 bh_lru_unlock();
1390
1391 if (evictee)
1392 __brelse(evictee);
1393}
1394
1395/*
1396 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1397 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08001398static struct buffer_head *
Linus Torvalds1da177e2005-04-16 15:20:36 -07001399lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1400{
1401 struct buffer_head *ret = NULL;
1402 struct bh_lru *lru;
1403 int i;
1404
1405 check_irqs_on();
1406 bh_lru_lock();
1407 lru = &__get_cpu_var(bh_lrus);
1408 for (i = 0; i < BH_LRU_SIZE; i++) {
1409 struct buffer_head *bh = lru->bhs[i];
1410
1411 if (bh && bh->b_bdev == bdev &&
1412 bh->b_blocknr == block && bh->b_size == size) {
1413 if (i) {
1414 while (i) {
1415 lru->bhs[i] = lru->bhs[i - 1];
1416 i--;
1417 }
1418 lru->bhs[0] = bh;
1419 }
1420 get_bh(bh);
1421 ret = bh;
1422 break;
1423 }
1424 }
1425 bh_lru_unlock();
1426 return ret;
1427}
1428
1429/*
1430 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1431 * it in the LRU and mark it as accessed. If it is not present then return
1432 * NULL
1433 */
1434struct buffer_head *
1435__find_get_block(struct block_device *bdev, sector_t block, int size)
1436{
1437 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1438
1439 if (bh == NULL) {
Coywolf Qi Hunt385fd4c2005-11-07 00:59:39 -08001440 bh = __find_get_block_slow(bdev, block);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001441 if (bh)
1442 bh_lru_install(bh);
1443 }
1444 if (bh)
1445 touch_buffer(bh);
1446 return bh;
1447}
1448EXPORT_SYMBOL(__find_get_block);
1449
1450/*
1451 * __getblk will locate (and, if necessary, create) the buffer_head
1452 * which corresponds to the passed block_device, block and size. The
1453 * returned buffer has its reference count incremented.
1454 *
1455 * __getblk() cannot fail - it just keeps trying. If you pass it an
1456 * illegal block number, __getblk() will happily return a buffer_head
1457 * which represents the non-existent block. Very weird.
1458 *
1459 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1460 * attempt is failing. FIXME, perhaps?
1461 */
1462struct buffer_head *
1463__getblk(struct block_device *bdev, sector_t block, int size)
1464{
1465 struct buffer_head *bh = __find_get_block(bdev, block, size);
1466
1467 might_sleep();
1468 if (bh == NULL)
1469 bh = __getblk_slow(bdev, block, size);
1470 return bh;
1471}
1472EXPORT_SYMBOL(__getblk);
1473
1474/*
1475 * Do async read-ahead on a buffer..
1476 */
1477void __breadahead(struct block_device *bdev, sector_t block, int size)
1478{
1479 struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Mortona3e713b2005-10-30 15:03:15 -08001480 if (likely(bh)) {
1481 ll_rw_block(READA, 1, &bh);
1482 brelse(bh);
1483 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001484}
1485EXPORT_SYMBOL(__breadahead);
1486
1487/**
1488 * __bread() - reads a specified block and returns the bh
Martin Waitz67be2dd2005-05-01 08:59:26 -07001489 * @bdev: the block_device to read from
Linus Torvalds1da177e2005-04-16 15:20:36 -07001490 * @block: number of block
1491 * @size: size (in bytes) to read
1492 *
1493 * Reads a specified block, and returns buffer head that contains it.
1494 * It returns NULL if the block was unreadable.
1495 */
1496struct buffer_head *
1497__bread(struct block_device *bdev, sector_t block, int size)
1498{
1499 struct buffer_head *bh = __getblk(bdev, block, size);
1500
Andrew Mortona3e713b2005-10-30 15:03:15 -08001501 if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001502 bh = __bread_slow(bh);
1503 return bh;
1504}
1505EXPORT_SYMBOL(__bread);
1506
1507/*
1508 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1509 * This doesn't race because it runs in each cpu either in irq
1510 * or with preempt disabled.
1511 */
1512static void invalidate_bh_lru(void *arg)
1513{
1514 struct bh_lru *b = &get_cpu_var(bh_lrus);
1515 int i;
1516
1517 for (i = 0; i < BH_LRU_SIZE; i++) {
1518 brelse(b->bhs[i]);
1519 b->bhs[i] = NULL;
1520 }
1521 put_cpu_var(bh_lrus);
1522}
1523
1524static void invalidate_bh_lrus(void)
1525{
1526 on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1527}
1528
1529void set_bh_page(struct buffer_head *bh,
1530 struct page *page, unsigned long offset)
1531{
1532 bh->b_page = page;
1533 if (offset >= PAGE_SIZE)
1534 BUG();
1535 if (PageHighMem(page))
1536 /*
1537 * This catches illegal uses and preserves the offset:
1538 */
1539 bh->b_data = (char *)(0 + offset);
1540 else
1541 bh->b_data = page_address(page) + offset;
1542}
1543EXPORT_SYMBOL(set_bh_page);
1544
1545/*
1546 * Called when truncating a buffer on a page completely.
1547 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08001548static void discard_buffer(struct buffer_head * bh)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549{
1550 lock_buffer(bh);
1551 clear_buffer_dirty(bh);
1552 bh->b_bdev = NULL;
1553 clear_buffer_mapped(bh);
1554 clear_buffer_req(bh);
1555 clear_buffer_new(bh);
1556 clear_buffer_delay(bh);
1557 unlock_buffer(bh);
1558}
1559
1560/**
1561 * try_to_release_page() - release old fs-specific metadata on a page
1562 *
1563 * @page: the page which the kernel is trying to free
1564 * @gfp_mask: memory allocation flags (and I/O mode)
1565 *
1566 * The address_space is to try to release any data against the page
1567 * (presumably at page->private). If the release was successful, return `1'.
1568 * Otherwise return zero.
1569 *
1570 * The @gfp_mask argument specifies whether I/O may be performed to release
1571 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1572 *
1573 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1574 */
Al Viro27496a82005-10-21 03:20:48 -04001575int try_to_release_page(struct page *page, gfp_t gfp_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001576{
1577 struct address_space * const mapping = page->mapping;
1578
1579 BUG_ON(!PageLocked(page));
1580 if (PageWriteback(page))
1581 return 0;
1582
1583 if (mapping && mapping->a_ops->releasepage)
1584 return mapping->a_ops->releasepage(page, gfp_mask);
1585 return try_to_free_buffers(page);
1586}
1587EXPORT_SYMBOL(try_to_release_page);
1588
1589/**
1590 * block_invalidatepage - invalidate part of all of a buffer-backed page
1591 *
1592 * @page: the page which is affected
1593 * @offset: the index of the truncation point
1594 *
1595 * block_invalidatepage() is called when all or part of the page has become
1596 * invalidatedby a truncate operation.
1597 *
1598 * block_invalidatepage() does not have to release all buffers, but it must
1599 * ensure that no dirty buffer is left outside @offset and that no I/O
1600 * is underway against any of the blocks which are outside the truncation
1601 * point. Because the caller is about to free (and possibly reuse) those
1602 * blocks on-disk.
1603 */
1604int block_invalidatepage(struct page *page, unsigned long offset)
1605{
1606 struct buffer_head *head, *bh, *next;
1607 unsigned int curr_off = 0;
1608 int ret = 1;
1609
1610 BUG_ON(!PageLocked(page));
1611 if (!page_has_buffers(page))
1612 goto out;
1613
1614 head = page_buffers(page);
1615 bh = head;
1616 do {
1617 unsigned int next_off = curr_off + bh->b_size;
1618 next = bh->b_this_page;
1619
1620 /*
1621 * is this block fully invalidated?
1622 */
1623 if (offset <= curr_off)
1624 discard_buffer(bh);
1625 curr_off = next_off;
1626 bh = next;
1627 } while (bh != head);
1628
1629 /*
1630 * We release buffers only if the entire page is being invalidated.
1631 * The get_block cached value has been unconditionally invalidated,
1632 * so real IO is not possible anymore.
1633 */
1634 if (offset == 0)
1635 ret = try_to_release_page(page, 0);
1636out:
1637 return ret;
1638}
1639EXPORT_SYMBOL(block_invalidatepage);
1640
Jan Karaaaa40592005-10-30 15:00:16 -08001641int do_invalidatepage(struct page *page, unsigned long offset)
1642{
1643 int (*invalidatepage)(struct page *, unsigned long);
1644 invalidatepage = page->mapping->a_ops->invalidatepage;
1645 if (invalidatepage == NULL)
1646 invalidatepage = block_invalidatepage;
1647 return (*invalidatepage)(page, offset);
1648}
1649
Linus Torvalds1da177e2005-04-16 15:20:36 -07001650/*
1651 * We attach and possibly dirty the buffers atomically wrt
1652 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1653 * is already excluded via the page lock.
1654 */
1655void create_empty_buffers(struct page *page,
1656 unsigned long blocksize, unsigned long b_state)
1657{
1658 struct buffer_head *bh, *head, *tail;
1659
1660 head = alloc_page_buffers(page, blocksize, 1);
1661 bh = head;
1662 do {
1663 bh->b_state |= b_state;
1664 tail = bh;
1665 bh = bh->b_this_page;
1666 } while (bh);
1667 tail->b_this_page = head;
1668
1669 spin_lock(&page->mapping->private_lock);
1670 if (PageUptodate(page) || PageDirty(page)) {
1671 bh = head;
1672 do {
1673 if (PageDirty(page))
1674 set_buffer_dirty(bh);
1675 if (PageUptodate(page))
1676 set_buffer_uptodate(bh);
1677 bh = bh->b_this_page;
1678 } while (bh != head);
1679 }
1680 attach_page_buffers(page, head);
1681 spin_unlock(&page->mapping->private_lock);
1682}
1683EXPORT_SYMBOL(create_empty_buffers);
1684
1685/*
1686 * We are taking a block for data and we don't want any output from any
1687 * buffer-cache aliases starting from return from that function and
1688 * until the moment when something will explicitly mark the buffer
1689 * dirty (hopefully that will not happen until we will free that block ;-)
1690 * We don't even need to mark it not-uptodate - nobody can expect
1691 * anything from a newly allocated buffer anyway. We used to used
1692 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1693 * don't want to mark the alias unmapped, for example - it would confuse
1694 * anyone who might pick it with bread() afterwards...
1695 *
1696 * Also.. Note that bforget() doesn't lock the buffer. So there can
1697 * be writeout I/O going on against recently-freed buffers. We don't
1698 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1699 * only if we really need to. That happens here.
1700 */
1701void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1702{
1703 struct buffer_head *old_bh;
1704
1705 might_sleep();
1706
Coywolf Qi Hunt385fd4c2005-11-07 00:59:39 -08001707 old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708 if (old_bh) {
1709 clear_buffer_dirty(old_bh);
1710 wait_on_buffer(old_bh);
1711 clear_buffer_req(old_bh);
1712 __brelse(old_bh);
1713 }
1714}
1715EXPORT_SYMBOL(unmap_underlying_metadata);
1716
1717/*
1718 * NOTE! All mapped/uptodate combinations are valid:
1719 *
1720 * Mapped Uptodate Meaning
1721 *
1722 * No No "unknown" - must do get_block()
1723 * No Yes "hole" - zero-filled
1724 * Yes No "allocated" - allocated on disk, not read in
1725 * Yes Yes "valid" - allocated and up-to-date in memory.
1726 *
1727 * "Dirty" is valid only with the last case (mapped+uptodate).
1728 */
1729
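/*
 * A minimal sketch of how these states are consulted in practice - compare
 * __block_prepare_write() and block_read_full_page() below.  An "unknown"
 * buffer needs get_block(), an "allocated" buffer needs a read before its
 * contents can be trusted, and "hole"/"valid" buffers can be used as-is:
 *
 *	if (!buffer_mapped(bh) && !buffer_uptodate(bh))
 *		err = get_block(inode, block, bh, create);
 *	else if (buffer_mapped(bh) && !buffer_uptodate(bh))
 *		ll_rw_block(READ, 1, &bh);
 */
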
1730/*
1731 * While block_write_full_page is writing back the dirty buffers under
1732 * the page lock, whoever dirtied the buffers may decide to clean them
1733 * again at any time. We handle that by only looking at the buffer
1734 * state inside lock_buffer().
1735 *
1736 * If block_write_full_page() is called for regular writeback
1737 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 1738 * locked buffer. This can only happen if someone has written the buffer
1739 * directly, with submit_bh(). At the address_space level PageWriteback
1740 * prevents this contention from occurring.
1741 */
1742static int __block_write_full_page(struct inode *inode, struct page *page,
1743 get_block_t *get_block, struct writeback_control *wbc)
1744{
1745 int err;
1746 sector_t block;
1747 sector_t last_block;
Andrew Mortonf0fbd5f2005-05-05 16:15:48 -07001748 struct buffer_head *bh, *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001749 int nr_underway = 0;
1750
1751 BUG_ON(!PageLocked(page));
1752
1753 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1754
1755 if (!page_has_buffers(page)) {
1756 create_empty_buffers(page, 1 << inode->i_blkbits,
1757 (1 << BH_Dirty)|(1 << BH_Uptodate));
1758 }
1759
1760 /*
1761 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1762 * here, and the (potentially unmapped) buffers may become dirty at
1763 * any time. If a buffer becomes dirty here after we've inspected it
1764 * then we just miss that fact, and the page stays dirty.
1765 *
1766 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1767 * handle that here by just cleaning them.
1768 */
1769
Andrew Morton54b21a72006-01-08 01:03:05 -08001770 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001771 head = page_buffers(page);
1772 bh = head;
1773
1774 /*
1775 * Get all the dirty buffers mapped to disk addresses and
1776 * handle any aliases from the underlying blockdev's mapping.
1777 */
1778 do {
1779 if (block > last_block) {
1780 /*
1781 * mapped buffers outside i_size will occur, because
1782 * this page can be outside i_size when there is a
1783 * truncate in progress.
1784 */
1785 /*
1786 * The buffer was zeroed by block_write_full_page()
1787 */
1788 clear_buffer_dirty(bh);
1789 set_buffer_uptodate(bh);
1790 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1791 err = get_block(inode, block, bh, 1);
1792 if (err)
1793 goto recover;
1794 if (buffer_new(bh)) {
1795 /* blockdev mappings never come here */
1796 clear_buffer_new(bh);
1797 unmap_underlying_metadata(bh->b_bdev,
1798 bh->b_blocknr);
1799 }
1800 }
1801 bh = bh->b_this_page;
1802 block++;
1803 } while (bh != head);
1804
1805 do {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001806 if (!buffer_mapped(bh))
1807 continue;
1808 /*
1809 * If it's a fully non-blocking write attempt and we cannot
1810 * lock the buffer then redirty the page. Note that this can
1811 * potentially cause a busy-wait loop from pdflush and kswapd
1812 * activity, but those code paths have their own higher-level
1813 * throttling.
1814 */
1815 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1816 lock_buffer(bh);
1817 } else if (test_set_buffer_locked(bh)) {
1818 redirty_page_for_writepage(wbc, page);
1819 continue;
1820 }
1821 if (test_clear_buffer_dirty(bh)) {
1822 mark_buffer_async_write(bh);
1823 } else {
1824 unlock_buffer(bh);
1825 }
1826 } while ((bh = bh->b_this_page) != head);
1827
1828 /*
1829 * The page and its buffers are protected by PageWriteback(), so we can
1830 * drop the bh refcounts early.
1831 */
1832 BUG_ON(PageWriteback(page));
1833 set_page_writeback(page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001834
1835 do {
1836 struct buffer_head *next = bh->b_this_page;
1837 if (buffer_async_write(bh)) {
1838 submit_bh(WRITE, bh);
1839 nr_underway++;
1840 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001841 bh = next;
1842 } while (bh != head);
Andrew Morton05937ba2005-05-05 16:15:47 -07001843 unlock_page(page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001844
1845 err = 0;
1846done:
1847 if (nr_underway == 0) {
1848 /*
1849 * The page was marked dirty, but the buffers were
1850 * clean. Someone wrote them back by hand with
1851 * ll_rw_block/submit_bh. A rare case.
1852 */
1853 int uptodate = 1;
1854 do {
1855 if (!buffer_uptodate(bh)) {
1856 uptodate = 0;
1857 break;
1858 }
1859 bh = bh->b_this_page;
1860 } while (bh != head);
1861 if (uptodate)
1862 SetPageUptodate(page);
1863 end_page_writeback(page);
1864 /*
1865 * The page and buffer_heads can be released at any time from
1866 * here on.
1867 */
1868 wbc->pages_skipped++; /* We didn't write this page */
1869 }
1870 return err;
1871
1872recover:
1873 /*
1874 * ENOSPC, or some other error. We may already have added some
1875 * blocks to the file, so we need to write these out to avoid
1876 * exposing stale data.
1877 * The page is currently locked and not marked for writeback
1878 */
1879 bh = head;
1880 /* Recovery: lock and submit the mapped buffers */
1881 do {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001882 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1883 lock_buffer(bh);
1884 mark_buffer_async_write(bh);
1885 } else {
1886 /*
1887 * The buffer may have been set dirty during
1888 * attachment to a dirty page.
1889 */
1890 clear_buffer_dirty(bh);
1891 }
1892 } while ((bh = bh->b_this_page) != head);
1893 SetPageError(page);
1894 BUG_ON(PageWriteback(page));
1895 set_page_writeback(page);
1896 unlock_page(page);
1897 do {
1898 struct buffer_head *next = bh->b_this_page;
1899 if (buffer_async_write(bh)) {
1900 clear_buffer_dirty(bh);
1901 submit_bh(WRITE, bh);
1902 nr_underway++;
1903 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001904 bh = next;
1905 } while (bh != head);
1906 goto done;
1907}
1908
1909static int __block_prepare_write(struct inode *inode, struct page *page,
1910 unsigned from, unsigned to, get_block_t *get_block)
1911{
1912 unsigned block_start, block_end;
1913 sector_t block;
1914 int err = 0;
1915 unsigned blocksize, bbits;
1916 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1917
1918 BUG_ON(!PageLocked(page));
1919 BUG_ON(from > PAGE_CACHE_SIZE);
1920 BUG_ON(to > PAGE_CACHE_SIZE);
1921 BUG_ON(from > to);
1922
1923 blocksize = 1 << inode->i_blkbits;
1924 if (!page_has_buffers(page))
1925 create_empty_buffers(page, blocksize, 0);
1926 head = page_buffers(page);
1927
1928 bbits = inode->i_blkbits;
1929 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1930
1931 for(bh = head, block_start = 0; bh != head || !block_start;
1932 block++, block_start=block_end, bh = bh->b_this_page) {
1933 block_end = block_start + blocksize;
1934 if (block_end <= from || block_start >= to) {
1935 if (PageUptodate(page)) {
1936 if (!buffer_uptodate(bh))
1937 set_buffer_uptodate(bh);
1938 }
1939 continue;
1940 }
1941 if (buffer_new(bh))
1942 clear_buffer_new(bh);
1943 if (!buffer_mapped(bh)) {
1944 err = get_block(inode, block, bh, 1);
1945 if (err)
Nick Pigginf3ddbdc2005-05-05 16:15:45 -07001946 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001947 if (buffer_new(bh)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001948 unmap_underlying_metadata(bh->b_bdev,
1949 bh->b_blocknr);
1950 if (PageUptodate(page)) {
1951 set_buffer_uptodate(bh);
1952 continue;
1953 }
1954 if (block_end > to || block_start < from) {
1955 void *kaddr;
1956
1957 kaddr = kmap_atomic(page, KM_USER0);
1958 if (block_end > to)
1959 memset(kaddr+to, 0,
1960 block_end-to);
1961 if (block_start < from)
1962 memset(kaddr+block_start,
1963 0, from-block_start);
1964 flush_dcache_page(page);
1965 kunmap_atomic(kaddr, KM_USER0);
1966 }
1967 continue;
1968 }
1969 }
1970 if (PageUptodate(page)) {
1971 if (!buffer_uptodate(bh))
1972 set_buffer_uptodate(bh);
1973 continue;
1974 }
1975 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1976 (block_start < from || block_end > to)) {
1977 ll_rw_block(READ, 1, &bh);
1978 *wait_bh++=bh;
1979 }
1980 }
1981 /*
1982 * If we issued read requests - let them complete.
1983 */
1984 while(wait_bh > wait) {
1985 wait_on_buffer(*--wait_bh);
1986 if (!buffer_uptodate(*wait_bh))
Nick Pigginf3ddbdc2005-05-05 16:15:45 -07001987 err = -EIO;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001988 }
Anton Altaparmakov152becd2005-06-23 00:10:21 -07001989 if (!err) {
1990 bh = head;
1991 do {
1992 if (buffer_new(bh))
1993 clear_buffer_new(bh);
1994 } while ((bh = bh->b_this_page) != head);
1995 return 0;
1996 }
Nick Pigginf3ddbdc2005-05-05 16:15:45 -07001997 /* Error case: */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998 /*
1999 * Zero out any newly allocated blocks to avoid exposing stale
2000 * data. If BH_New is set, we know that the block was newly
2001 * allocated in the above loop.
2002 */
2003 bh = head;
2004 block_start = 0;
2005 do {
2006 block_end = block_start+blocksize;
2007 if (block_end <= from)
2008 goto next_bh;
2009 if (block_start >= to)
2010 break;
2011 if (buffer_new(bh)) {
2012 void *kaddr;
2013
2014 clear_buffer_new(bh);
2015 kaddr = kmap_atomic(page, KM_USER0);
2016 memset(kaddr+block_start, 0, bh->b_size);
2017 kunmap_atomic(kaddr, KM_USER0);
2018 set_buffer_uptodate(bh);
2019 mark_buffer_dirty(bh);
2020 }
2021next_bh:
2022 block_start = block_end;
2023 bh = bh->b_this_page;
2024 } while (bh != head);
2025 return err;
2026}
2027
2028static int __block_commit_write(struct inode *inode, struct page *page,
2029 unsigned from, unsigned to)
2030{
2031 unsigned block_start, block_end;
2032 int partial = 0;
2033 unsigned blocksize;
2034 struct buffer_head *bh, *head;
2035
2036 blocksize = 1 << inode->i_blkbits;
2037
2038 for(bh = head = page_buffers(page), block_start = 0;
2039 bh != head || !block_start;
2040 block_start=block_end, bh = bh->b_this_page) {
2041 block_end = block_start + blocksize;
2042 if (block_end <= from || block_start >= to) {
2043 if (!buffer_uptodate(bh))
2044 partial = 1;
2045 } else {
2046 set_buffer_uptodate(bh);
2047 mark_buffer_dirty(bh);
2048 }
2049 }
2050
2051 /*
2052 * If this is a partial write which happened to make all buffers
2053 * uptodate then we can optimize away a bogus readpage() for
2054 * the next read(). Here we 'discover' whether the page went
2055 * uptodate as a result of this (potentially partial) write.
2056 */
2057 if (!partial)
2058 SetPageUptodate(page);
2059 return 0;
2060}
2061
2062/*
2063 * Generic "read page" function for block devices that have the normal
2064 * get_block functionality. This is most of the block device filesystems.
2065 * Reads the page asynchronously --- the unlock_buffer() and
2066 * set/clear_buffer_uptodate() functions propagate buffer state into the
2067 * page struct once IO has completed.
2068 */
2069int block_read_full_page(struct page *page, get_block_t *get_block)
2070{
2071 struct inode *inode = page->mapping->host;
2072 sector_t iblock, lblock;
2073 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2074 unsigned int blocksize;
2075 int nr, i;
2076 int fully_mapped = 1;
2077
Matt Mackallcd7619d2005-05-01 08:59:01 -07002078 BUG_ON(!PageLocked(page));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079 blocksize = 1 << inode->i_blkbits;
2080 if (!page_has_buffers(page))
2081 create_empty_buffers(page, blocksize, 0);
2082 head = page_buffers(page);
2083
2084 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2085 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2086 bh = head;
2087 nr = 0;
2088 i = 0;
2089
2090 do {
2091 if (buffer_uptodate(bh))
2092 continue;
2093
2094 if (!buffer_mapped(bh)) {
Andrew Mortonc64610b2005-05-16 21:53:49 -07002095 int err = 0;
2096
Linus Torvalds1da177e2005-04-16 15:20:36 -07002097 fully_mapped = 0;
2098 if (iblock < lblock) {
Andrew Mortonc64610b2005-05-16 21:53:49 -07002099 err = get_block(inode, iblock, bh, 0);
2100 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002101 SetPageError(page);
2102 }
2103 if (!buffer_mapped(bh)) {
2104 void *kaddr = kmap_atomic(page, KM_USER0);
2105 memset(kaddr + i * blocksize, 0, blocksize);
2106 flush_dcache_page(page);
2107 kunmap_atomic(kaddr, KM_USER0);
Andrew Mortonc64610b2005-05-16 21:53:49 -07002108 if (!err)
2109 set_buffer_uptodate(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002110 continue;
2111 }
2112 /*
2113 * get_block() might have updated the buffer
2114 * synchronously
2115 */
2116 if (buffer_uptodate(bh))
2117 continue;
2118 }
2119 arr[nr++] = bh;
2120 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2121
2122 if (fully_mapped)
2123 SetPageMappedToDisk(page);
2124
2125 if (!nr) {
2126 /*
2127 * All buffers are uptodate - we can set the page uptodate
2128 * as well. But not if get_block() returned an error.
2129 */
2130 if (!PageError(page))
2131 SetPageUptodate(page);
2132 unlock_page(page);
2133 return 0;
2134 }
2135
2136 /* Stage two: lock the buffers */
2137 for (i = 0; i < nr; i++) {
2138 bh = arr[i];
2139 lock_buffer(bh);
2140 mark_buffer_async_read(bh);
2141 }
2142
2143 /*
2144 * Stage 3: start the IO. Check for uptodateness
2145 * inside the buffer lock in case another process reading
2146 * the underlying blockdev brought it uptodate (the sct fix).
2147 */
2148 for (i = 0; i < nr; i++) {
2149 bh = arr[i];
2150 if (buffer_uptodate(bh))
2151 end_buffer_async_read(bh, 1);
2152 else
2153 submit_bh(READ, bh);
2154 }
2155 return 0;
2156}
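
/*
 * Illustrative sketch: a filesystem normally uses this as its ->readpage
 * address_space operation; "myfs_get_block" stands in for that
 * filesystem's own get_block_t:
 *
 *	static int myfs_readpage(struct file *file, struct page *page)
 *	{
 *		return block_read_full_page(page, myfs_get_block);
 *	}
 */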
2157
2158/* utility function for filesystems that need to do work on expanding
2159 * truncates. Uses prepare/commit_write to allow the filesystem to
2160 * deal with the hole.
2161 */
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002162static int __generic_cont_expand(struct inode *inode, loff_t size,
2163 pgoff_t index, unsigned int offset)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002164{
2165 struct address_space *mapping = inode->i_mapping;
2166 struct page *page;
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002167 unsigned long limit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002168 int err;
2169
2170 err = -EFBIG;
2171 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2172 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2173 send_sig(SIGXFSZ, current, 0);
2174 goto out;
2175 }
2176 if (size > inode->i_sb->s_maxbytes)
2177 goto out;
2178
Linus Torvalds1da177e2005-04-16 15:20:36 -07002179 err = -ENOMEM;
2180 page = grab_cache_page(mapping, index);
2181 if (!page)
2182 goto out;
2183 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002184 if (err) {
2185 /*
2186 * ->prepare_write() may have instantiated a few blocks
2187 * outside i_size. Trim these off again.
2188 */
2189 unlock_page(page);
2190 page_cache_release(page);
2191 vmtruncate(inode, inode->i_size);
2192 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002193 }
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002194
2195 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2196
Linus Torvalds1da177e2005-04-16 15:20:36 -07002197 unlock_page(page);
2198 page_cache_release(page);
2199 if (err > 0)
2200 err = 0;
2201out:
2202 return err;
2203}
2204
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002205int generic_cont_expand(struct inode *inode, loff_t size)
2206{
2207 pgoff_t index;
2208 unsigned int offset;
2209
2210 offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2211
2212 /* ugh. in prepare/commit_write, if from==to==start of block, we
2213 ** skip the prepare. make sure we never send an offset for the start
2214 ** of a block
2215 */
2216 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2217 /* caller must handle this extra byte. */
2218 offset++;
2219 }
2220 index = size >> PAGE_CACHE_SHIFT;
2221
2222 return __generic_cont_expand(inode, size, index, offset);
2223}
2224
2225int generic_cont_expand_simple(struct inode *inode, loff_t size)
2226{
2227 loff_t pos = size - 1;
2228 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2229 unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2230
2231 /* prepare/commit_write can handle even if from==to==start of block. */
2232 return __generic_cont_expand(inode, size, index, offset);
2233}
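
/*
 * Illustrative sketch: a typical caller extends a file from its setattr
 * path before writing any new data, e.g.:
 *
 *	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
 *		err = generic_cont_expand_simple(inode, attr->ia_size);
 *
 * ("attr" here refers to the caller's struct iattr, not to anything
 * defined in this file.)
 */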
2234
Linus Torvalds1da177e2005-04-16 15:20:36 -07002235/*
 2236 * For moronic filesystems that do not allow holes in files.
2237 * We may have to extend the file.
2238 */
2239
2240int cont_prepare_write(struct page *page, unsigned offset,
2241 unsigned to, get_block_t *get_block, loff_t *bytes)
2242{
2243 struct address_space *mapping = page->mapping;
2244 struct inode *inode = mapping->host;
2245 struct page *new_page;
2246 pgoff_t pgpos;
2247 long status;
2248 unsigned zerofrom;
2249 unsigned blocksize = 1 << inode->i_blkbits;
2250 void *kaddr;
2251
2252 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2253 status = -ENOMEM;
2254 new_page = grab_cache_page(mapping, pgpos);
2255 if (!new_page)
2256 goto out;
2257 /* we might sleep */
2258 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2259 unlock_page(new_page);
2260 page_cache_release(new_page);
2261 continue;
2262 }
2263 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2264 if (zerofrom & (blocksize-1)) {
2265 *bytes |= (blocksize-1);
2266 (*bytes)++;
2267 }
2268 status = __block_prepare_write(inode, new_page, zerofrom,
2269 PAGE_CACHE_SIZE, get_block);
2270 if (status)
2271 goto out_unmap;
2272 kaddr = kmap_atomic(new_page, KM_USER0);
2273 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2274 flush_dcache_page(new_page);
2275 kunmap_atomic(kaddr, KM_USER0);
2276 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2277 unlock_page(new_page);
2278 page_cache_release(new_page);
2279 }
2280
2281 if (page->index < pgpos) {
2282 /* completely inside the area */
2283 zerofrom = offset;
2284 } else {
2285 /* page covers the boundary, find the boundary offset */
2286 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2287
2288 /* if we will expand the thing last block will be filled */
2289 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2290 *bytes |= (blocksize-1);
2291 (*bytes)++;
2292 }
2293
2294 /* starting below the boundary? Nothing to zero out */
2295 if (offset <= zerofrom)
2296 zerofrom = offset;
2297 }
2298 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2299 if (status)
2300 goto out1;
2301 if (zerofrom < offset) {
2302 kaddr = kmap_atomic(page, KM_USER0);
2303 memset(kaddr+zerofrom, 0, offset-zerofrom);
2304 flush_dcache_page(page);
2305 kunmap_atomic(kaddr, KM_USER0);
2306 __block_commit_write(inode, page, zerofrom, offset);
2307 }
2308 return 0;
2309out1:
2310 ClearPageUptodate(page);
2311 return status;
2312
2313out_unmap:
2314 ClearPageUptodate(new_page);
2315 unlock_page(new_page);
2316 page_cache_release(new_page);
2317out:
2318 return status;
2319}
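
/*
 * Illustrative sketch: a no-holes filesystem typically wraps this as its
 * ->prepare_write, passing a per-inode count of initialised bytes (the
 * myfs_* names and the "mmu_private" field are hypothetical):
 *
 *	static int myfs_prepare_write(struct file *file, struct page *page,
 *				      unsigned from, unsigned to)
 *	{
 *		struct inode *inode = page->mapping->host;
 *		return cont_prepare_write(page, from, to, myfs_get_block,
 *					  &myfs_i(inode)->mmu_private);
 *	}
 */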
2320
2321int block_prepare_write(struct page *page, unsigned from, unsigned to,
2322 get_block_t *get_block)
2323{
2324 struct inode *inode = page->mapping->host;
2325 int err = __block_prepare_write(inode, page, from, to, get_block);
2326 if (err)
2327 ClearPageUptodate(page);
2328 return err;
2329}
2330
2331int block_commit_write(struct page *page, unsigned from, unsigned to)
2332{
2333 struct inode *inode = page->mapping->host;
2334 __block_commit_write(inode,page,from,to);
2335 return 0;
2336}
2337
2338int generic_commit_write(struct file *file, struct page *page,
2339 unsigned from, unsigned to)
2340{
2341 struct inode *inode = page->mapping->host;
2342 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2343 __block_commit_write(inode,page,from,to);
2344 /*
2345 * No need to use i_size_read() here, the i_size
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08002346 * cannot change under us because we hold i_mutex.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002347 */
2348 if (pos > inode->i_size) {
2349 i_size_write(inode, pos);
2350 mark_inode_dirty(inode);
2351 }
2352 return 0;
2353}
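
/*
 * Illustrative sketch: block_prepare_write() and generic_commit_write()
 * are usually wired up together in a filesystem's address_space
 * operations (hypothetical myfs names):
 *
 *	static struct address_space_operations myfs_aops = {
 *		.readpage	= myfs_readpage,
 *		.writepage	= myfs_writepage,
 *		.prepare_write	= myfs_prepare_write,
 *		.commit_write	= generic_commit_write,
 *	};
 */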
2354
2355
2356/*
2357 * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2358 * immediately, while under the page lock. So it needs a special end_io
2359 * handler which does not touch the bh after unlocking it.
2360 *
2361 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 2362 * a race there is benign: unlock_buffer() only uses the bh's address for
2363 * hashing after unlocking the buffer, so it doesn't actually touch the bh
2364 * itself.
2365 */
2366static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2367{
2368 if (uptodate) {
2369 set_buffer_uptodate(bh);
2370 } else {
2371 /* This happens, due to failed READA attempts. */
2372 clear_buffer_uptodate(bh);
2373 }
2374 unlock_buffer(bh);
2375}
2376
2377/*
2378 * On entry, the page is fully not uptodate.
2379 * On exit the page is fully uptodate in the areas outside (from,to)
2380 */
2381int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2382 get_block_t *get_block)
2383{
2384 struct inode *inode = page->mapping->host;
2385 const unsigned blkbits = inode->i_blkbits;
2386 const unsigned blocksize = 1 << blkbits;
2387 struct buffer_head map_bh;
2388 struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2389 unsigned block_in_page;
2390 unsigned block_start;
2391 sector_t block_in_file;
2392 char *kaddr;
2393 int nr_reads = 0;
2394 int i;
2395 int ret = 0;
2396 int is_mapped_to_disk = 1;
2397 int dirtied_it = 0;
2398
2399 if (PageMappedToDisk(page))
2400 return 0;
2401
2402 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2403 map_bh.b_page = page;
2404
2405 /*
2406 * We loop across all blocks in the page, whether or not they are
2407 * part of the affected region. This is so we can discover if the
2408 * page is fully mapped-to-disk.
2409 */
2410 for (block_start = 0, block_in_page = 0;
2411 block_start < PAGE_CACHE_SIZE;
2412 block_in_page++, block_start += blocksize) {
2413 unsigned block_end = block_start + blocksize;
2414 int create;
2415
2416 map_bh.b_state = 0;
2417 create = 1;
2418 if (block_start >= to)
2419 create = 0;
2420 ret = get_block(inode, block_in_file + block_in_page,
2421 &map_bh, create);
2422 if (ret)
2423 goto failed;
2424 if (!buffer_mapped(&map_bh))
2425 is_mapped_to_disk = 0;
2426 if (buffer_new(&map_bh))
2427 unmap_underlying_metadata(map_bh.b_bdev,
2428 map_bh.b_blocknr);
2429 if (PageUptodate(page))
2430 continue;
2431 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2432 kaddr = kmap_atomic(page, KM_USER0);
2433 if (block_start < from) {
2434 memset(kaddr+block_start, 0, from-block_start);
2435 dirtied_it = 1;
2436 }
2437 if (block_end > to) {
2438 memset(kaddr + to, 0, block_end - to);
2439 dirtied_it = 1;
2440 }
2441 flush_dcache_page(page);
2442 kunmap_atomic(kaddr, KM_USER0);
2443 continue;
2444 }
2445 if (buffer_uptodate(&map_bh))
2446 continue; /* reiserfs does this */
2447 if (block_start < from || block_end > to) {
2448 struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2449
2450 if (!bh) {
2451 ret = -ENOMEM;
2452 goto failed;
2453 }
2454 bh->b_state = map_bh.b_state;
2455 atomic_set(&bh->b_count, 0);
2456 bh->b_this_page = NULL;
2457 bh->b_page = page;
2458 bh->b_blocknr = map_bh.b_blocknr;
2459 bh->b_size = blocksize;
2460 bh->b_data = (char *)(long)block_start;
2461 bh->b_bdev = map_bh.b_bdev;
2462 bh->b_private = NULL;
2463 read_bh[nr_reads++] = bh;
2464 }
2465 }
2466
2467 if (nr_reads) {
2468 struct buffer_head *bh;
2469
2470 /*
2471 * The page is locked, so these buffers are protected from
2472 * any VM or truncate activity. Hence we don't need to care
2473 * for the buffer_head refcounts.
2474 */
2475 for (i = 0; i < nr_reads; i++) {
2476 bh = read_bh[i];
2477 lock_buffer(bh);
2478 bh->b_end_io = end_buffer_read_nobh;
2479 submit_bh(READ, bh);
2480 }
2481 for (i = 0; i < nr_reads; i++) {
2482 bh = read_bh[i];
2483 wait_on_buffer(bh);
2484 if (!buffer_uptodate(bh))
2485 ret = -EIO;
2486 free_buffer_head(bh);
2487 read_bh[i] = NULL;
2488 }
2489 if (ret)
2490 goto failed;
2491 }
2492
2493 if (is_mapped_to_disk)
2494 SetPageMappedToDisk(page);
2495 SetPageUptodate(page);
2496
2497 /*
2498 * Setting the page dirty here isn't necessary for the prepare_write
2499 * function - commit_write will do that. But if/when this function is
2500 * used within the pagefault handler to ensure that all mmapped pages
2501 * have backing space in the filesystem, we will need to dirty the page
2502 * if its contents were altered.
2503 */
2504 if (dirtied_it)
2505 set_page_dirty(page);
2506
2507 return 0;
2508
2509failed:
2510 for (i = 0; i < nr_reads; i++) {
2511 if (read_bh[i])
2512 free_buffer_head(read_bh[i]);
2513 }
2514
2515 /*
2516 * Error recovery is pretty slack. Clear the page and mark it dirty
2517 * so we'll later zero out any blocks which _were_ allocated.
2518 */
2519 kaddr = kmap_atomic(page, KM_USER0);
2520 memset(kaddr, 0, PAGE_CACHE_SIZE);
2521 kunmap_atomic(kaddr, KM_USER0);
2522 SetPageUptodate(page);
2523 set_page_dirty(page);
2524 return ret;
2525}
2526EXPORT_SYMBOL(nobh_prepare_write);
2527
2528int nobh_commit_write(struct file *file, struct page *page,
2529 unsigned from, unsigned to)
2530{
2531 struct inode *inode = page->mapping->host;
2532 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2533
2534 set_page_dirty(page);
2535 if (pos > inode->i_size) {
2536 i_size_write(inode, pos);
2537 mark_inode_dirty(inode);
2538 }
2539 return 0;
2540}
2541EXPORT_SYMBOL(nobh_commit_write);
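
/*
 * Illustrative sketch: a filesystem that wants pages without attached
 * buffer_heads uses the nobh_* variants together (hypothetical myfs names):
 *
 *	.prepare_write	= myfs_nobh_prepare_write,  (calls nobh_prepare_write())
 *	.commit_write	= nobh_commit_write,
 *	.writepage	= myfs_nobh_writepage,      (calls nobh_writepage())
 *
 * and zeroes the final partial block with nobh_truncate_page().
 */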
2542
2543/*
 2544 * nobh_writepage() - based on block_write_full_page() except
2545 * that it tries to operate without attaching bufferheads to
2546 * the page.
2547 */
2548int nobh_writepage(struct page *page, get_block_t *get_block,
2549 struct writeback_control *wbc)
2550{
2551 struct inode * const inode = page->mapping->host;
2552 loff_t i_size = i_size_read(inode);
2553 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2554 unsigned offset;
2555 void *kaddr;
2556 int ret;
2557
2558 /* Is the page fully inside i_size? */
2559 if (page->index < end_index)
2560 goto out;
2561
2562 /* Is the page fully outside i_size? (truncate in progress) */
2563 offset = i_size & (PAGE_CACHE_SIZE-1);
2564 if (page->index >= end_index+1 || !offset) {
2565 /*
2566 * The page may have dirty, unmapped buffers. For example,
2567 * they may have been added in ext3_writepage(). Make them
2568 * freeable here, so the page does not leak.
2569 */
2570#if 0
2571 /* Not really sure about this - do we need this ? */
2572 if (page->mapping->a_ops->invalidatepage)
2573 page->mapping->a_ops->invalidatepage(page, offset);
2574#endif
2575 unlock_page(page);
2576 return 0; /* don't care */
2577 }
2578
2579 /*
2580 * The page straddles i_size. It must be zeroed out on each and every
2581 * writepage invocation because it may be mmapped. "A file is mapped
2582 * in multiples of the page size. For a file that is not a multiple of
2583 * the page size, the remaining memory is zeroed when mapped, and
2584 * writes to that region are not written out to the file."
2585 */
2586 kaddr = kmap_atomic(page, KM_USER0);
2587 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2588 flush_dcache_page(page);
2589 kunmap_atomic(kaddr, KM_USER0);
2590out:
2591 ret = mpage_writepage(page, get_block, wbc);
2592 if (ret == -EAGAIN)
2593 ret = __block_write_full_page(inode, page, get_block, wbc);
2594 return ret;
2595}
2596EXPORT_SYMBOL(nobh_writepage);
2597
2598/*
2599 * This function assumes that ->prepare_write() uses nobh_prepare_write().
2600 */
2601int nobh_truncate_page(struct address_space *mapping, loff_t from)
2602{
2603 struct inode *inode = mapping->host;
2604 unsigned blocksize = 1 << inode->i_blkbits;
2605 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2606 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2607 unsigned to;
2608 struct page *page;
2609 struct address_space_operations *a_ops = mapping->a_ops;
2610 char *kaddr;
2611 int ret = 0;
2612
2613 if ((offset & (blocksize - 1)) == 0)
2614 goto out;
2615
2616 ret = -ENOMEM;
2617 page = grab_cache_page(mapping, index);
2618 if (!page)
2619 goto out;
2620
2621 to = (offset + blocksize) & ~(blocksize - 1);
2622 ret = a_ops->prepare_write(NULL, page, offset, to);
2623 if (ret == 0) {
2624 kaddr = kmap_atomic(page, KM_USER0);
2625 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2626 flush_dcache_page(page);
2627 kunmap_atomic(kaddr, KM_USER0);
2628 set_page_dirty(page);
2629 }
2630 unlock_page(page);
2631 page_cache_release(page);
2632out:
2633 return ret;
2634}
2635EXPORT_SYMBOL(nobh_truncate_page);
2636
2637int block_truncate_page(struct address_space *mapping,
2638 loff_t from, get_block_t *get_block)
2639{
2640 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2641 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2642 unsigned blocksize;
Andrew Morton54b21a72006-01-08 01:03:05 -08002643 sector_t iblock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002644 unsigned length, pos;
2645 struct inode *inode = mapping->host;
2646 struct page *page;
2647 struct buffer_head *bh;
2648 void *kaddr;
2649 int err;
2650
2651 blocksize = 1 << inode->i_blkbits;
2652 length = offset & (blocksize - 1);
2653
2654 /* Block boundary? Nothing to do */
2655 if (!length)
2656 return 0;
2657
2658 length = blocksize - length;
Andrew Morton54b21a72006-01-08 01:03:05 -08002659 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002660
2661 page = grab_cache_page(mapping, index);
2662 err = -ENOMEM;
2663 if (!page)
2664 goto out;
2665
2666 if (!page_has_buffers(page))
2667 create_empty_buffers(page, blocksize, 0);
2668
2669 /* Find the buffer that contains "offset" */
2670 bh = page_buffers(page);
2671 pos = blocksize;
2672 while (offset >= pos) {
2673 bh = bh->b_this_page;
2674 iblock++;
2675 pos += blocksize;
2676 }
2677
2678 err = 0;
2679 if (!buffer_mapped(bh)) {
2680 err = get_block(inode, iblock, bh, 0);
2681 if (err)
2682 goto unlock;
2683 /* unmapped? It's a hole - nothing to do */
2684 if (!buffer_mapped(bh))
2685 goto unlock;
2686 }
2687
2688 /* Ok, it's mapped. Make sure it's up-to-date */
2689 if (PageUptodate(page))
2690 set_buffer_uptodate(bh);
2691
2692 if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2693 err = -EIO;
2694 ll_rw_block(READ, 1, &bh);
2695 wait_on_buffer(bh);
2696 /* Uhhuh. Read error. Complain and punt. */
2697 if (!buffer_uptodate(bh))
2698 goto unlock;
2699 }
2700
2701 kaddr = kmap_atomic(page, KM_USER0);
2702 memset(kaddr + offset, 0, length);
2703 flush_dcache_page(page);
2704 kunmap_atomic(kaddr, KM_USER0);
2705
2706 mark_buffer_dirty(bh);
2707 err = 0;
2708
2709unlock:
2710 unlock_page(page);
2711 page_cache_release(page);
2712out:
2713 return err;
2714}
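
/*
 * Illustrative sketch: ->truncate implementations zero the partial block
 * past the new EOF with this helper before trimming the block mapping
 * (hypothetical myfs names):
 *
 *	block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);
 *	myfs_free_blocks_beyond(inode, inode->i_size);
 */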
2715
2716/*
2717 * The generic ->writepage function for buffer-backed address_spaces
2718 */
2719int block_write_full_page(struct page *page, get_block_t *get_block,
2720 struct writeback_control *wbc)
2721{
2722 struct inode * const inode = page->mapping->host;
2723 loff_t i_size = i_size_read(inode);
2724 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2725 unsigned offset;
2726 void *kaddr;
2727
2728 /* Is the page fully inside i_size? */
2729 if (page->index < end_index)
2730 return __block_write_full_page(inode, page, get_block, wbc);
2731
2732 /* Is the page fully outside i_size? (truncate in progress) */
2733 offset = i_size & (PAGE_CACHE_SIZE-1);
2734 if (page->index >= end_index+1 || !offset) {
2735 /*
2736 * The page may have dirty, unmapped buffers. For example,
2737 * they may have been added in ext3_writepage(). Make them
2738 * freeable here, so the page does not leak.
2739 */
Jan Karaaaa40592005-10-30 15:00:16 -08002740 do_invalidatepage(page, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002741 unlock_page(page);
2742 return 0; /* don't care */
2743 }
2744
2745 /*
2746 * The page straddles i_size. It must be zeroed out on each and every
 2747 * writepage invocation because it may be mmapped. "A file is mapped
2748 * in multiples of the page size. For a file that is not a multiple of
2749 * the page size, the remaining memory is zeroed when mapped, and
2750 * writes to that region are not written out to the file."
2751 */
2752 kaddr = kmap_atomic(page, KM_USER0);
2753 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2754 flush_dcache_page(page);
2755 kunmap_atomic(kaddr, KM_USER0);
2756 return __block_write_full_page(inode, page, get_block, wbc);
2757}
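
/*
 * Illustrative sketch: the usual ->writepage for a buffer-backed
 * filesystem is a thin wrapper around this (hypothetical myfs names):
 *
 *	static int myfs_writepage(struct page *page,
 *				  struct writeback_control *wbc)
 *	{
 *		return block_write_full_page(page, myfs_get_block, wbc);
 *	}
 */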
2758
2759sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2760 get_block_t *get_block)
2761{
2762 struct buffer_head tmp;
2763 struct inode *inode = mapping->host;
2764 tmp.b_state = 0;
2765 tmp.b_blocknr = 0;
2766 get_block(inode, block, &tmp, 0);
2767 return tmp.b_blocknr;
2768}
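
/*
 * Illustrative sketch: the matching ->bmap address_space operation is
 * normally just (hypothetical myfs names):
 *
 *	static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
 *	{
 *		return generic_block_bmap(mapping, block, myfs_get_block);
 *	}
 */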
2769
2770static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2771{
2772 struct buffer_head *bh = bio->bi_private;
2773
2774 if (bio->bi_size)
2775 return 1;
2776
2777 if (err == -EOPNOTSUPP) {
2778 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2779 set_bit(BH_Eopnotsupp, &bh->b_state);
2780 }
2781
2782 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2783 bio_put(bio);
2784 return 0;
2785}
2786
2787int submit_bh(int rw, struct buffer_head * bh)
2788{
2789 struct bio *bio;
2790 int ret = 0;
2791
2792 BUG_ON(!buffer_locked(bh));
2793 BUG_ON(!buffer_mapped(bh));
2794 BUG_ON(!bh->b_end_io);
2795
2796 if (buffer_ordered(bh) && (rw == WRITE))
2797 rw = WRITE_BARRIER;
2798
2799 /*
2800 * Only clear out a write error when rewriting, should this
2801 * include WRITE_SYNC as well?
2802 */
2803 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2804 clear_buffer_write_io_error(bh);
2805
2806 /*
2807 * from here on down, it's all bio -- do the initial mapping,
2808 * submit_bio -> generic_make_request may further map this bio around
2809 */
2810 bio = bio_alloc(GFP_NOIO, 1);
2811
2812 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2813 bio->bi_bdev = bh->b_bdev;
2814 bio->bi_io_vec[0].bv_page = bh->b_page;
2815 bio->bi_io_vec[0].bv_len = bh->b_size;
2816 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2817
2818 bio->bi_vcnt = 1;
2819 bio->bi_idx = 0;
2820 bio->bi_size = bh->b_size;
2821
2822 bio->bi_end_io = end_bio_bh_io_sync;
2823 bio->bi_private = bh;
2824
2825 bio_get(bio);
2826 submit_bio(rw, bio);
2827
2828 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2829 ret = -EOPNOTSUPP;
2830
2831 bio_put(bio);
2832 return ret;
2833}
2834
2835/**
2836 * ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Karaa7662232005-09-06 15:19:10 -07002837 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002838 * @nr: number of &struct buffer_heads in the array
2839 * @bhs: array of pointers to &struct buffer_head
2840 *
Jan Karaa7662232005-09-06 15:19:10 -07002841 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2842 * requests an I/O operation on them, either a %READ or a %WRITE. The third
 2843 * option, %SWRITE, is like %WRITE except that we make sure the *current* data
 2844 * in the buffers is sent to disk. The fourth option, %READA, is described in
 2845 * the documentation for generic_make_request(), which ll_rw_block() calls.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002846 *
2847 * This function drops any buffer that it cannot get a lock on (with the
Jan Karaa7662232005-09-06 15:19:10 -07002848 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2849 * clean when doing a write request, and any buffer that appears to be
 2850 * up-to-date when doing a read request. Further, it marks as clean the buffers that
2851 * are processed for writing (the buffer cache won't assume that they are
2852 * actually clean until the buffer gets unlocked).
Linus Torvalds1da177e2005-04-16 15:20:36 -07002853 *
 2854 * ll_rw_block sets b_end_io to a simple completion handler that marks
 2855 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2856 * any waiters.
2857 *
2858 * All of the buffers must be for the same device, and must also be a
2859 * multiple of the current approved size for the device.
2860 */
2861void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2862{
2863 int i;
2864
2865 for (i = 0; i < nr; i++) {
2866 struct buffer_head *bh = bhs[i];
2867
Jan Karaa7662232005-09-06 15:19:10 -07002868 if (rw == SWRITE)
2869 lock_buffer(bh);
2870 else if (test_set_buffer_locked(bh))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002871 continue;
2872
Jan Karaa7662232005-09-06 15:19:10 -07002873 if (rw == WRITE || rw == SWRITE) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002874 if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org76c30732005-04-16 15:24:07 -07002875 bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumie60e5c52006-02-03 03:04:43 -08002876 get_bh(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002877 submit_bh(WRITE, bh);
2878 continue;
2879 }
2880 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002881 if (!buffer_uptodate(bh)) {
akpm@osdl.org76c30732005-04-16 15:24:07 -07002882 bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumie60e5c52006-02-03 03:04:43 -08002883 get_bh(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002884 submit_bh(rw, bh);
2885 continue;
2886 }
2887 }
2888 unlock_buffer(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002889 }
2890}
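
/*
 * Illustrative sketch of the classic (now discouraged) calling pattern:
 * start reads on a batch of buffers, then wait for each one and check
 * that it came back uptodate:
 *
 *	ll_rw_block(READ, nr, bhs);
 *	for (i = 0; i < nr; i++) {
 *		wait_on_buffer(bhs[i]);
 *		if (!buffer_uptodate(bhs[i]))
 *			err = -EIO;
 *	}
 */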
2891
2892/*
2893 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2894 * and then start new I/O and then wait upon it. The caller must have a ref on
2895 * the buffer_head.
2896 */
2897int sync_dirty_buffer(struct buffer_head *bh)
2898{
2899 int ret = 0;
2900
2901 WARN_ON(atomic_read(&bh->b_count) < 1);
2902 lock_buffer(bh);
2903 if (test_clear_buffer_dirty(bh)) {
2904 get_bh(bh);
2905 bh->b_end_io = end_buffer_write_sync;
2906 ret = submit_bh(WRITE, bh);
2907 wait_on_buffer(bh);
2908 if (buffer_eopnotsupp(bh)) {
2909 clear_buffer_eopnotsupp(bh);
2910 ret = -EOPNOTSUPP;
2911 }
2912 if (!ret && !buffer_uptodate(bh))
2913 ret = -EIO;
2914 } else {
2915 unlock_buffer(bh);
2916 }
2917 return ret;
2918}
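
/*
 * Illustrative sketch: a metadata update that must reach disk before the
 * caller proceeds typically looks like this (the caller already holds a
 * reference on bh, as required above):
 *
 *	memcpy(bh->b_data + offset, data, len);
 *	mark_buffer_dirty(bh);
 *	err = sync_dirty_buffer(bh);
 */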
2919
2920/*
2921 * try_to_free_buffers() checks if all the buffers on this particular page
2922 * are unused, and releases them if so.
2923 *
2924 * Exclusion against try_to_free_buffers may be obtained by either
2925 * locking the page or by holding its mapping's private_lock.
2926 *
2927 * If the page is dirty but all the buffers are clean then we need to
2928 * be sure to mark the page clean as well. This is because the page
2929 * may be against a block device, and a later reattachment of buffers
2930 * to a dirty page will set *all* buffers dirty. Which would corrupt
2931 * filesystem data on the same device.
2932 *
2933 * The same applies to regular filesystem pages: if all the buffers are
2934 * clean then we set the page clean and proceed. To do that, we require
2935 * total exclusion from __set_page_dirty_buffers(). That is obtained with
2936 * private_lock.
2937 *
2938 * try_to_free_buffers() is non-blocking.
2939 */
2940static inline int buffer_busy(struct buffer_head *bh)
2941{
2942 return atomic_read(&bh->b_count) |
2943 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2944}
2945
2946static int
2947drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2948{
2949 struct buffer_head *head = page_buffers(page);
2950 struct buffer_head *bh;
2951
2952 bh = head;
2953 do {
akpm@osdl.orgde7d5a32005-05-01 08:58:39 -07002954 if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002955 set_bit(AS_EIO, &page->mapping->flags);
2956 if (buffer_busy(bh))
2957 goto failed;
2958 bh = bh->b_this_page;
2959 } while (bh != head);
2960
2961 do {
2962 struct buffer_head *next = bh->b_this_page;
2963
2964 if (!list_empty(&bh->b_assoc_buffers))
2965 __remove_assoc_queue(bh);
2966 bh = next;
2967 } while (bh != head);
2968 *buffers_to_free = head;
2969 __clear_page_buffers(page);
2970 return 1;
2971failed:
2972 return 0;
2973}
2974
2975int try_to_free_buffers(struct page *page)
2976{
2977 struct address_space * const mapping = page->mapping;
2978 struct buffer_head *buffers_to_free = NULL;
2979 int ret = 0;
2980
2981 BUG_ON(!PageLocked(page));
2982 if (PageWriteback(page))
2983 return 0;
2984
2985 if (mapping == NULL) { /* can this still happen? */
2986 ret = drop_buffers(page, &buffers_to_free);
2987 goto out;
2988 }
2989
2990 spin_lock(&mapping->private_lock);
2991 ret = drop_buffers(page, &buffers_to_free);
2992 if (ret) {
2993 /*
2994 * If the filesystem writes its buffers by hand (eg ext3)
2995 * then we can have clean buffers against a dirty page. We
2996 * clean the page here; otherwise later reattachment of buffers
2997 * could encounter a non-uptodate page, which is unresolvable.
2998 * This only applies in the rare case where try_to_free_buffers
2999 * succeeds but the page is not freed.
3000 */
3001 clear_page_dirty(page);
3002 }
3003 spin_unlock(&mapping->private_lock);
3004out:
3005 if (buffers_to_free) {
3006 struct buffer_head *bh = buffers_to_free;
3007
3008 do {
3009 struct buffer_head *next = bh->b_this_page;
3010 free_buffer_head(bh);
3011 bh = next;
3012 } while (bh != buffers_to_free);
3013 }
3014 return ret;
3015}
3016EXPORT_SYMBOL(try_to_free_buffers);
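
/*
 * Illustrative sketch: filesystems that need no extra work expose this
 * directly, or via a thin wrapper, as their ->releasepage operation
 * (hypothetical myfs name):
 *
 *	static int myfs_releasepage(struct page *page, gfp_t gfp)
 *	{
 *		return try_to_free_buffers(page);
 *	}
 */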
3017
3018int block_sync_page(struct page *page)
3019{
3020 struct address_space *mapping;
3021
3022 smp_mb();
3023 mapping = page_mapping(page);
3024 if (mapping)
3025 blk_run_backing_dev(mapping->backing_dev_info, page);
3026 return 0;
3027}
3028
3029/*
3030 * There are no bdflush tunables left. But distributions are
3031 * still running obsolete flush daemons, so we terminate them here.
3032 *
3033 * Use of bdflush() is deprecated and will be removed in a future kernel.
3034 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3035 */
3036asmlinkage long sys_bdflush(int func, long data)
3037{
3038 static int msg_count;
3039
3040 if (!capable(CAP_SYS_ADMIN))
3041 return -EPERM;
3042
3043 if (msg_count < 5) {
3044 msg_count++;
3045 printk(KERN_INFO
3046 "warning: process `%s' used the obsolete bdflush"
3047 " system call\n", current->comm);
3048 printk(KERN_INFO "Fix your initscripts?\n");
3049 }
3050
3051 if (func == 1)
3052 do_exit(0);
3053 return 0;
3054}
3055
3056/*
3057 * Buffer-head allocation
3058 */
3059static kmem_cache_t *bh_cachep;
3060
3061/*
3062 * Once the number of bh's in the machine exceeds this level, we start
3063 * stripping them in writeback.
3064 */
3065static int max_buffer_heads;
3066
3067int buffer_heads_over_limit;
3068
3069struct bh_accounting {
3070 int nr; /* Number of live bh's */
3071 int ratelimit; /* Limit cacheline bouncing */
3072};
3073
3074static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3075
3076static void recalc_bh_state(void)
3077{
3078 int i;
3079 int tot = 0;
3080
3081 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3082 return;
3083 __get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet8a143422006-03-24 03:18:10 -08003084 for_each_online_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003085 tot += per_cpu(bh_accounting, i).nr;
3086 buffer_heads_over_limit = (tot > max_buffer_heads);
3087}
3088
Al Virodd0fc662005-10-07 07:46:04 +01003089struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003090{
3091 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3092 if (ret) {
Coywolf Qi Hunt736c7b82005-09-06 15:18:17 -07003093 get_cpu_var(bh_accounting).nr++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003094 recalc_bh_state();
Coywolf Qi Hunt736c7b82005-09-06 15:18:17 -07003095 put_cpu_var(bh_accounting);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003096 }
3097 return ret;
3098}
3099EXPORT_SYMBOL(alloc_buffer_head);
3100
3101void free_buffer_head(struct buffer_head *bh)
3102{
3103 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3104 kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt736c7b82005-09-06 15:18:17 -07003105 get_cpu_var(bh_accounting).nr--;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003106 recalc_bh_state();
Coywolf Qi Hunt736c7b82005-09-06 15:18:17 -07003107 put_cpu_var(bh_accounting);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003108}
3109EXPORT_SYMBOL(free_buffer_head);
3110
3111static void
3112init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3113{
3114 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3115 SLAB_CTOR_CONSTRUCTOR) {
3116 struct buffer_head * bh = (struct buffer_head *)data;
3117
3118 memset(bh, 0, sizeof(*bh));
3119 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3120 }
3121}
3122
3123#ifdef CONFIG_HOTPLUG_CPU
3124static void buffer_exit_cpu(int cpu)
3125{
3126 int i;
3127 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3128
3129 for (i = 0; i < BH_LRU_SIZE; i++) {
3130 brelse(b->bhs[i]);
3131 b->bhs[i] = NULL;
3132 }
Eric Dumazet8a143422006-03-24 03:18:10 -08003133 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3134 per_cpu(bh_accounting, cpu).nr = 0;
3135 put_cpu_var(bh_accounting);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003136}
3137
3138static int buffer_cpu_notify(struct notifier_block *self,
3139 unsigned long action, void *hcpu)
3140{
3141 if (action == CPU_DEAD)
3142 buffer_exit_cpu((unsigned long)hcpu);
3143 return NOTIFY_OK;
3144}
3145#endif /* CONFIG_HOTPLUG_CPU */
3146
3147void __init buffer_init(void)
3148{
3149 int nrpages;
3150
3151 bh_cachep = kmem_cache_create("buffer_head",
Paul Jacksonb0196002006-03-24 03:16:09 -08003152 sizeof(struct buffer_head), 0,
3153 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3154 SLAB_MEM_SPREAD),
3155 init_buffer_head,
3156 NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003157
3158 /*
3159 * Limit the bh occupancy to 10% of ZONE_NORMAL
3160 */
3161 nrpages = (nr_free_buffer_pages() * 10) / 100;
3162 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3163 hotcpu_notifier(buffer_cpu_notify, 0);
3164}
3165
3166EXPORT_SYMBOL(__bforget);
3167EXPORT_SYMBOL(__brelse);
3168EXPORT_SYMBOL(__wait_on_buffer);
3169EXPORT_SYMBOL(block_commit_write);
3170EXPORT_SYMBOL(block_prepare_write);
3171EXPORT_SYMBOL(block_read_full_page);
3172EXPORT_SYMBOL(block_sync_page);
3173EXPORT_SYMBOL(block_truncate_page);
3174EXPORT_SYMBOL(block_write_full_page);
3175EXPORT_SYMBOL(cont_prepare_write);
3176EXPORT_SYMBOL(end_buffer_async_write);
3177EXPORT_SYMBOL(end_buffer_read_sync);
3178EXPORT_SYMBOL(end_buffer_write_sync);
3179EXPORT_SYMBOL(file_fsync);
3180EXPORT_SYMBOL(fsync_bdev);
3181EXPORT_SYMBOL(generic_block_bmap);
3182EXPORT_SYMBOL(generic_commit_write);
3183EXPORT_SYMBOL(generic_cont_expand);
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08003184EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003185EXPORT_SYMBOL(init_buffer);
3186EXPORT_SYMBOL(invalidate_bdev);
3187EXPORT_SYMBOL(ll_rw_block);
3188EXPORT_SYMBOL(mark_buffer_dirty);
3189EXPORT_SYMBOL(submit_bh);
3190EXPORT_SYMBOL(sync_dirty_buffer);
3191EXPORT_SYMBOL(unlock_buffer);