blob: 1a2bdbfa3ea9d1f14a39594d3ca886d1f7098d57 [file] [log] [blame]
Matthew Wilcoxd475c632015-02-16 15:58:56 -08001/*
2 * fs/dax.c - Direct Access filesystem code
3 * Copyright (c) 2013-2014 Intel Corporation
4 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/atomic.h>
18#include <linux/blkdev.h>
19#include <linux/buffer_head.h>
20#include <linux/fs.h>
21#include <linux/genhd.h>
22#include <linux/mutex.h>
23#include <linux/uio.h>
24
25static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
26{
27 unsigned long pfn;
28 sector_t sector = bh->b_blocknr << (blkbits - 9);
29 return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
30}
31
/*
 * Zero the parts of a fresh buffer that the pending transfer will not
 * cover.
 *
 * @addr:  start of the buffer
 * @size:  length of the buffer in bytes
 * @first: offset inside the buffer at which the transfer begins
 * @pos:   file offset at which the transfer begins
 * @end:   file offset at which the transfer stops
 *
 * Bytes [first, first + (end - pos)) are left alone.  Everything in front
 * of that window, and everything behind it up to @size, is cleared.  If
 * the window reaches or passes @size there is no tail to clear.
 */
static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
			loff_t end)
{
	/* Offset inside the buffer just past the region being transferred */
	loff_t tail = end - pos + first;

	if (first)
		memset(addr, 0, first);

	if (tail < size) {
		void *tail_start = addr + tail;

		memset(tail_start, 0, size - tail);
	}
}
42
43static bool buffer_written(struct buffer_head *bh)
44{
45 return buffer_mapped(bh) && !buffer_unwritten(bh);
46}
47
48/*
49 * When ext4 encounters a hole, it returns without modifying the buffer_head
50 * which means that we can't trust b_size. To cope with this, we set b_state
51 * to 0 before calling get_block and, if any bit is set, we know we can trust
52 * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
53 * and would save us time calling get_block repeatedly.
54 */
55static bool buffer_size_valid(struct buffer_head *bh)
56{
57 return bh->b_state != 0;
58}
59
60static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
61 loff_t start, loff_t end, get_block_t get_block,
62 struct buffer_head *bh)
63{
64 ssize_t retval = 0;
65 loff_t pos = start;
66 loff_t max = start;
67 loff_t bh_max = start;
68 void *addr;
69 bool hole = false;
70
71 if (rw != WRITE)
72 end = min(end, i_size_read(inode));
73
74 while (pos < end) {
75 unsigned len;
76 if (pos == max) {
77 unsigned blkbits = inode->i_blkbits;
78 sector_t block = pos >> blkbits;
79 unsigned first = pos - (block << blkbits);
80 long size;
81
82 if (pos == bh_max) {
83 bh->b_size = PAGE_ALIGN(end - pos);
84 bh->b_state = 0;
85 retval = get_block(inode, block, bh,
86 rw == WRITE);
87 if (retval)
88 break;
89 if (!buffer_size_valid(bh))
90 bh->b_size = 1 << blkbits;
91 bh_max = pos - first + bh->b_size;
92 } else {
93 unsigned done = bh->b_size -
94 (bh_max - (pos - first));
95 bh->b_blocknr += done >> blkbits;
96 bh->b_size -= done;
97 }
98
99 hole = (rw != WRITE) && !buffer_written(bh);
100 if (hole) {
101 addr = NULL;
102 size = bh->b_size - first;
103 } else {
104 retval = dax_get_addr(bh, &addr, blkbits);
105 if (retval < 0)
106 break;
107 if (buffer_unwritten(bh) || buffer_new(bh))
108 dax_new_buf(addr, retval, first, pos,
109 end);
110 addr += first;
111 size = retval - first;
112 }
113 max = min(pos + size, end);
114 }
115
116 if (rw == WRITE)
117 len = copy_from_iter(addr, max - pos, iter);
118 else if (!hole)
119 len = copy_to_iter(addr, max - pos, iter);
120 else
121 len = iov_iter_zero(max - pos, iter);
122
123 if (!len)
124 break;
125
126 pos += len;
127 addr += len;
128 }
129
130 return (pos == start) ? retval : pos - start;
131}
132
133/**
134 * dax_do_io - Perform I/O to a DAX file
135 * @rw: READ to read or WRITE to write
136 * @iocb: The control block for this I/O
137 * @inode: The file which the I/O is directed at
138 * @iter: The addresses to do I/O from or to
139 * @pos: The file offset where the I/O starts
140 * @get_block: The filesystem method used to translate file offsets to blocks
141 * @end_io: A filesystem callback for I/O completion
142 * @flags: See below
143 *
144 * This function uses the same locking scheme as do_blockdev_direct_IO:
145 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
146 * caller for writes. For reads, we take and release the i_mutex ourselves.
147 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
148 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
149 * is in progress.
150 */
151ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
152 struct iov_iter *iter, loff_t pos,
153 get_block_t get_block, dio_iodone_t end_io, int flags)
154{
155 struct buffer_head bh;
156 ssize_t retval = -EINVAL;
157 loff_t end = pos + iov_iter_count(iter);
158
159 memset(&bh, 0, sizeof(bh));
160
161 if ((flags & DIO_LOCKING) && (rw == READ)) {
162 struct address_space *mapping = inode->i_mapping;
163 mutex_lock(&inode->i_mutex);
164 retval = filemap_write_and_wait_range(mapping, pos, end - 1);
165 if (retval) {
166 mutex_unlock(&inode->i_mutex);
167 goto out;
168 }
169 }
170
171 /* Protects against truncate */
172 atomic_inc(&inode->i_dio_count);
173
174 retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
175
176 if ((flags & DIO_LOCKING) && (rw == READ))
177 mutex_unlock(&inode->i_mutex);
178
179 if ((retval > 0) && end_io)
180 end_io(iocb, pos, retval, bh.b_private);
181
182 inode_dio_done(inode);
183 out:
184 return retval;
185}
186EXPORT_SYMBOL_GPL(dax_do_io);