blob: d953df3a201c93e35e5e196a42b381ded75532eb [file] [log] [blame]
/*
 * Copyright (C) 2016 Oracle.  All Rights Reserved.
 *
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 */
20#include "xfs.h"
21#include "xfs_fs.h"
22#include "xfs_shared.h"
23#include "xfs_format.h"
24#include "xfs_log_format.h"
25#include "xfs_trans_resv.h"
26#include "xfs_mount.h"
27#include "xfs_defer.h"
28#include "xfs_da_format.h"
29#include "xfs_da_btree.h"
30#include "xfs_inode.h"
31#include "xfs_trans.h"
32#include "xfs_inode_item.h"
33#include "xfs_bmap.h"
34#include "xfs_bmap_util.h"
35#include "xfs_error.h"
36#include "xfs_dir2.h"
37#include "xfs_dir2_priv.h"
38#include "xfs_ioctl.h"
39#include "xfs_trace.h"
40#include "xfs_log.h"
41#include "xfs_icache.h"
42#include "xfs_pnfs.h"
43#include "xfs_refcount_btree.h"
44#include "xfs_refcount.h"
45#include "xfs_bmap_btree.h"
46#include "xfs_trans_space.h"
47#include "xfs_bit.h"
48#include "xfs_alloc.h"
49#include "xfs_quota_defs.h"
50#include "xfs_quota.h"
51#include "xfs_btree.h"
52#include "xfs_bmap_btree.h"
53#include "xfs_reflink.h"
Darrick J. Wong2a067052016-10-03 09:11:33 -070054#include "xfs_iomap.h"
Darrick J. Wong3993bae2016-10-03 09:11:32 -070055
/*
 * Copy on Write of Shared Blocks
 *
 * XFS must preserve "the usual" file semantics even when two files share
 * the same physical blocks.  This means that a write to one file must not
 * alter the blocks in a different file; the way that we'll do that is
 * through the use of a copy-on-write mechanism.  At a high level, that
 * means that when we want to write to a shared block, we allocate a new
 * block, write the data to the new block, and if that succeeds we map the
 * new block into the file.
 *
 * XFS provides a "delayed allocation" mechanism that defers the allocation
 * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
 * possible.  This reduces fragmentation by enabling the filesystem to ask
 * for bigger chunks less often, which is exactly what we want for CoW.
 *
 * The delalloc mechanism begins when the kernel wants to make a block
 * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
 * create a delalloc mapping, which is a regular in-core extent, but without
 * a real startblock.  (For delalloc mappings, the startblock encodes both
 * a flag that this is a delalloc mapping, and a worst-case estimate of how
 * many blocks might be required to put the mapping into the BMBT.)  delalloc
 * mappings are a reservation against the free space in the filesystem;
 * adjacent mappings can also be combined into fewer larger mappings.
 *
 * When dirty pages are being written out (typically in writepage), the
 * delalloc reservations are converted into real mappings by allocating
 * blocks and replacing the delalloc mapping with real ones.  A delalloc
 * mapping can be replaced by several real ones if the free space is
 * fragmented.
 *
 * We want to adapt the delalloc mechanism for copy-on-write, since the
 * write paths are similar.  The first two steps (creating the reservation
 * and allocating the blocks) are exactly the same as delalloc except that
 * the mappings must be stored in a separate CoW fork because we do not want
 * to disturb the mapping in the data fork until we're sure that the write
 * succeeded.  IO completion in this case is the process of removing the old
 * mapping from the data fork and moving the new mapping from the CoW fork to
 * the data fork.  This will be discussed shortly.
 *
 * For now, unaligned directio writes will be bounced back to the page cache.
 * Block-aligned directio writes will use the same mechanism as buffered
 * writes.
 *
 * CoW remapping must be done after the data block write completes,
 * because we don't want to destroy the old data fork map until we're sure
 * the new block has been written.  Since the new mappings are kept in a
 * separate fork, we can simply iterate these mappings to find the ones
 * that cover the file blocks that we just CoW'd.  For each extent, simply
 * unmap the corresponding range in the data fork, map the new range into
 * the data fork, and remove the extent from the CoW fork.
 *
 * Since the remapping operation can be applied to an arbitrary file
 * range, we record the need for the remap step as a flag in the ioend
 * instead of declaring a new IO type.  This is required for direct io
 * because we only have ioend for the whole dio, and we have to be able to
 * remember the presence of unwritten blocks and CoW blocks with a single
 * ioend structure.  Better yet, the more ground we can cover with one
 * ioend, the better.
 */
Darrick J. Wong2a067052016-10-03 09:11:33 -0700116
117/*
118 * Given an AG extent, find the lowest-numbered run of shared blocks
119 * within that range and return the range in fbno/flen. If
120 * find_end_of_shared is true, return the longest contiguous extent of
121 * shared blocks. If there are no shared extents, fbno and flen will
122 * be set to NULLAGBLOCK and 0, respectively.
123 */
124int
125xfs_reflink_find_shared(
126 struct xfs_mount *mp,
127 xfs_agnumber_t agno,
128 xfs_agblock_t agbno,
129 xfs_extlen_t aglen,
130 xfs_agblock_t *fbno,
131 xfs_extlen_t *flen,
132 bool find_end_of_shared)
133{
134 struct xfs_buf *agbp;
135 struct xfs_btree_cur *cur;
136 int error;
137
138 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
139 if (error)
140 return error;
141
142 cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
143
144 error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
145 find_end_of_shared);
146
147 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
148
149 xfs_buf_relse(agbp);
150 return error;
151}
152
153/*
154 * Trim the mapping to the next block where there's a change in the
155 * shared/unshared status. More specifically, this means that we
156 * find the lowest-numbered extent of shared blocks that coincides with
157 * the given block mapping. If the shared extent overlaps the start of
158 * the mapping, trim the mapping to the end of the shared extent. If
159 * the shared region intersects the mapping, trim the mapping to the
160 * start of the shared extent. If there are no shared regions that
161 * overlap, just return the original extent.
162 */
163int
164xfs_reflink_trim_around_shared(
165 struct xfs_inode *ip,
166 struct xfs_bmbt_irec *irec,
167 bool *shared,
168 bool *trimmed)
169{
170 xfs_agnumber_t agno;
171 xfs_agblock_t agbno;
172 xfs_extlen_t aglen;
173 xfs_agblock_t fbno;
174 xfs_extlen_t flen;
175 int error = 0;
176
177 /* Holes, unwritten, and delalloc extents cannot be shared */
178 if (!xfs_is_reflink_inode(ip) ||
179 ISUNWRITTEN(irec) ||
180 irec->br_startblock == HOLESTARTBLOCK ||
181 irec->br_startblock == DELAYSTARTBLOCK) {
182 *shared = false;
183 return 0;
184 }
185
186 trace_xfs_reflink_trim_around_shared(ip, irec);
187
188 agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
189 agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
190 aglen = irec->br_blockcount;
191
192 error = xfs_reflink_find_shared(ip->i_mount, agno, agbno,
193 aglen, &fbno, &flen, true);
194 if (error)
195 return error;
196
197 *shared = *trimmed = false;
198 if (fbno == NULLAGBLOCK) {
199 /* No shared blocks at all. */
200 return 0;
201 } else if (fbno == agbno) {
202 /*
203 * The start of this extent is shared. Truncate the
204 * mapping at the end of the shared region so that a
205 * subsequent iteration starts at the start of the
206 * unshared region.
207 */
208 irec->br_blockcount = flen;
209 *shared = true;
210 if (flen != aglen)
211 *trimmed = true;
212 return 0;
213 } else {
214 /*
215 * There's a shared extent midway through this extent.
216 * Truncate the mapping at the start of the shared
217 * extent so that a subsequent iteration starts at the
218 * start of the shared region.
219 */
220 irec->br_blockcount = fbno - agbno;
221 *trimmed = true;
222 return 0;
223 }
224}
225
/*
 * Create a CoW reservation for a range of blocks within a file.
 *
 * On return with no error, *offset_fsb has been advanced to the end of
 * the extent that was examined or reserved (never past @end_fsb's extent
 * bound), so the caller can walk a larger range one extent at a time.
 * Caller must hold the ILOCK in exclusive mode — see
 * xfs_reflink_reserve_cow_range().
 */
static int
__xfs_reflink_reserve_cow(
	struct xfs_inode	*ip,
	xfs_fileoff_t		*offset_fsb,	/* in/out: start of range */
	xfs_fileoff_t		end_fsb)	/* end of range to reserve */
{
	struct xfs_bmbt_irec	got, prev, imap;
	xfs_fileoff_t		orig_end_fsb;
	int			nimaps, eof = 0, error = 0;
	bool			shared = false, trimmed = false;
	xfs_extnum_t		idx;

	/* Already reserved? Skip the refcount btree access. */
	xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx,
			&got, &prev);
	if (!eof && got.br_startoff <= *offset_fsb) {
		/* An existing CoW fork extent covers this offset. */
		end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount;
		trace_xfs_reflink_cow_found(ip, &got);
		goto done;
	}

	/* Read extent from the source file. */
	nimaps = 1;
	error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
			&imap, &nimaps, 0);
	if (error)
		goto out_unlock;
	ASSERT(nimaps == 1);

	/* Trim the mapping to the nearest shared extent boundary. */
	error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed);
	if (error)
		goto out_unlock;

	end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount;

	/* Not shared? Just report the (potentially capped) extent. */
	if (!shared)
		goto done;

	/*
	 * Fork all the shared blocks from our write offset until the end of
	 * the extent.  Attach quota dquots first so the delalloc reservation
	 * is charged to the right quotas.
	 */
	error = xfs_qm_dqattach_locked(ip, 0);
	if (error)
		goto out_unlock;

retry:
	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb,
			end_fsb - *offset_fsb, &got,
			&prev, &idx, eof);
	switch (error) {
	case 0:
		break;
	case -ENOSPC:
	case -EDQUOT:
		/* retry without any preallocation */
		trace_xfs_reflink_cow_enospc(ip, &imap);
		if (end_fsb != orig_end_fsb) {
			/*
			 * end_fsb was extended beyond the trimmed mapping;
			 * shrink back to the original extent and try once
			 * more without speculative preallocation.
			 */
			end_fsb = orig_end_fsb;
			goto retry;
		}
		/*FALLTHRU*/
	default:
		goto out_unlock;
	}

	trace_xfs_reflink_cow_alloc(ip, &got);
done:
	/* Tell the caller how far we got. */
	*offset_fsb = end_fsb;
out_unlock:
	return error;
}
301
302/* Create a CoW reservation for part of a file. */
303int
304xfs_reflink_reserve_cow_range(
305 struct xfs_inode *ip,
306 xfs_off_t offset,
307 xfs_off_t count)
308{
309 struct xfs_mount *mp = ip->i_mount;
310 xfs_fileoff_t offset_fsb, end_fsb;
311 int error;
312
313 trace_xfs_reflink_reserve_cow_range(ip, offset, count);
314
315 offset_fsb = XFS_B_TO_FSBT(mp, offset);
316 end_fsb = XFS_B_TO_FSB(mp, offset + count);
317
318 xfs_ilock(ip, XFS_ILOCK_EXCL);
319 while (offset_fsb < end_fsb) {
320 error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb);
321 if (error) {
322 trace_xfs_reflink_reserve_cow_range_error(ip, error,
323 _RET_IP_);
324 break;
325 }
326 }
327 xfs_iunlock(ip, XFS_ILOCK_EXCL);
328
329 return error;
330}
Darrick J. Wongef473662016-10-03 09:11:34 -0700331
332/*
333 * Find the CoW reservation (and whether or not it needs block allocation)
334 * for a given byte offset of a file.
335 */
336bool
337xfs_reflink_find_cow_mapping(
338 struct xfs_inode *ip,
339 xfs_off_t offset,
340 struct xfs_bmbt_irec *imap,
341 bool *need_alloc)
342{
343 struct xfs_bmbt_irec irec;
344 struct xfs_ifork *ifp;
345 struct xfs_bmbt_rec_host *gotp;
346 xfs_fileoff_t bno;
347 xfs_extnum_t idx;
348
349 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
350 ASSERT(xfs_is_reflink_inode(ip));
351
352 /* Find the extent in the CoW fork. */
353 ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
354 bno = XFS_B_TO_FSBT(ip->i_mount, offset);
355 gotp = xfs_iext_bno_to_ext(ifp, bno, &idx);
356 if (!gotp)
357 return false;
358
359 xfs_bmbt_get_all(gotp, &irec);
360 if (bno >= irec.br_startoff + irec.br_blockcount ||
361 bno < irec.br_startoff)
362 return false;
363
364 trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
365 &irec);
366
367 /* If it's still delalloc, we must allocate later. */
368 *imap = irec;
369 *need_alloc = !!(isnullstartblock(irec.br_startblock));
370
371 return true;
372}
373
/*
 * Trim an extent to end at the next CoW reservation past offset_fsb.
 *
 * If an extent in the CoW fork starts inside @imap (at or after
 * @offset_fsb), shorten @imap so that it ends where that CoW reservation
 * begins; the caller can then handle the CoW range separately.  Always
 * returns 0.
 */
int
xfs_reflink_trim_irec_to_next_cow(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_bmbt_irec	irec;
	struct xfs_ifork	*ifp;
	struct xfs_bmbt_rec_host	*gotp;
	xfs_extnum_t		idx;

	/* Inodes without reflink have no CoW fork; nothing to trim. */
	if (!xfs_is_reflink_inode(ip))
		return 0;

	/* Find the extent in the CoW fork. */
	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx);
	if (!gotp)
		return 0;
	xfs_bmbt_get_all(gotp, &irec);

	/* This is the extent before; try sliding up one. */
	if (irec.br_startoff < offset_fsb) {
		idx++;
		/* if_bytes / record size == number of in-core extents. */
		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
			return 0;
		gotp = xfs_iext_get_ext(ifp, idx);
		xfs_bmbt_get_all(gotp, &irec);
	}

	/* The next CoW extent starts beyond imap; nothing to do. */
	if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount)
		return 0;

	/* End imap exactly where the CoW reservation begins. */
	imap->br_blockcount = irec.br_startoff - imap->br_startoff;
	trace_xfs_reflink_trim_irec(ip, imap);

	return 0;
}