Blame - fs/xfs/xfs_reflink.c - kernel/msm-4.9

blob: 5d796b7f23d7bdd027365dcca3644b72af62fba1 [file] [log] [blame]

Darrick J. Wong	3993bae	2016-10-03 09:11:32 -0700	[diff] [blame]	1	/*
				2	* Copyright (C) 2016 Oracle. All Rights Reserved.
				3	*
				4	* Author: Darrick J. Wong <darrick.wong@oracle.com>
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version 2
				9	* of the License, or (at your option) any later version.
				10	*
				11	* This program is distributed in the hope that it would be useful,
				12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				14	* GNU General Public License for more details.
				15	*
				16	* You should have received a copy of the GNU General Public License
				17	* along with this program; if not, write the Free Software Foundation,
				18	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
				19	*/
				20	#include "xfs.h"
				21	#include "xfs_fs.h"
				22	#include "xfs_shared.h"
				23	#include "xfs_format.h"
				24	#include "xfs_log_format.h"
				25	#include "xfs_trans_resv.h"
				26	#include "xfs_mount.h"
				27	#include "xfs_defer.h"
				28	#include "xfs_da_format.h"
				29	#include "xfs_da_btree.h"
				30	#include "xfs_inode.h"
				31	#include "xfs_trans.h"
				32	#include "xfs_inode_item.h"
				33	#include "xfs_bmap.h"
				34	#include "xfs_bmap_util.h"
				35	#include "xfs_error.h"
				36	#include "xfs_dir2.h"
				37	#include "xfs_dir2_priv.h"
				38	#include "xfs_ioctl.h"
				39	#include "xfs_trace.h"
				40	#include "xfs_log.h"
				41	#include "xfs_icache.h"
				42	#include "xfs_pnfs.h"
				43	#include "xfs_refcount_btree.h"
				44	#include "xfs_refcount.h"
				45	#include "xfs_bmap_btree.h"
				46	#include "xfs_trans_space.h"
				47	#include "xfs_bit.h"
				48	#include "xfs_alloc.h"
				49	#include "xfs_quota_defs.h"
				50	#include "xfs_quota.h"
				51	#include "xfs_btree.h"
				52	#include "xfs_bmap_btree.h"
				53	#include "xfs_reflink.h"
Darrick J. Wong	2a06705	2016-10-03 09:11:33 -0700	[diff] [blame^]	54	#include "xfs_iomap.h"
Darrick J. Wong	3993bae	2016-10-03 09:11:32 -0700	[diff] [blame]	55
				56	/*
				57	* Copy on Write of Shared Blocks
				58	*
				59	* XFS must preserve "the usual" file semantics even when two files share
				60	* the same physical blocks. This means that a write to one file must not
				61	* alter the blocks in a different file; the way that we'll do that is
				62	* through the use of a copy-on-write mechanism. At a high level, that
				63	* means that when we want to write to a shared block, we allocate a new
				64	* block, write the data to the new block, and if that succeeds we map the
				65	* new block into the file.
				66	*
				67	* XFS provides a "delayed allocation" mechanism that defers the allocation
				68	* of disk blocks to dirty-but-not-yet-mapped file blocks as long as
				69	* possible. This reduces fragmentation by enabling the filesystem to ask
				70	* for bigger chunks less often, which is exactly what we want for CoW.
				71	*
				72	* The delalloc mechanism begins when the kernel wants to make a block
				73	* writable (write_begin or page_mkwrite). If the offset is not mapped, we
				74	* create a delalloc mapping, which is a regular in-core extent, but without
				75	* a real startblock. (For delalloc mappings, the startblock encodes both
				76	* a flag that this is a delalloc mapping, and a worst-case estimate of how
				77	* many blocks might be required to put the mapping into the BMBT.) Delalloc
				78	* mappings are a reservation against the free space in the filesystem;
				79	* adjacent mappings can also be combined into fewer larger mappings.
				80	*
				81	* When dirty pages are being written out (typically in writepage), the
				82	* delalloc reservations are converted into real mappings by allocating
				83	* blocks and replacing the delalloc mapping with real ones. A delalloc
				84	* mapping can be replaced by several real ones if the free space is
				85	* fragmented.
				86	*
				87	* We want to adapt the delalloc mechanism for copy-on-write, since the
				88	* write paths are similar. The first two steps (creating the reservation
				89	* and allocating the blocks) are exactly the same as delalloc except that
				90	* the mappings must be stored in a separate CoW fork because we do not want
				91	* to disturb the mapping in the data fork until we're sure that the write
				92	* succeeded. IO completion in this case is the process of removing the old
				93	* mapping from the data fork and moving the new mapping from the CoW fork to
				94	* the data fork. This will be discussed shortly.
				95	*
				96	* For now, unaligned directio writes will be bounced back to the page cache.
				97	* Block-aligned directio writes will use the same mechanism as buffered
				98	* writes.
				99	*
				100	* CoW remapping must be done after the data block write completes,
				101	* because we don't want to destroy the old data fork map until we're sure
				102	* the new block has been written. Since the new mappings are kept in a
				103	* separate fork, we can simply iterate these mappings to find the ones
				104	* that cover the file blocks that we just CoW'd. For each extent, simply
				105	* unmap the corresponding range in the data fork, map the new range into
				106	* the data fork, and remove the extent from the CoW fork.
				107	*
				108	* Since the remapping operation can be applied to an arbitrary file
				109	* range, we record the need for the remap step as a flag in the ioend
				110	* instead of declaring a new IO type. This is required for direct io
				111	* because we only have ioend for the whole dio, and we have to be able to
				112	* remember the presence of unwritten blocks and CoW blocks with a single
				113	* ioend structure. Better yet, the more ground we can cover with one
				114	* ioend, the better.
				115	*/
Darrick J. Wong	2a06705	2016-10-03 09:11:33 -0700	[diff] [blame^]	116
				117	/*
				118	* Given an AG extent, find the lowest-numbered run of shared blocks
				119	* within that range and return the range in fbno/flen. If
				120	* find_end_of_shared is true, return the longest contiguous extent of
				121	* shared blocks. If there are no shared extents, fbno and flen will
				122	* be set to NULLAGBLOCK and 0, respectively.
				123	*/
				124	int
				125	xfs_reflink_find_shared(
				126	struct xfs_mount *mp,
				127	xfs_agnumber_t agno,
				128	xfs_agblock_t agbno,
				129	xfs_extlen_t aglen,
				130	xfs_agblock_t *fbno,
				131	xfs_extlen_t *flen,
				132	bool find_end_of_shared)
				133	{
				134	struct xfs_buf *agbp;
				135	struct xfs_btree_cur *cur;
				136	int error;
				137
				138	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
				139	if (error)
				140	return error;
				141
				142	cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
				143
				144	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
				145	find_end_of_shared);
				146
				147	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
				148
				149	xfs_buf_relse(agbp);
				150	return error;
				151	}
				152
				153	/*
				154	* Trim the mapping to the next block where there's a change in the
				155	* shared/unshared status. More specifically, this means that we
				156	* find the lowest-numbered extent of shared blocks that coincides with
				157	* the given block mapping. If the shared extent overlaps the start of
				158	* the mapping, trim the mapping to the end of the shared extent. If
				159	* the shared region intersects the mapping, trim the mapping to the
				160	* start of the shared extent. If there are no shared regions that
				161	* overlap, just return the original extent.
				162	*/
				163	int
				164	xfs_reflink_trim_around_shared(
				165	struct xfs_inode *ip,
				166	struct xfs_bmbt_irec *irec,
				167	bool *shared,
				168	bool *trimmed)
				169	{
				170	xfs_agnumber_t agno;
				171	xfs_agblock_t agbno;
				172	xfs_extlen_t aglen;
				173	xfs_agblock_t fbno;
				174	xfs_extlen_t flen;
				175	int error = 0;
				176
				177	/* Holes, unwritten, and delalloc extents cannot be shared */
				178	if (!xfs_is_reflink_inode(ip) \|\|
				179	ISUNWRITTEN(irec) \|\|
				180	irec->br_startblock == HOLESTARTBLOCK \|\|
				181	irec->br_startblock == DELAYSTARTBLOCK) {
				182	*shared = false;
				183	return 0;
				184	}
				185
				186	trace_xfs_reflink_trim_around_shared(ip, irec);
				187
				188	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
				189	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
				190	aglen = irec->br_blockcount;
				191
				192	error = xfs_reflink_find_shared(ip->i_mount, agno, agbno,
				193	aglen, &fbno, &flen, true);
				194	if (error)
				195	return error;
				196
				197	shared = trimmed = false;
				198	if (fbno == NULLAGBLOCK) {
				199	/* No shared blocks at all. */
				200	return 0;
				201	} else if (fbno == agbno) {
				202	/*
				203	* The start of this extent is shared. Truncate the
				204	* mapping at the end of the shared region so that a
				205	* subsequent iteration starts at the start of the
				206	* unshared region.
				207	*/
				208	irec->br_blockcount = flen;
				209	*shared = true;
				210	if (flen != aglen)
				211	*trimmed = true;
				212	return 0;
				213	} else {
				214	/*
				215	* There's a shared extent midway through this extent.
				216	* Truncate the mapping at the start of the shared
				217	* extent so that a subsequent iteration starts at the
				218	* start of the shared region.
				219	*/
				220	irec->br_blockcount = fbno - agbno;
				221	*trimmed = true;
				222	return 0;
				223	}
				224	}
				225
				226	/* Create a CoW reservation for a range of blocks within a file. */
				227	static int
				228	__xfs_reflink_reserve_cow(
				229	struct xfs_inode *ip,
				230	xfs_fileoff_t *offset_fsb,
				231	xfs_fileoff_t end_fsb)
				232	{
				233	struct xfs_bmbt_irec got, prev, imap;
				234	xfs_fileoff_t orig_end_fsb;
				235	int nimaps, eof = 0, error = 0;
				236	bool shared = false, trimmed = false;
				237	xfs_extnum_t idx;
				238
				239	/* Already reserved? Skip the refcount btree access. */
				240	xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx,
				241	&got, &prev);
				242	if (!eof && got.br_startoff <= *offset_fsb) {
				243	end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount;
				244	trace_xfs_reflink_cow_found(ip, &got);
				245	goto done;
				246	}
				247
				248	/* Read extent from the source file. */
				249	nimaps = 1;
				250	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
				251	&imap, &nimaps, 0);
				252	if (error)
				253	goto out_unlock;
				254	ASSERT(nimaps == 1);
				255
				256	/* Trim the mapping to the nearest shared extent boundary. */
				257	error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed);
				258	if (error)
				259	goto out_unlock;
				260
				261	end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount;
				262
				263	/* Not shared? Just report the (potentially capped) extent. */
				264	if (!shared)
				265	goto done;
				266
				267	/*
				268	* Fork all the shared blocks from our write offset until the end of
				269	* the extent.
				270	*/
				271	error = xfs_qm_dqattach_locked(ip, 0);
				272	if (error)
				273	goto out_unlock;
				274
				275	retry:
				276	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb,
				277	end_fsb - *offset_fsb, &got,
				278	&prev, &idx, eof);
				279	switch (error) {
				280	case 0:
				281	break;
				282	case -ENOSPC:
				283	case -EDQUOT:
				284	/* retry without any preallocation */
				285	trace_xfs_reflink_cow_enospc(ip, &imap);
				286	if (end_fsb != orig_end_fsb) {
				287	end_fsb = orig_end_fsb;
				288	goto retry;
				289	}
				290	/FALLTHRU/
				291	default:
				292	goto out_unlock;
				293	}
				294
				295	trace_xfs_reflink_cow_alloc(ip, &got);
				296	done:
				297	*offset_fsb = end_fsb;
				298	out_unlock:
				299	return error;
				300	}
				301
				302	/* Create a CoW reservation for part of a file. */
				303	int
				304	xfs_reflink_reserve_cow_range(
				305	struct xfs_inode *ip,
				306	xfs_off_t offset,
				307	xfs_off_t count)
				308	{
				309	struct xfs_mount *mp = ip->i_mount;
				310	xfs_fileoff_t offset_fsb, end_fsb;
				311	int error;
				312
				313	trace_xfs_reflink_reserve_cow_range(ip, offset, count);
				314
				315	offset_fsb = XFS_B_TO_FSBT(mp, offset);
				316	end_fsb = XFS_B_TO_FSB(mp, offset + count);
				317
				318	xfs_ilock(ip, XFS_ILOCK_EXCL);
				319	while (offset_fsb < end_fsb) {
				320	error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb);
				321	if (error) {
				322	trace_xfs_reflink_reserve_cow_range_error(ip, error,
				323	_RET_IP_);
				324	break;
				325	}
				326	}
				327	xfs_iunlock(ip, XFS_ILOCK_EXCL);
				328
				329	return error;
				330	}