NTFS: Implement fs/ntfs/inode.[hc]::ntfs_truncate().  It only supports
      uncompressed and unencrypted files.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 7ec0451..a168234 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -30,6 +30,7 @@
 #include "debug.h"
 #include "inode.h"
 #include "attrib.h"
+#include "lcnalloc.h"
 #include "malloc.h"
 #include "mft.h"
 #include "time.h"
@@ -2291,11 +2292,16 @@
 
 #ifdef NTFS_RW
 
+static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
+		"chkdsk.";
+
 /**
  * ntfs_truncate - called when the i_size of an ntfs inode is changed
  * @vi:		inode for which the i_size was changed
  *
- * We do not support i_size changes yet.
+ * We only support i_size changes for normal files at present, i.e. not
+ * compressed and not encrypted.  This is enforced in ntfs_setattr(), see
+ * below.
  *
  * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
  * that the change is allowed.
@@ -2306,80 +2312,499 @@
  * Returns 0 on success or -errno on error.
  *
  * Called with ->i_sem held.  In all but one case ->i_alloc_sem is held for
- * writing.  The only case where ->i_alloc_sem is not held is
+ * writing.  The only case in the kernel where ->i_alloc_sem is not held is
  * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called
- * with the current i_size as the offset which means that it is a noop as far
- * as ntfs_truncate() is concerned.
+ * with the current i_size as the offset.  The analogous place in NTFS is in
+ * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again
+ * without holding ->i_alloc_sem.
  */
 int ntfs_truncate(struct inode *vi)
 {
-	ntfs_inode *ni = NTFS_I(vi);
+	s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size;
+	VCN highest_vcn;
+	unsigned long flags;
+	ntfs_inode *base_ni, *ni = NTFS_I(vi);
 	ntfs_volume *vol = ni->vol;
 	ntfs_attr_search_ctx *ctx;
 	MFT_RECORD *m;
 	ATTR_RECORD *a;
 	const char *te = "  Leaving file length out of sync with i_size.";
-	int err;
+	int err, mp_size, size_change, alloc_change;
+	u32 attr_len;
 
 	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
 	BUG_ON(NInoAttr(ni));
+	BUG_ON(S_ISDIR(vi->i_mode));
+	BUG_ON(NInoMstProtected(ni));
 	BUG_ON(ni->nr_extents < 0);
-	m = map_mft_record(ni);
+retry_truncate:
+	/*
+	 * Lock the runlist for writing and map the mft record to ensure it is
+	 * safe to mess with the attribute runlist and sizes.
+	 */
+	down_write(&ni->runlist.lock);
+	if (!NInoAttr(ni))
+		base_ni = ni;
+	else
+		base_ni = ni->ext.base_ntfs_ino;
+	m = map_mft_record(base_ni);
 	if (IS_ERR(m)) {
 		err = PTR_ERR(m);
 		ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
 				"(error code %d).%s", vi->i_ino, err, te);
 		ctx = NULL;
 		m = NULL;
-		goto err_out;
+		goto old_bad_out;
 	}
-	ctx = ntfs_attr_get_search_ctx(ni, m);
+	ctx = ntfs_attr_get_search_ctx(base_ni, m);
 	if (unlikely(!ctx)) {
 		ntfs_error(vi->i_sb, "Failed to allocate a search context for "
 				"inode 0x%lx (not enough memory).%s",
 				vi->i_ino, te);
 		err = -ENOMEM;
-		goto err_out;
+		goto old_bad_out;
 	}
 	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
 			CASE_SENSITIVE, 0, NULL, 0, ctx);
 	if (unlikely(err)) {
-		if (err == -ENOENT)
+		if (err == -ENOENT) {
 			ntfs_error(vi->i_sb, "Open attribute is missing from "
 					"mft record.  Inode 0x%lx is corrupt.  "
-					"Run chkdsk.", vi->i_ino);
-		else
+					"Run chkdsk.%s", vi->i_ino, te);
+			err = -EIO;
+		} else
 			ntfs_error(vi->i_sb, "Failed to lookup attribute in "
-					"inode 0x%lx (error code %d).",
-					vi->i_ino, err);
+					"inode 0x%lx (error code %d).%s",
+					vi->i_ino, err, te);
+		goto old_bad_out;
+	}
+	m = ctx->mrec;
+	a = ctx->attr;
+	/*
+	 * The i_size of the vfs inode is the new size for the attribute value.
+	 */
+	new_size = i_size_read(vi);
+	/* The current size of the attribute value is the old size. */
+	old_size = ntfs_attr_size(a);
+	/* Calculate the new allocated size. */
+	if (NInoNonResident(ni))
+		new_alloc_size = (new_size + vol->cluster_size - 1) &
+				~(s64)vol->cluster_size_mask;
+	else
+		new_alloc_size = (new_size + 7) & ~7;
+	/* The current allocated size is the old allocated size. */
+	read_lock_irqsave(&ni->size_lock, flags);
+	old_alloc_size = ni->allocated_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * The change in the file size.  This will be 0 if no change, >0 if the
+	 * size is growing, and <0 if the size is shrinking.
+	 */
+	size_change = -1;
+	if (new_size - old_size >= 0) {
+		size_change = 1;
+		if (new_size == old_size)
+			size_change = 0;
+	}
+	/* As above for the allocated size. */
+	alloc_change = -1;
+	if (new_alloc_size - old_alloc_size >= 0) {
+		alloc_change = 1;
+		if (new_alloc_size == old_alloc_size)
+			alloc_change = 0;
+	}
+	/*
+	 * If neither the size nor the allocation are being changed there is
+	 * nothing to do.
+	 */
+	if (!size_change && !alloc_change)
+		goto unm_done;
+	/* If the size is changing, check if new size is allowed in $AttrDef. */
+	if (size_change) {
+		err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
+		if (unlikely(err)) {
+			if (err == -ERANGE) {
+				ntfs_error(vol->sb, "Truncate would cause the "
+						"inode 0x%lx to %simum size "
+						"for its attribute type "
+						"(0x%x).  Aborting truncate.",
+						vi->i_ino,
+						new_size > old_size ? "exceed "
+						"the max" : "go under the min",
+						le32_to_cpu(ni->type));
+				err = -EFBIG;
+			} else {
+				ntfs_error(vol->sb, "Inode 0x%lx has unknown "
+						"attribute type 0x%x.  "
+						"Aborting truncate.",
+						vi->i_ino,
+						le32_to_cpu(ni->type));
+				err = -EIO;
+			}
+			/* Reset the vfs inode size to the old size. */
+			i_size_write(vi, old_size);
+			goto err_out;
+		}
+	}
+	if (NInoCompressed(ni) || NInoEncrypted(ni)) {
+		ntfs_warning(vi->i_sb, "Changes in inode size are not "
+				"supported yet for %s files, ignoring.",
+				NInoCompressed(ni) ? "compressed" :
+				"encrypted");
+		err = -EOPNOTSUPP;
+		goto bad_out;
+	}
+	if (a->non_resident)
+		goto do_non_resident_truncate;
+	BUG_ON(NInoNonResident(ni));
+	/* Resize the attribute record to best fit the new attribute size. */
+	if (new_size < vol->mft_record_size &&
+			!ntfs_resident_attr_value_resize(m, a, new_size)) {
+		unsigned long flags;
+
+		/* The resize succeeded! */
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		write_lock_irqsave(&ni->size_lock, flags);
+		/* Update the sizes in the ntfs inode and all is done. */
+		ni->allocated_size = le32_to_cpu(a->length) -
+				le16_to_cpu(a->data.resident.value_offset);
+		/*
+		 * Note ntfs_resident_attr_value_resize() has already done any
+		 * necessary data clearing in the attribute record.  When the
+		 * file is being shrunk vmtruncate() will already have cleared
+		 * the top part of the last partial page, i.e. since this is
+		 * the resident case this is the page with index 0.  However,
+		 * when the file is being expanded, the page cache page data
+		 * between the old data_size, i.e. old_size, and the new_size
+		 * has not been zeroed.  Fortunately, we do not need to zero it
+		 * either since on one hand it will either already be zero due
+		 * to both readpage and writepage clearing partial page data
+		 * beyond i_size in which case there is nothing to do or in the
+		 * case of the file being mmap()ped at the same time, POSIX
+		 * specifies that the behaviour is unspecified thus we do not
+		 * have to do anything.  This means that in our implementation
+		 * in the rare case that the file is mmap()ped and a write
+		 * occured into the mmap()ped region just beyond the file size
+		 * and writepage has not yet been called to write out the page
+		 * (which would clear the area beyond the file size) and we now
+		 * extend the file size to incorporate this dirty region
+		 * outside the file size, a write of the page would result in
+		 * this data being written to disk instead of being cleared.
+		 * Given both POSIX and the Linux mmap(2) man page specify that
+		 * this corner case is undefined, we choose to leave it like
+		 * that as this is much simpler for us as we cannot lock the
+		 * relevant page now since we are holding too many ntfs locks
+		 * which would result in a lock reversal deadlock.
+		 */
+		ni->initialized_size = new_size;
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		goto unm_done;
+	}
+	/* If the above resize failed, this must be an attribute extension. */
+	BUG_ON(size_change < 0);
+	/*
+	 * We have to drop all the locks so we can call
+	 * ntfs_attr_make_non_resident().  This could be optimised by try-
+	 * locking the first page cache page and only if that fails dropping
+	 * the locks, locking the page, and redoing all the locking and
+	 * lookups.  While this would be a huge optimisation, it is not worth
+	 * it as this is definitely a slow code path as it only ever can happen
+	 * once for any given file.
+	 */
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+	/*
+	 * Not enough space in the mft record, try to make the attribute
+	 * non-resident and if successful restart the truncation process.
+	 */
+	err = ntfs_attr_make_non_resident(ni, old_size);
+	if (likely(!err))
+		goto retry_truncate;
+	/*
+	 * Could not make non-resident.  If this is due to this not being
+	 * permitted for this attribute type or there not being enough space,
+	 * try to make other attributes non-resident.  Otherwise fail.
+	 */
+	if (unlikely(err != -EPERM && err != -ENOSPC)) {
+		ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute "
+				"type 0x%x, because the conversion from "
+				"resident to non-resident attribute failed "
+				"with error code %i.", vi->i_ino,
+				(unsigned)le32_to_cpu(ni->type), err);
+		if (err != -ENOMEM)
+			err = -EIO;
+		goto conv_err_out;
+	}
+	/* TODO: Not implemented from here, abort. */
+	if (err == -ENOSPC)
+		ntfs_error(vol->sb, "Not enough space in the mft record/on "
+				"disk for the non-resident attribute value.  "
+				"This case is not implemented yet.");
+	else /* if (err == -EPERM) */
+		ntfs_error(vol->sb, "This attribute type may not be "
+				"non-resident.  This case is not implemented "
+				"yet.");
+	err = -EOPNOTSUPP;
+	goto conv_err_out;
+#if 0
+	// TODO: Attempt to make other attributes non-resident.
+	if (!err)
+		goto do_resident_extend;
+	/*
+	 * Both the attribute list attribute and the standard information
+	 * attribute must remain in the base inode.  Thus, if this is one of
+	 * these attributes, we have to try to move other attributes out into
+	 * extent mft records instead.
+	 */
+	if (ni->type == AT_ATTRIBUTE_LIST ||
+			ni->type == AT_STANDARD_INFORMATION) {
+		// TODO: Attempt to move other attributes into extent mft
+		// records.
+		err = -EOPNOTSUPP;
+		if (!err)
+			goto do_resident_extend;
 		goto err_out;
 	}
-	a = ctx->attr;
-	/* If the size has not changed there is nothing to do. */
-	if (ntfs_attr_size(a) == i_size_read(vi))
-		goto done;
-	// TODO: Implement the truncate...
-	ntfs_error(vi->i_sb, "Inode size has changed but this is not "
-			"implemented yet.  Resetting inode size to old value. "
-			" This is most likely a bug in the ntfs driver!");
-	i_size_write(vi, ntfs_attr_size(a)); 
-done:
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(ni);
-	NInoClearTruncateFailed(ni);
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	if (err != -ENOMEM) {
-		NVolSetErrors(vol);
-		make_bad_inode(vi);
+	// TODO: Attempt to move this attribute to an extent mft record, but
+	// only if it is not already the only attribute in an mft record in
+	// which case there would be nothing to gain.
+	err = -EOPNOTSUPP;
+	if (!err)
+		goto do_resident_extend;
+	/* There is nothing we can do to make enough space. )-: */
+	goto err_out;
+#endif
+do_non_resident_truncate:
+	BUG_ON(!NInoNonResident(ni));
+	if (alloc_change < 0) {
+		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+		if (highest_vcn > 0 &&
+				old_alloc_size >> vol->cluster_size_bits >
+				highest_vcn + 1) {
+			/*
+			 * This attribute has multiple extents.  Not yet
+			 * supported.
+			 */
+			ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, "
+					"attribute type 0x%x, because the "
+					"attribute is highly fragmented (it "
+					"consists of multiple extents) and "
+					"this case is not implemented yet.",
+					vi->i_ino,
+					(unsigned)le32_to_cpu(ni->type));
+			err = -EOPNOTSUPP;
+			goto bad_out;
+		}
 	}
+	/*
+	 * If the size is shrinking, need to reduce the initialized_size and
+	 * the data_size before reducing the allocation.
+	 */
+	if (size_change < 0) {
+		/*
+		 * Make the valid size smaller (i_size is already up-to-date).
+		 */
+		write_lock_irqsave(&ni->size_lock, flags);
+		if (new_size < ni->initialized_size) {
+			ni->initialized_size = new_size;
+			a->data.non_resident.initialized_size =
+					cpu_to_sle64(new_size);
+		}
+		a->data.non_resident.data_size = cpu_to_sle64(new_size);
+		write_unlock_irqrestore(&ni->size_lock, flags);
+		flush_dcache_mft_record_page(ctx->ntfs_ino);
+		mark_mft_record_dirty(ctx->ntfs_ino);
+		/* If the allocated size is not changing, we are done. */
+		if (!alloc_change)
+			goto unm_done;
+		/*
+		 * If the size is shrinking it makes no sense for the
+		 * allocation to be growing.
+		 */
+		BUG_ON(alloc_change > 0);
+	} else /* if (size_change >= 0) */ {
+		/*
+		 * The file size is growing or staying the same but the
+		 * allocation can be shrinking, growing or staying the same.
+		 */
+		if (alloc_change > 0) {
+			/*
+			 * We need to extend the allocation and possibly update
+			 * the data size.  If we are updating the data size,
+			 * since we are not touching the initialized_size we do
+			 * not need to worry about the actual data on disk.
+			 * And as far as the page cache is concerned, there
+			 * will be no pages beyond the old data size and any
+			 * partial region in the last page between the old and
+			 * new data size (or the end of the page if the new
+			 * data size is outside the page) does not need to be
+			 * modified as explained above for the resident
+			 * attribute truncate case.  To do this, we simply drop
+			 * the locks we hold and leave all the work to our
+			 * friendly helper ntfs_attr_extend_allocation().
+			 */
+			ntfs_attr_put_search_ctx(ctx);
+			unmap_mft_record(base_ni);
+			up_write(&ni->runlist.lock);
+			err = ntfs_attr_extend_allocation(ni, new_size,
+					size_change > 0 ? new_size : -1, -1);
+			/*
+			 * ntfs_attr_extend_allocation() will have done error
+			 * output already.
+			 */
+			goto done;
+		}
+		if (!alloc_change)
+			goto alloc_done;
+	}
+	/* alloc_change < 0 */
+	/* Free the clusters. */
+	nr_freed = ntfs_cluster_free(ni, new_alloc_size >>
+			vol->cluster_size_bits, -1, ctx);
+	m = ctx->mrec;
+	a = ctx->attr;
+	if (unlikely(nr_freed < 0)) {
+		ntfs_error(vol->sb, "Failed to release cluster(s) (error code "
+				"%lli).  Unmount and run chkdsk to recover "
+				"the lost cluster(s).", (long long)nr_freed);
+		NVolSetErrors(vol);
+		nr_freed = 0;
+	}
+	/* Truncate the runlist. */
+	err = ntfs_rl_truncate_nolock(vol, &ni->runlist,
+			new_alloc_size >> vol->cluster_size_bits);
+	/*
+	 * If the runlist truncation failed and/or the search context is no
+	 * longer valid, we cannot resize the attribute record or build the
+	 * mapping pairs array thus we mark the inode bad so that no access to
+	 * the freed clusters can happen.
+	 */
+	if (unlikely(err || IS_ERR(m))) {
+		ntfs_error(vol->sb, "Failed to %s (error code %li).%s",
+				IS_ERR(m) ?
+				"restore attribute search context" :
+				"truncate attribute runlist",
+				IS_ERR(m) ? PTR_ERR(m) : err, es);
+		err = -EIO;
+		goto bad_out;
+	}
+	/* Get the size for the shrunk mapping pairs array for the runlist. */
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
+	if (unlikely(mp_size <= 0)) {
+		ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
+				"attribute type 0x%x, because determining the "
+				"size for the mapping pairs failed with error "
+				"code %i.%s", vi->i_ino,
+				(unsigned)le32_to_cpu(ni->type), mp_size, es);
+		err = -EIO;
+		goto bad_out;
+	}
+	/*
+	 * Shrink the attribute record for the new mapping pairs array.  Note,
+	 * this cannot fail since we are making the attribute smaller thus by
+	 * definition there is enough space to do so.
+	 */
+	attr_len = le32_to_cpu(a->length);
+	err = ntfs_attr_record_resize(m, a, mp_size +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+	BUG_ON(err);
+	/*
+	 * Generate the mapping pairs array directly into the attribute record.
+	 */
+	err = ntfs_mapping_pairs_build(vol, (u8*)a +
+			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+			mp_size, ni->runlist.rl, 0, -1, NULL);
+	if (unlikely(err)) {
+		ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
+				"attribute type 0x%x, because building the "
+				"mapping pairs failed with error code %i.%s",
+				vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+				err, es);
+		err = -EIO;
+		goto bad_out;
+	}
+	/* Update the allocated/compressed size as well as the highest vcn. */
+	a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
+			vol->cluster_size_bits) - 1);
+	write_lock_irqsave(&ni->size_lock, flags);
+	ni->allocated_size = new_alloc_size;
+	a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
+	if (NInoSparse(ni) || NInoCompressed(ni)) {
+		if (nr_freed) {
+			ni->itype.compressed.size -= nr_freed <<
+					vol->cluster_size_bits;
+			BUG_ON(ni->itype.compressed.size < 0);
+			a->data.non_resident.compressed_size = cpu_to_sle64(
+					ni->itype.compressed.size);
+			vi->i_blocks = ni->itype.compressed.size >> 9;
+		}
+	} else
+		vi->i_blocks = new_alloc_size >> 9;
+	write_unlock_irqrestore(&ni->size_lock, flags);
+	/*
+	 * We have shrunk the allocation.  If this is a shrinking truncate we
+	 * have already dealt with the initialized_size and the data_size above
+	 * and we are done.  If the truncate is only changing the allocation
+	 * and not the data_size, we are also done.  If this is an extending
+	 * truncate, need to extend the data_size now which is ensured by the
+	 * fact that @size_change is positive.
+	 */
+alloc_done:
+	/*
+	 * If the size is growing, need to update it now.  If it is shrinking,
+	 * we have already updated it above (before the allocation change).
+	 */
+	if (size_change > 0)
+		a->data.non_resident.data_size = cpu_to_sle64(new_size);
+	/* Ensure the modified mft record is written out. */
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	mark_mft_record_dirty(ctx->ntfs_ino);
+unm_done:
+	ntfs_attr_put_search_ctx(ctx);
+	unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+done:
+	/* Update the mtime and ctime on the base inode. */
+	inode_update_time(VFS_I(base_ni), 1);
+	if (likely(!err)) {
+		NInoClearTruncateFailed(ni);
+		ntfs_debug("Done.");
+	}
+	return err;
+old_bad_out:
+	old_size = -1;
+bad_out:
+	if (err != -ENOMEM && err != -EOPNOTSUPP) {
+		make_bad_inode(vi);
+		make_bad_inode(VFS_I(base_ni));
+		NVolSetErrors(vol);
+	}
+	if (err != -EOPNOTSUPP)
+		NInoSetTruncateFailed(ni);
+	else if (old_size >= 0)
+		i_size_write(vi, old_size);
+err_out:
 	if (ctx)
 		ntfs_attr_put_search_ctx(ctx);
 	if (m)
-		unmap_mft_record(ni);
-	NInoSetTruncateFailed(ni);
+		unmap_mft_record(base_ni);
+	up_write(&ni->runlist.lock);
+out:
+	ntfs_debug("Failed.  Returning error code %i.", err);
 	return err;
+conv_err_out:
+	if (err != -ENOMEM && err != -EOPNOTSUPP) {
+		make_bad_inode(vi);
+		make_bad_inode(VFS_I(base_ni));
+		NVolSetErrors(vol);
+	}
+	if (err != -EOPNOTSUPP)
+		NInoSetTruncateFailed(ni);
+	else
+		i_size_write(vi, old_size);
+	goto out;
 }
 
 /**