Skip to content

Commit

Permalink
BRT: Fix FICLONE/FICLONERANGE shortened copy
Browse files Browse the repository at this point in the history
On Linux the ioctl_ficlonerange() and ioctl_ficlone() system calls
are expected to either fully clone the specified range or return an
error.  The range may be for an entire file.  While internally ZFS
supports cloning partial ranges there's no way to return the length
cloned to the caller so we need to make this all or nothing.

As part of this change support for the REMAP_FILE_CAN_SHORTEN flag
has been added.  When REMAP_FILE_CAN_SHORTEN is set zfs_clone_range()
will return a shortened range when encountering pending dirty records.
When it's clear zfs_clone_range() will block and wait for the records
to be written out allowing the blocks to be cloned.

Furthermore, the file rangelock is held over the region being cloned
to prevent it from being modified while cloning.  This doesn't quite
provide an atomic semantics since if an error is encountered only a
portion of the range may be cloned.  This will be converted to an
error if REMAP_FILE_CAN_SHORTEN was not provided and returned to the
caller.  However, the destination file range is left in an undefined
state.

A test case has been added which exercises this functionality by
verifying that `cp --reflink=never|auto|always` works correctly.

Signed-off-by: Brian D Behlendorf <[email protected]>
Issue openzfs#15728
  • Loading branch information
behlendorf committed Feb 1, 2024
1 parent 2e6b3c4 commit 45d016a
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 29 deletions.
2 changes: 1 addition & 1 deletion include/sys/zfs_vnops.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
extern int zfs_holey(znode_t *, ulong_t, loff_t *);
extern int zfs_access(znode_t *, int, int, cred_t *);
extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *,
uint64_t *, cred_t *);
uint64_t *, cred_t *, boolean_t wait_sync);
extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t,
const blkptr_t *, size_t);

Expand Down
2 changes: 1 addition & 1 deletion module/os/freebsd/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -6304,7 +6304,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
goto out_locked;

error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
ap->a_outoffp, &len, ap->a_outcred);
ap->a_outoffp, &len, ap->a_outcred, B_FALSE);
if (error == EXDEV || error == EAGAIN || error == EINVAL ||
error == EOPNOTSUPP)
goto bad_locked_fallback;
Expand Down
49 changes: 27 additions & 22 deletions module/os/linux/zfs/zpl_file_range.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,15 @@ int zfs_bclone_enabled = 1;
*
* Note that we are not required to update file offsets; the kernel will take
* care of that depending on how it was called.
*
* When the caller cannot handle a shortened range request wait_sync should
* be set. This will request that zfs_clone_range() wait on transaction groups
* as needed to allow the clone to succeed. This handles the case where a file
* is modified and immediately cloned with FICLONE or FICLONERANGE.
*/
static ssize_t
__zpl_clone_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, size_t len)
zpl_clone_file_range_impl(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, size_t len, boolean_t wait_sync)
{
struct inode *src_i = file_inode(src_file);
struct inode *dst_i = file_inode(dst_file);
Expand All @@ -67,7 +72,9 @@ __zpl_clone_file_range(struct file *src_file, loff_t src_off,
cookie = spl_fstrans_mark();

err = -zfs_clone_range(ITOZ(src_i), &src_off_o, ITOZ(dst_i),
&dst_off_o, &len_o, cr);
&dst_off_o, &len_o, cr, wait_sync);
if (wait_sync && err == 0 && len != len_o)
err = -EINVAL;

spl_fstrans_unmark(cookie);
crfree(cr);
Expand Down Expand Up @@ -96,12 +103,13 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
{
ssize_t ret;

/* Flags is reserved for future extensions and must be zero. */
if (flags != 0)
return (-EINVAL);

/* Try to do it via zfs_clone_range() */
ret = __zpl_clone_file_range(src_file, src_off,
dst_file, dst_off, len);
/* Try to do it via zfs_clone_range() and allow shortening. */
ret = zpl_clone_file_range_impl(src_file, src_off,
dst_file, dst_off, len, B_FALSE);

#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
/*
Expand Down Expand Up @@ -137,6 +145,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
* FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
* range in both files and if they're the same, arrange for them to be backed
* by the same storage.
*
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range
* if we want. Its designed for filesystems that may need to shorten the length
* for alignment, EOF, or any other requirement. ZFS may shorten the request
* when there is outstanding dirty data which hasn't been written to disk.
*/
loff_t
zpl_remap_file_range(struct file *src_file, loff_t src_off,
Expand All @@ -145,24 +158,16 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off,
if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
return (-EINVAL);

/*
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
* range if we want. Its designed for filesystems that make data past
* EOF available, and don't want it to be visible in both files. ZFS
* doesn't do that, so we just turn the flag off.
*/
flags &= ~REMAP_FILE_CAN_SHORTEN;

/* No support for dedup yet */
if (flags & REMAP_FILE_DEDUP)
/* No support for dedup yet */
return (-EOPNOTSUPP);

/* Zero length means to clone everything to the end of the file */
if (len == 0)
len = i_size_read(file_inode(src_file)) - src_off;

return (__zpl_clone_file_range(src_file, src_off,
dst_file, dst_off, len));
return (zpl_clone_file_range_impl(src_file, src_off,
dst_file, dst_off, len, !(REMAP_FILE_CAN_SHORTEN & flags)));
}
#endif /* HAVE_VFS_REMAP_FILE_RANGE */

Expand All @@ -179,8 +184,8 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off,
if (len == 0)
len = i_size_read(file_inode(src_file)) - src_off;

return (__zpl_clone_file_range(src_file, src_off,
dst_file, dst_off, len));
return (zpl_clone_file_range_impl(src_file, src_off,
dst_file, dst_off, len, B_TRUE));
}
#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */

Expand Down Expand Up @@ -215,7 +220,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg)
size_t len = i_size_read(file_inode(src_file));

ssize_t ret =
__zpl_clone_file_range(src_file, 0, dst_file, 0, len);
zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len, B_TRUE);

fput(src_file);

Expand Down Expand Up @@ -253,8 +258,8 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
if (len == 0)
len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;

ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset,
dst_file, fcr.fcr_dest_offset, len);
ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
dst_file, fcr.fcr_dest_offset, len, B_TRUE);

fput(src_file);

Expand Down
16 changes: 12 additions & 4 deletions module/zfs/zfs_vnops.c
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,7 @@ zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
*/
int
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
uint64_t *outoffp, uint64_t *lenp, cred_t *cr, boolean_t wait_sync)
{
zfsvfs_t *inzfsvfs, *outzfsvfs;
objset_t *inos, *outos;
Expand Down Expand Up @@ -1292,10 +1292,18 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
if (error != 0) {
/*
* If we are trying to clone a block that was created
* in the current transaction group, error will be
* EAGAIN here, which we can just return to the caller
* so it can fallback if it likes.
* in the current transaction group, the error will be
* EAGAIN here. Based on sync_wait either return a
* shortened range to the caller so it can fallback,
* or wait for the next sync and check again.
*/
if (error == EAGAIN && wait_sync) {
int64_t txg = spa_last_synced_txg(
dmu_objset_spa(inos)) + 1;
txg_wait_synced(dmu_objset_pool(inos), txg);
continue;
}

break;
}

Expand Down
2 changes: 1 addition & 1 deletion tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
tags = ['functional', 'compression']

[tests/functional/cp_files]
tests = ['cp_files_001_pos', 'cp_stress']
tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
tags = ['functional', 'cp_files']

[tests/functional/crtime]
Expand Down
1 change: 1 addition & 0 deletions tests/test-runner/bin/zts-report.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ if sys.platform.startswith('freebsd'):
'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
'cp_files/cp_files_002_pos': ['SKIP', na_reason],
'link_count/link_count_001': ['SKIP', na_reason],
'casenorm/mixed_create_failure': ['FAIL', 13215],
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1394,6 +1394,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/compression/setup.ksh \
functional/cp_files/cleanup.ksh \
functional/cp_files/cp_files_001_pos.ksh \
functional/cp_files/cp_files_002_pos.ksh \
functional/cp_files/cp_stress.ksh \
functional/cp_files/setup.ksh \
functional/crtime/cleanup.ksh \
Expand Down
98 changes: 98 additions & 0 deletions tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#! /bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
#

. $STF_SUITE/include/libtest.shlib

#
# DESCRIPTION:
# Verify all cp --reflink modes work with modified file.
#
# STRATEGY:
# 1. Verify "cp --reflink=never|auto|always" all correctly copy a
# freshly modified file. Depending on the setting it may be
# cloned or copied but it must always be fully intact and never
# truncated.
#

verify_runnable "global"

if ! is_linux; then
log_unsupported "cp --reflink is a GNU coreutils option"
fi

function cleanup
{
rm -rf $TESTDIR/cp-reflink
}

function verify_copy
{
src_cksum=$(sha256digest $1)
dst_cksum=$(sha256digest $2)

if [[ "$src_cksum" != "$dst_cksum" ]]; then
log_must ls -l $TESTDIR/cp
log_fail "checksum mismatch ($src_cksum != $dst_cksum)"
fi
}

log_assert "Verify all cp --reflink modes work with modified file"

log_onexit cleanup

RECORDSIZE=$(zfs get -Hp -o value recordsize $TESTPOOL/$TESTFS)
SRC_FILE=src.data
DST_FILE=dst.data
SRC_SIZE=$(($RANDOM % 2048))

log_must mkdir $TESTDIR/cp-reflink
log_must cd $TESTDIR/cp-reflink

for mode in "never" "auto" "always"; do
log_note "Checking 'cp --reflink=$mode'"

# Create a new file and immediately copy it.
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
verify_copy $SRC_FILE $DST_FILE
rm -f $DST_FILE

# Append to an existing file and immediately copy it.
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
count=1 conv=notrunc
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
verify_copy $SRC_FILE $DST_FILE
rm -f $DST_FILE

# Overwrite a random range of an existing file and immediately copy it.
log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
verify_copy $SRC_FILE $DST_FILE
log_must rm -f $SRC_FILE $DST_FILE
done

log_pass

0 comments on commit 45d016a

Please sign in to comment.