Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BRT: Fix FICLONE/FICLONERANGE shortened copy #15842

Merged
merged 2 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion include/os/freebsd/zfs/sys/zfs_vfsops_os.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,6 @@ typedef struct zfid_long {
#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))

extern int zfs_super_owner;
extern int zfs_bclone_enabled;

extern void zfs_init(void);
extern void zfs_fini(void);
Expand Down
2 changes: 0 additions & 2 deletions include/os/linux/zfs/sys/zfs_vfsops_os.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ extern "C" {
typedef struct zfsvfs zfsvfs_t;
struct znode;

extern int zfs_bclone_enabled;

/*
* This structure emulates the vfs_t from other platforms. Its purpose
* is to facilitate the handling of mount options and minimize structural
Expand Down
3 changes: 3 additions & 0 deletions include/sys/zfs_vnops.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@

#ifndef _SYS_FS_ZFS_VNOPS_H
#define _SYS_FS_ZFS_VNOPS_H

#include <sys/zfs_vnops_os.h>

extern int zfs_bclone_enabled;

extern int zfs_fsync(znode_t *, int, cred_t *);
extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
Expand Down
9 changes: 9 additions & 0 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -1159,6 +1159,15 @@ Enable the experimental block cloning feature.
If this setting is 0, then even if feature@block_cloning is enabled,
attempts to clone blocks will act as though the feature is disabled.
.
.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
written to disk.
This allows the clone operation to reliably succeed when a file is
modified and then immediately cloned.
For small files this may be slower than making a copy of the file.
Therefore, this setting defaults to 0, which causes a clone operation to
immediately fail when encountering a dirty block.
.
.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
Select a BLAKE3 implementation.
.Pp
Expand Down
4 changes: 0 additions & 4 deletions module/os/freebsd/zfs/zfs_vfsops.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,6 @@ int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
"Debug level");

int zfs_bclone_enabled = 1;
SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN,
&zfs_bclone_enabled, 0, "Enable block cloning");

struct zfs_jailparam {
int mount_snapshot;
};
Expand Down
5 changes: 0 additions & 5 deletions module/os/linux/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -4255,9 +4255,4 @@ EXPORT_SYMBOL(zfs_map);
/* CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");

/* CSTYLED */
module_param(zfs_bclone_enabled, uint, 0644);
MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning");

#endif
48 changes: 27 additions & 21 deletions module/os/linux/zfs/zpl_file_range.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,14 @@
#include <sys/zfs_vnops.h>
#include <sys/zfeature.h>

int zfs_bclone_enabled = 1;

/*
* Clone part of a file via block cloning.
*
* Note that we are not required to update file offsets; the kernel will take
* care of that depending on how it was called.
*/
static ssize_t
__zpl_clone_file_range(struct file *src_file, loff_t src_off,
zpl_clone_file_range_impl(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, size_t len)
{
struct inode *src_i = file_inode(src_file);
Expand Down Expand Up @@ -96,11 +94,12 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
{
ssize_t ret;

/* Flags is reserved for future extensions and must be zero. */
if (flags != 0)
return (-EINVAL);

/* Try to do it via zfs_clone_range() */
ret = __zpl_clone_file_range(src_file, src_off,
/* Try to do it via zfs_clone_range() and allow shortening. */
ret = zpl_clone_file_range_impl(src_file, src_off,
dst_file, dst_off, len);

#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
Expand Down Expand Up @@ -137,6 +136,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
* FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
* range in both files and if they're the same, arrange for them to be backed
* by the same storage.
*
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range
* if we want. It's designed for filesystems that may need to shorten the
* length for alignment, EOF, or any other requirement. ZFS may shorten the
* request when there is outstanding dirty data which hasn't been written.
*/
loff_t
zpl_remap_file_range(struct file *src_file, loff_t src_off,
Expand All @@ -145,24 +149,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off,
if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
return (-EINVAL);

/*
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
* range if we want. Its designed for filesystems that make data past
* EOF available, and don't want it to be visible in both files. ZFS
* doesn't do that, so we just turn the flag off.
*/
flags &= ~REMAP_FILE_CAN_SHORTEN;

/* No support for dedup yet */
if (flags & REMAP_FILE_DEDUP)
/* No support for dedup yet */
return (-EOPNOTSUPP);

/* Zero length means to clone everything to the end of the file */
if (len == 0)
len = i_size_read(file_inode(src_file)) - src_off;

return (__zpl_clone_file_range(src_file, src_off,
dst_file, dst_off, len));
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
dst_file, dst_off, len);

if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len)
ret = -EINVAL;

return (ret);
}
#endif /* HAVE_VFS_REMAP_FILE_RANGE */

Expand All @@ -179,8 +180,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off,
if (len == 0)
len = i_size_read(file_inode(src_file)) - src_off;

return (__zpl_clone_file_range(src_file, src_off,
dst_file, dst_off, len));
/* The entire length must be cloned or this is an error. */
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
dst_file, dst_off, len);

if (ret >= 0 && ret != len)
ret = -EINVAL;

return (ret);
}
#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */

Expand Down Expand Up @@ -214,8 +221,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg)

size_t len = i_size_read(file_inode(src_file));

ssize_t ret =
__zpl_clone_file_range(src_file, 0, dst_file, 0, len);
ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len);

fput(src_file);

Expand Down Expand Up @@ -253,7 +259,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
if (len == 0)
len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;

ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset,
ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
dst_file, fcr.fcr_dest_offset, len);

fput(src_file);
Expand Down
43 changes: 38 additions & 5 deletions module/zfs/zfs_vnops.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,26 @@
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>

/*
* Enable the experimental block cloning feature. If this setting is 0, then
* even if feature@block_cloning is enabled, attempts to clone blocks will act
* as though the feature is disabled.
*/
int zfs_bclone_enabled = 1;

/*
* When set, zfs_clone_range() waits for dirty data to be written to disk.
* This allows the clone operation to reliably succeed when a file is modified
* and then immediately cloned. For small files this may be slower than making
* a copy of the file and is therefore not the default. However, in certain
* scenarios this behavior may be desirable so a tunable is provided.
*/
static int zfs_bclone_wait_dirty = 0;

/*
* Maximum bytes to read per chunk in zfs_read().
*/
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
behlendorf marked this conversation as resolved.
Show resolved Hide resolved

int
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
Expand Down Expand Up @@ -182,8 +202,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
return (error);
}

static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */

/*
* Read bytes from specified file into supplied buffer.
*
Expand Down Expand Up @@ -1049,6 +1067,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
size_t maxblocks, nbps;
uint_t inblksz;
uint64_t clear_setid_bits_txg = 0;
uint64_t last_synced_txg = 0;

inoff = *inoffp;
outoff = *outoffp;
Expand Down Expand Up @@ -1287,15 +1306,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
}

nbps = maxblocks;
last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
&nbps);
if (error != 0) {
/*
* If we are trying to clone a block that was created
* in the current transaction group, error will be
* EAGAIN here, which we can just return to the caller
* so it can fallback if it likes.
* in the current transaction group, the error will be
* EAGAIN here. Based on zfs_bclone_wait_dirty, either
* return a shortened range to the caller so it can
* fall back, or wait for the next TXG and check again.
*/
if (error == EAGAIN && zfs_bclone_wait_dirty) {
txg_wait_synced(dmu_objset_pool(inos),
last_synced_txg + 1);
continue;
}

break;
}

Expand Down Expand Up @@ -1517,3 +1544,9 @@ EXPORT_SYMBOL(zfs_clone_range_replay);

ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
"Bytes to read per chunk");

ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
"Enable block cloning");

ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
"Wait for dirty blocks when cloning");
2 changes: 1 addition & 1 deletion tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
tags = ['functional', 'compression']

[tests/functional/cp_files]
tests = ['cp_files_001_pos', 'cp_stress']
tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
tags = ['functional', 'cp_files']

[tests/functional/crtime]
Expand Down
2 changes: 2 additions & 0 deletions tests/test-runner/bin/zts-report.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ if sys.platform.startswith('freebsd'):
'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
'cp_files/cp_files_002_pos': ['SKIP', na_reason],
'link_count/link_count_001': ['SKIP', na_reason],
'casenorm/mixed_create_failure': ['FAIL', 13215],
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
Expand Down Expand Up @@ -312,6 +313,7 @@ elif sys.platform.startswith('linux'):
['SKIP', cfr_reason],
'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
'cp_files/cp_files_002_pos': ['SKIP', cfr_reason],
'fault/auto_online_002_pos': ['FAIL', 11889],
'fault/auto_replace_001_pos': ['FAIL', 14851],
'fault/auto_spare_002_pos': ['FAIL', 11889],
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/include/tunables.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ VOL_MODE vol.mode zvol_volmode
VOL_RECURSIVE vol.recursive UNSUPPORTED
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled
BCLONE_WAIT_DIRTY zfs_bclone_wait_dirty zfs_bclone_wait_dirty
XATTR_COMPAT xattr_compat zfs_xattr_compat
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1394,6 +1394,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/compression/setup.ksh \
functional/cp_files/cleanup.ksh \
functional/cp_files/cp_files_001_pos.ksh \
functional/cp_files/cp_files_002_pos.ksh \
functional/cp_files/cp_stress.ksh \
functional/cp_files/setup.ksh \
functional/crtime/cleanup.ksh \
Expand Down
Loading
Loading