Skip to content

Commit

Permalink
Speed up WB_SYNC_NONE when a WB_SYNC_ALL occurs simultaneously
Browse files Browse the repository at this point in the history
Page writebacks with WB_SYNC_NONE can take several seconds to complete
since they wait for the transaction group to close before being
committed. This is usually not a problem since the caller does not
need to wait. However, if we're simultaneously doing a writeback
with WB_SYNC_ALL (e.g via msync), the latter can block for several
seconds (up to zfs_txg_timeout) due to the active WB_SYNC_NONE
writeback since it needs to wait for the transaction to complete
and the PG_writeback bit to be cleared.

This commit deals with 2 cases:

- No page writeback is active. A WB_SYNC_ALL page writeback starts
  and even completes. But when it's about to check if the PG_writeback
  bit has been cleared, another writeback with WB_SYNC_NONE starts.
  The sync page writeback ends up waiting for the non-sync page
  writeback to complete.

- A page writeback with WB_SYNC_NONE is already active when a
  WB_SYNC_ALL writeback starts. The WB_SYNC_ALL writeback ends up
  waiting for the WB_SYNC_NONE writeback.

The fix works by carefully keeping track of active sync/non-sync
writebacks and committing when beneficial.

Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Shaan Nobee <[email protected]>
Closes openzfs#12662
Closes openzfs#12790
  • Loading branch information
shaan1337 authored and andrewc12 committed Sep 23, 2022
1 parent 90cb41d commit 13a53a6
Show file tree
Hide file tree
Showing 15 changed files with 347 additions and 18 deletions.
14 changes: 10 additions & 4 deletions include/os/linux/zfs/sys/trace_acl.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
__field(uint64_t, z_size)
__field(uint64_t, z_pflags)
__field(uint32_t, z_sync_cnt)
__field(uint32_t, z_sync_writes_cnt)
__field(uint32_t, z_async_writes_cnt)
__field(mode_t, z_mode)
__field(boolean_t, z_is_sa)
__field(boolean_t, z_is_mapped)
Expand Down Expand Up @@ -91,6 +93,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
__entry->z_size = zn->z_size;
__entry->z_pflags = zn->z_pflags;
__entry->z_sync_cnt = zn->z_sync_cnt;
__entry->z_sync_writes_cnt = zn->z_sync_writes_cnt;
__entry->z_async_writes_cnt = zn->z_async_writes_cnt;
__entry->z_mode = zn->z_mode;
__entry->z_is_sa = zn->z_is_sa;
__entry->z_is_mapped = zn->z_is_mapped;
Expand All @@ -116,16 +120,18 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
TP_printk("zn { id %llu unlinked %u atime_dirty %u "
"zn_prefetch %u blksz %u seq %u "
"mapcnt %llu size %llu pflags %llu "
"sync_cnt %u mode 0x%x is_sa %d "
"is_mapped %d is_ctldir %d is_stale %d inode { "
"sync_cnt %u sync_writes_cnt %u async_writes_cnt %u "
"mode 0x%x is_sa %d is_mapped %d "
"is_ctldir %d is_stale %d inode { "
"uid %u gid %u ino %lu nlink %u size %lli "
"blkbits %u bytes %u mode 0x%x generation %x } } "
"ace { type %u flags %u access_mask %u } mask_matched %u",
__entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty,
__entry->z_zn_prefetch, __entry->z_blksz,
__entry->z_seq, __entry->z_mapcnt, __entry->z_size,
__entry->z_pflags, __entry->z_sync_cnt, __entry->z_mode,
__entry->z_is_sa, __entry->z_is_mapped,
__entry->z_pflags, __entry->z_sync_cnt,
__entry->z_sync_writes_cnt, __entry->z_async_writes_cnt,
__entry->z_mode, __entry->z_is_sa, __entry->z_is_mapped,
__entry->z_is_ctldir, __entry->z_is_stale, __entry->i_uid,
__entry->i_gid, __entry->i_ino, __entry->i_nlink,
__entry->i_size, __entry->i_blkbits,
Expand Down
2 changes: 1 addition & 1 deletion include/os/linux/zfs/sys/zfs_vnops_os.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
extern int zfs_fid(struct inode *ip, fid_t *fidp);
extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages);
extern int zfs_putpage(struct inode *ip, struct page *pp,
struct writeback_control *wbc);
struct writeback_control *wbc, boolean_t for_sync);
extern int zfs_dirty_inode(struct inode *ip, int flags);
extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp,
size_t len, unsigned long vm_flags);
Expand Down
2 changes: 2 additions & 0 deletions include/sys/zfs_znode.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ typedef struct znode {
uint64_t z_size; /* file size (cached) */
uint64_t z_pflags; /* pflags (cached) */
uint32_t z_sync_cnt; /* synchronous open count */
uint32_t z_sync_writes_cnt; /* synchronous write count */
uint32_t z_async_writes_cnt; /* asynchronous write count */
mode_t z_mode; /* mode (cached) */
kmutex_t z_acl_lock; /* acl data lock */
zfs_acl_t *z_acl_cached; /* cached acl */
Expand Down
8 changes: 8 additions & 0 deletions module/os/freebsd/zfs/zfs_znode.c
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_xattr_cached = NULL;
zp->z_xattr_parent = 0;
zp->z_vnode = NULL;
zp->z_sync_writes_cnt = 0;
zp->z_async_writes_cnt = 0;

return (0);
}

Expand All @@ -172,6 +175,9 @@ zfs_znode_cache_destructor(void *buf, void *arg)

ASSERT3P(zp->z_acl_cached, ==, NULL);
ASSERT3P(zp->z_xattr_cached, ==, NULL);

ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
}


Expand Down Expand Up @@ -453,6 +459,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
zp->z_sync_writes_cnt = 0;
zp->z_async_writes_cnt = 0;
#if __FreeBSD_version >= 1300139
atomic_store_ptr(&zp->z_cached_symlink, NULL);
#endif
Expand Down
2 changes: 2 additions & 0 deletions module/os/linux/zfs/zfs_ctldir.c
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
zp->z_pflags = 0;
zp->z_mode = 0;
zp->z_sync_cnt = 0;
zp->z_sync_writes_cnt = 0;
zp->z_async_writes_cnt = 0;
ip->i_generation = 0;
ip->i_ino = id;
ip->i_mode = (S_IFDIR | S_IRWXUGO);
Expand Down
52 changes: 46 additions & 6 deletions module/os/linux/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -3396,21 +3396,34 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
}

static void
zfs_putpage_commit_cb(void *arg)
zfs_putpage_sync_commit_cb(void *arg)
{
struct page *pp = arg;

ClearPageError(pp);
end_page_writeback(pp);
}

static void
zfs_putpage_async_commit_cb(void *arg)
{
struct page *pp = arg;
znode_t *zp = ITOZ(pp->mapping->host);

ClearPageError(pp);
end_page_writeback(pp);
atomic_dec_32(&zp->z_async_writes_cnt);
}

/*
* Push a page out to disk, once the page is on stable storage the
* registered commit callback will be run as notification of completion.
*
* IN: ip - page mapped for inode.
* pp - page to push (page is locked)
* wbc - writeback control data
* IN: ip - page mapped for inode.
* pp - page to push (page is locked)
* wbc - writeback control data
* for_sync - does the caller intend to wait synchronously for the
* page writeback to complete?
*
* RETURN: 0 if success
* error code if failure
Expand All @@ -3419,7 +3432,8 @@ zfs_putpage_commit_cb(void *arg)
* ip - ctime|mtime updated
*/
int
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
boolean_t for_sync)
{
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
Expand Down Expand Up @@ -3517,6 +3531,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
zfs_rangelock_exit(lr);

if (wbc->sync_mode != WB_SYNC_NONE) {
/*
* Speed up any non-sync page writebacks since
* they may take several seconds to complete.
* Refer to the comment in zpl_fsync() (when
* HAVE_FSYNC_RANGE is defined) for details.
*/
if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
zil_commit(zfsvfs->z_log, zp->z_id);
}

if (PageWriteback(pp))
#ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
folio_wait_bit(page_folio(pp), PG_writeback);
Expand All @@ -3542,6 +3566,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
* was in fact not skipped and should not be counted as if it were.
*/
wbc->pages_skipped--;
if (!for_sync)
atomic_inc_32(&zp->z_async_writes_cnt);
set_page_writeback(pp);
unlock_page(pp);

Expand All @@ -3563,6 +3589,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
#endif
ClearPageError(pp);
end_page_writeback(pp);
if (!for_sync)
atomic_dec_32(&zp->z_async_writes_cnt);
zfs_rangelock_exit(lr);
ZFS_EXIT(zfsvfs);
return (err);
Expand All @@ -3587,7 +3615,9 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);

zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
zfs_putpage_commit_cb, pp);
for_sync ? zfs_putpage_sync_commit_cb :
zfs_putpage_async_commit_cb, pp);

dmu_tx_commit(tx);

zfs_rangelock_exit(lr);
Expand All @@ -3599,6 +3629,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
* performance reasons.
*/
zil_commit(zfsvfs->z_log, zp->z_id);
} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
/*
* If the caller does not intend to wait synchronously
* for this page writeback to complete and there are active
* synchronous calls on this file, do a commit so that
* the latter don't accidentally end up waiting for
* our writeback to complete. Refer to the comment in
* zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
*/
zil_commit(zfsvfs->z_log, zp->z_id);
}

dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
Expand Down
8 changes: 8 additions & 0 deletions module/os/linux/zfs/zfs_znode.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_acl_cached = NULL;
zp->z_xattr_cached = NULL;
zp->z_xattr_parent = 0;
zp->z_sync_writes_cnt = 0;
zp->z_async_writes_cnt = 0;

return (0);
}

Expand All @@ -154,6 +157,9 @@ zfs_znode_cache_destructor(void *buf, void *arg)
ASSERT3P(zp->z_dirlocks, ==, NULL);
ASSERT3P(zp->z_acl_cached, ==, NULL);
ASSERT3P(zp->z_xattr_cached, ==, NULL);

ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
}

static int
Expand Down Expand Up @@ -554,6 +560,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
zp->z_sync_writes_cnt = 0;
zp->z_async_writes_cnt = 0;

zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

Expand Down
55 changes: 49 additions & 6 deletions module/os/linux/zfs/zpl_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -165,17 +165,56 @@ static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *inode = filp->f_mapping->host;
znode_t *zp = ITOZ(inode);
zfsvfs_t *zfsvfs = ITOZSB(inode);
cred_t *cr = CRED();
int error;
fstrans_cookie_t cookie;

/*
* The variables z_sync_writes_cnt and z_async_writes_cnt work in
* tandem so that sync writes can detect if there are any non-sync
* writes going on and vice-versa. The "vice-versa" part to this logic
* is located in zfs_putpage() where non-sync writes check if there are
* any ongoing sync writes. If any sync and non-sync writes overlap,
* we do a commit to complete the non-sync writes since the latter can
* potentially take several seconds to complete and thus block sync
* writes in the upcoming call to filemap_write_and_wait_range().
*/
atomic_inc_32(&zp->z_sync_writes_cnt);
/*
* If the following check does not detect an overlapping non-sync write
* (say because it's just about to start), then it is guaranteed that
* the non-sync write will detect this sync write. This is because we
* always increment z_sync_writes_cnt / z_async_writes_cnt before doing
* the check on z_async_writes_cnt / z_sync_writes_cnt here and in
* zfs_putpage() respectively.
*/
if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
ZPL_ENTER(zfsvfs);
zil_commit(zfsvfs->z_log, zp->z_id);
ZPL_EXIT(zfsvfs);
}

error = filemap_write_and_wait_range(inode->i_mapping, start, end);

/*
* The sync write is not complete yet but we decrement
* z_sync_writes_cnt since zfs_fsync() increments and decrements
* it internally. If a non-sync write starts just after the decrement
* operation but before we call zfs_fsync(), it may not detect this
* overlapping sync write but it does not matter since we have already
* gone past filemap_write_and_wait_range() and we won't block due to
* the non-sync write.
*/
atomic_dec_32(&zp->z_sync_writes_cnt);

if (error)
return (error);

crhold(cr);
cookie = spl_fstrans_mark();
error = -zfs_fsync(ITOZ(inode), datasync, cr);
error = -zfs_fsync(zp, datasync, cr);
spl_fstrans_unmark(cookie);
crfree(cr);
ASSERT3S(error, <=, 0);
Expand Down Expand Up @@ -680,14 +719,14 @@ zpl_readahead(struct readahead_control *ractl)
static int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
struct address_space *mapping = data;
boolean_t *for_sync = data;
fstrans_cookie_t cookie;

ASSERT(PageLocked(pp));
ASSERT(!PageWriteback(pp));

cookie = spl_fstrans_mark();
(void) zfs_putpage(mapping->host, pp, wbc);
(void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
spl_fstrans_unmark(cookie);

return (0);
Expand All @@ -714,8 +753,9 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
* we run it once in non-SYNC mode so that the ZIL gets all the data,
* and then we commit it all in one go.
*/
boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
wbc->sync_mode = WB_SYNC_NONE;
result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
result = write_cache_pages(mapping, wbc, zpl_putpage, &for_sync);
if (sync_mode != wbc->sync_mode) {
ZPL_ENTER(zfsvfs);
ZPL_VERIFY_ZP(zp);
Expand All @@ -731,7 +771,8 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
* details). That being said, this is a no-op in most cases.
*/
wbc->sync_mode = sync_mode;
result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
result = write_cache_pages(mapping, wbc, zpl_putpage,
&for_sync);
}
return (result);
}
Expand All @@ -748,7 +789,9 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)
if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
wbc->sync_mode = WB_SYNC_ALL;

return (zpl_putpage(pp, wbc, pp->mapping));
boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);

return (zpl_putpage(pp, wbc, &for_sync));
}

/*
Expand Down
2 changes: 2 additions & 0 deletions module/zfs/zfs_vnops.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
atomic_inc_32(&zp->z_sync_writes_cnt);
zil_commit(zfsvfs->z_log, zp->z_id);
atomic_dec_32(&zp->z_sync_writes_cnt);
ZFS_EXIT(zfsvfs);
}
tsd_set(zfs_fsyncer_key, NULL);
Expand Down
2 changes: 1 addition & 1 deletion tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -685,7 +685,7 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
tags = ['functional', 'migration']

[tests/functional/mmap]
tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos']
tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos', 'mmap_sync_001_pos']
tags = ['functional', 'mmap']

[tests/functional/mount]
Expand Down
1 change: 1 addition & 0 deletions tests/test-runner/bin/zts-report.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ if sys.platform.startswith('freebsd'):
'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
'link_count/link_count_001': ['SKIP', na_reason],
'casenorm/mixed_create_failure': ['FAIL', 13215],
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
})
elif sys.platform.startswith('linux'):
known.update({
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/cmd/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
/mmap_exec
/mmap_libaio
/mmap_seek
/mmap_sync
/mmapwrite
/nvlist_to_lua
/randfree_file
Expand Down
Loading

0 comments on commit 13a53a6

Please sign in to comment.