Skip to content

Commit

Permalink
vfs/zfs: remove per-file lock enforcement for ZFS read/write ops
Browse files Browse the repository at this point in the history
OSv VFS layer enforces a per-file lock for file operations, including read and
write operations. This approach prevents ZFS from doing parallel reads on the
same file, or even parallel writes into different regions from the same file.
ZFS allows that parallelism by using r/w range locks from its own internal
nodes, mapped 1:1 to VFS vnodes. It properly satisfies POSIX requeriments.

Currently, every write/read to a specific file will always be serialized.
Workloads where many threads concurrently read/write from/into the same file
would perform terribly bad as compared to FreeBSD, for example.
FreeBSD also uses a r/w range lock within the VFS layer to protect concurrent
read/write ops, however, that implies double range locking. It's not necessary,
and it's also redundant.

Specifically to ZFS, this bottleneck - which prevents parallel ZFS reads/writes
on the same file from happening - could be pulverized by discarding the vnode
locks surrounding VOP_READ and VOP_WRITE within vfs_file::read and vfs_file::
write, respectively. For read/write ops from the virtual file systems, vnode
locking could be moved over their respective functions. Unlike ZFS, the virtual
file systems don't deal with per-file locks on their own. From there on, ZFS
r/w range locks would be working effectively. Before that, there was *no* r/w
parallelism when working on the same file.

vfs_file::get_arcbuf, which is entirely intended for ZFS, could also discard
the vnode lock. It calls zfs_arc that also uses the r/w range lock approach, so
properly protecting the underlying file. writeback() from the pagecache doesn't
need to take the vnode lock either, given that it calls zfs_write (protected by
r/w range lock).

Changes involved:
* For zfs_read, use zfs internal node data to avoid reading vnode.
* For zfs_write, when file has grown in size, take vnode lock for updating
vnode size.
* For virtual file systems, surround their read()/write() ops with vnode lock.

Signed-off-by: Raphael S. Carvalho <[email protected]>
Signed-off-by: Pekka Enberg <[email protected]>
  • Loading branch information
raphaelsc authored and Pekka Enberg committed Aug 21, 2014
1 parent 000ce9a commit b5eadc3
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 19 deletions.
16 changes: 10 additions & 6 deletions bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
Original file line number Diff line number Diff line change
Expand Up @@ -565,15 +565,16 @@ zfs_read(vnode_t *vp, struct file* fp, uio_t *uio, int ioflag)
xuio_t *xuio = NULL;
#endif

// Return EISDIR when reading from a directory, as Linux does.
if (vp->v_type == VDIR) {
return EISDIR;
}

ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
os = zfsvfs->z_os;

// Return EISDIR when reading from a directory, as Linux does.
if (S_ISDIR(zp->z_mode)) {
ZFS_EXIT(zfsvfs);
return EISDIR;
}

if (zp->z_pflags & ZFS_AV_QUARANTINED) {
ZFS_EXIT(zfsvfs);
return (EACCES);
Expand Down Expand Up @@ -958,8 +959,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag)
*/
nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

if (woff + nbytes > zp->z_size)
if (woff + nbytes > zp->z_size) {
vn_lock(vp);
vnode_pager_setsize(vp, woff + nbytes);
vn_unlock(vp);
}

if (abuf == NULL) {
tx_bytes = uio->uio_resid;
Expand Down
7 changes: 1 addition & 6 deletions core/pagecache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -204,17 +204,12 @@ class cached_page_write : public cached_page {
}
int writeback()
{
int error;
struct iovec iov {_page, mmu::page_size};
struct uio uio {&iov, 1, _key.offset, mmu::page_size, UIO_WRITE};

_dirty = false;

vn_lock(_vp);
error = VOP_WRITE(_vp, &uio, 0);
vn_unlock(_vp);

return error;
return VOP_WRITE(_vp, &uio, 0);
}
void* release() { // called to demote a page from cache page to anonymous
assert(boost::get<std::nullptr_t>(_ptes) == nullptr);
Expand Down
2 changes: 2 additions & 0 deletions fs/devfs/devfs_vnops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,14 @@ devfs_close(struct vnode *vp, struct file *fp)
static int
devfs_read(struct vnode *vp, struct file *fp, struct uio *uio, int ioflags)
{
SCOPE_LOCK(vp->v_lock);
return device_read((device*)vp->v_data, uio, ioflags);
}

static int
devfs_write(struct vnode *vp, struct uio *uio, int ioflags)
{
SCOPE_LOCK(vp->v_lock);
return device_write((device*)vp->v_data, uio, ioflags);
}

Expand Down
1 change: 1 addition & 0 deletions fs/procfs/procfs_vnops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ static int
procfs_read(vnode* vp, file *fp, uio* uio, int ioflags)
{
auto* data = static_cast<string*>(fp->f_data);
SCOPE_LOCK(vp->v_lock);

if (vp->v_type == VDIR)
return EISDIR;
Expand Down
2 changes: 2 additions & 0 deletions fs/ramfs/ramfs_vnops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ ramfs_read(struct vnode *vp, struct file *fp, struct uio *uio, int ioflag)
{
struct ramfs_node *np = (ramfs_node*)vp->v_data;
size_t len;
SCOPE_LOCK(vp->v_lock);

if (vp->v_type == VDIR)
return EISDIR;
Expand All @@ -325,6 +326,7 @@ static int
ramfs_write(struct vnode *vp, struct uio *uio, int ioflag)
{
struct ramfs_node *np = (ramfs_node*)vp->v_data;
SCOPE_LOCK(vp->v_lock);

if (vp->v_type == VDIR)
return EISDIR;
Expand Down
7 changes: 0 additions & 7 deletions fs/vfs/vfs_fops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ int vfs_file::read(struct uio *uio, int flags)

bytes = uio->uio_resid;

vn_lock(vp);
if ((flags & FOF_OFFSET) == 0)
uio->uio_offset = fp->f_offset;

Expand All @@ -57,7 +56,6 @@ int vfs_file::read(struct uio *uio, int flags)
if ((flags & FOF_OFFSET) == 0)
fp->f_offset += count;
}
vn_unlock(vp);

return error;
}
Expand All @@ -74,8 +72,6 @@ int vfs_file::write(struct uio *uio, int flags)

bytes = uio->uio_resid;

vn_lock(vp);

if (fp->f_flags & O_APPEND)
ioflags |= IO_APPEND;
if (fp->f_flags & (O_DSYNC|O_SYNC))
Expand All @@ -91,7 +87,6 @@ int vfs_file::write(struct uio *uio, int flags)
fp->f_offset += count;
}

vn_unlock(vp);
return error;
}

Expand Down Expand Up @@ -171,9 +166,7 @@ int vfs_file::get_arcbuf(void* key, off_t offset)
data.uio_resid = mmu::page_size;
data.uio_rw = UIO_READ;

vn_lock(vp);
assert(VOP_CACHE(vp, this, &data) == 0);
vn_unlock(vp);

return (data.uio_resid != 0) ? -1 : 0;
}
Expand Down

0 comments on commit b5eadc3

Please sign in to comment.