Skip to content

Commit

Permalink
cherry-pick lock-inversion patch for zvol_open
Browse files Browse the repository at this point in the history
the changes to zvol_open added to 2.1.2 (for coping with kernel
changes in 5.13) seem to have introduced a lock order inversion [0].

(noticed while reviewing the 2.0.6->2.0.7 changes (the patch was
applied after 2.1.2 was already tagged)

[0] openzfs/zfs#12863
Signed-off-by: Stoiko Ivanov <[email protected]>
  • Loading branch information
Stoiko Ivanov authored and ThomasLamprecht committed Jan 11, 2022
1 parent 64e59c5 commit 531eac3
Show file tree
Hide file tree
Showing 2 changed files with 213 additions and 0 deletions.
212 changes: 212 additions & 0 deletions debian/patches/0012-Fix-zvol_open-lock-inversion.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <[email protected]>
Date: Fri, 17 Dec 2021 09:52:13 -0800
Subject: [PATCH] Fix zvol_open() lock inversion

When restructuring the zvol_open() logic for the Linux 5.13 kernel
a lock inversion was accidentally introduced. In the updated code
the spa_namespace_lock is now taken before the zv_suspend_lock
allowing the following scenario to occur:

down_read <=== waiting for zv_suspend_lock
zvol_open <=== holds spa_namespace_lock
__blkdev_get
blkdev_get_by_dev
blkdev_open
...

mutex_lock <== waiting for spa_namespace_lock
spa_open_common
spa_open
dsl_pool_hold
dmu_objset_hold_flags
dmu_objset_hold
dsl_prop_get
dsl_prop_get_integer
zvol_create_minor
dmu_recv_end
zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend()
zfs_ioc_recv
...

This commit resolves the issue by moving the acquisition of the
spa_namespace_lock back to after the zv_suspend_lock which restores
the original ordering.

Additionally, as part of this change the error exit paths were
simplified where possible.

Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: Rich Ercolani <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #12863
(cherry picked from commit 8a02d01e85556bbe3a1c6947bc11b8ef028d4023)
Signed-off-by: Stoiko Ivanov <[email protected]>
---
module/os/linux/zfs/zvol_os.c | 121 ++++++++++++++++------------------
1 file changed, 58 insertions(+), 63 deletions(-)

diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 44caadd58..69479b3f7 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
{
zvol_state_t *zv;
int error = 0;
- boolean_t drop_suspend = B_TRUE;
- boolean_t drop_namespace = B_FALSE;
+ boolean_t drop_suspend = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
hrtime_t start = gethrtime();
@@ -517,7 +516,36 @@ retry:
return (SET_ERROR(-ENXIO));
}

- if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+ mutex_enter(&zv->zv_state_lock);
+ /*
+ * Make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ } else {
+ drop_suspend = B_TRUE;
+ }
+ } else {
+ drop_suspend = B_TRUE;
+ }
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ boolean_t drop_namespace = B_FALSE;
+
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+
/*
* In all other call paths the spa_namespace_lock is taken
* before the bdev->bd_mutex lock. However, on open(2)
@@ -542,84 +570,51 @@ retry:
* the kernel so the only option is to return the error for
* the caller to handle it.
*/
- if (!mutex_tryenter(&spa_namespace_lock)) {
- rw_exit(&zvol_state_lock);
+ if (!mutex_owned(&spa_namespace_lock)) {
+ if (!mutex_tryenter(&spa_namespace_lock)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
- schedule();
- return (SET_ERROR(-ERESTARTSYS));
-#else
- if ((gethrtime() - start) > timeout)
+ schedule();
return (SET_ERROR(-ERESTARTSYS));
+#else
+ if ((gethrtime() - start) > timeout)
+ return (SET_ERROR(-ERESTARTSYS));

- schedule_timeout(MSEC_TO_TICK(10));
- goto retry;
+ schedule_timeout(MSEC_TO_TICK(10));
+ goto retry;
#endif
- } else {
- drop_namespace = B_TRUE;
- }
- }
-
- mutex_enter(&zv->zv_state_lock);
- /*
- * make sure zvol is not suspended during first open
- * (hold zv_suspend_lock) and respect proper lock acquisition
- * ordering - zv_suspend_lock before zv_state_lock
- */
- if (zv->zv_open_count == 0) {
- if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
- mutex_exit(&zv->zv_state_lock);
- rw_enter(&zv->zv_suspend_lock, RW_READER);
- mutex_enter(&zv->zv_state_lock);
- /* check to see if zv_suspend_lock is needed */
- if (zv->zv_open_count != 0) {
- rw_exit(&zv->zv_suspend_lock);
- drop_suspend = B_FALSE;
+ } else {
+ drop_namespace = B_TRUE;
}
}
- } else {
- drop_suspend = B_FALSE;
- }
- rw_exit(&zvol_state_lock);
-
- ASSERT(MUTEX_HELD(&zv->zv_state_lock));

- if (zv->zv_open_count == 0) {
- ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
- if (error)
- goto out_mutex;
- }

- if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
- error = -EROFS;
- goto out_open_count;
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
}

- zv->zv_open_count++;
-
- mutex_exit(&zv->zv_state_lock);
- if (drop_namespace)
- mutex_exit(&spa_namespace_lock);
- if (drop_suspend)
- rw_exit(&zv->zv_suspend_lock);
-
- zfs_check_media_change(bdev);
-
- return (0);
+ if (error == 0) {
+ if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ if (zv->zv_open_count == 0)
+ zvol_last_close(zv);

-out_open_count:
- if (zv->zv_open_count == 0)
- zvol_last_close(zv);
+ error = SET_ERROR(-EROFS);
+ } else {
+ zv->zv_open_count++;
+ }
+ }

-out_mutex:
mutex_exit(&zv->zv_state_lock);
- if (drop_namespace)
- mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);

- return (SET_ERROR(error));
+ if (error == 0)
+ zfs_check_media_change(bdev);
+
+ return (error);
}

static void
1 change: 1 addition & 0 deletions debian/patches/series
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
0009-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
0010-arcstat-Fix-integer-division-with-python3.patch
0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
0012-Fix-zvol_open-lock-inversion.patch

0 comments on commit 531eac3

Please sign in to comment.