Revert "scx: Avoid possible deadlock with cpus_read_lock()"
This reverts commit c3c7041.

We hit a lock ordering issue in the other direction. Let's revert for
now.

[    9.378773] ======================================================
[    9.379476] WARNING: possible circular locking dependency detected
[    9.379532] 6.6.0-work-10442-ga7150a9168f8-dirty #134 Not tainted
[    9.379532] ------------------------------------------------------
[    9.379532] scx_rustland/1622 is trying to acquire lock:
[    9.379532] ffffffff8325f828 (cpu_hotplug_lock){++++}-{0:0}, at: bpf_scx_reg+0xe4/0xcf0
[    9.379532]
[    9.379532] but task is already holding lock:
[    9.379532] ffffffff83271be8 (scx_cgroup_rwsem){++++}-{0:0}, at: bpf_scx_reg+0xdf/0xcf0
[    9.379532]
[    9.379532] which lock already depends on the new lock.
[    9.379532]
[    9.379532]
[    9.379532] the existing dependency chain (in reverse order) is:
[    9.379532]
[    9.379532] -> #2 (scx_cgroup_rwsem){++++}-{0:0}:
[    9.379532]        percpu_down_read+0x2e/0xb0
[    9.379532]        scx_cgroup_can_attach+0x25/0x200
[    9.379532]        cpu_cgroup_can_attach+0xe/0x10
[    9.379532]        cgroup_migrate_execute+0xaf/0x450
[    9.379532]        cgroup_apply_control+0x227/0x2a0
[    9.379532]        cgroup_subtree_control_write+0x425/0x4b0
[    9.379532]        cgroup_file_write+0x82/0x260
[    9.379532]        kernfs_fop_write_iter+0x131/0x1c0
[    9.379532]        vfs_write+0x1f9/0x270
[    9.379532]        ksys_write+0x62/0xc0
[    9.379532]        __x64_sys_write+0x1b/0x20
[    9.379532]        do_syscall_64+0x40/0xe0
[    9.379532]        entry_SYSCALL_64_after_hwframe+0x46/0x4e
[    9.379532]
[    9.379532] -> #1 (cgroup_threadgroup_rwsem){++++}-{0:0}:
[    9.379532]        percpu_down_write+0x35/0x1e0
[    9.379532]        cgroup_procs_write_start+0x8a/0x210
[    9.379532]        __cgroup_procs_write+0x4c/0x160
[    9.379532]        cgroup_procs_write+0x17/0x30
[    9.379532]        cgroup_file_write+0x82/0x260
[    9.379532]        kernfs_fop_write_iter+0x131/0x1c0
[    9.379532]        vfs_write+0x1f9/0x270
[    9.379532]        ksys_write+0x62/0xc0
[    9.379532]        __x64_sys_write+0x1b/0x20
[    9.379532]        do_syscall_64+0x40/0xe0
[    9.379532]        entry_SYSCALL_64_after_hwframe+0x46/0x4e
[    9.379532]
[    9.379532] -> #0 (cpu_hotplug_lock){++++}-{0:0}:
[    9.379532]        __lock_acquire+0x142d/0x2a30
[    9.379532]        lock_acquire+0xbf/0x1f0
[    9.379532]        cpus_read_lock+0x2f/0xc0
[    9.379532]        bpf_scx_reg+0xe4/0xcf0
[    9.379532]        bpf_struct_ops_link_create+0xb6/0x100
[    9.379532]        link_create+0x49/0x200
[    9.379532]        __sys_bpf+0x351/0x3e0
[    9.379532]        __x64_sys_bpf+0x1c/0x20
[    9.379532]        do_syscall_64+0x40/0xe0
[    9.379532]        entry_SYSCALL_64_after_hwframe+0x46/0x4e
[    9.379532]
[    9.379532] other info that might help us debug this:
[    9.379532]
[    9.379532] Chain exists of:
[    9.379532]   cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem
[    9.379532]
[    9.379532]  Possible unsafe locking scenario:
[    9.379532]
[    9.379532]        CPU0                    CPU1
[    9.379532]        ----                    ----
[    9.379532]   lock(scx_cgroup_rwsem);
[    9.379532]                                lock(cgroup_threadgroup_rwsem);
[    9.379532]                                lock(scx_cgroup_rwsem);
[    9.379532]   rlock(cpu_hotplug_lock);
[    9.379532]
[    9.379532]  *** DEADLOCK ***
[    9.379532]
[    9.379532] 3 locks held by scx_rustland/1622:
[    9.379532]  #0: ffffffff83272708 (scx_ops_enable_mutex){+.+.}-{3:3}, at: bpf_scx_reg+0x2a/0xcf0
[    9.379532]  #1: ffffffff83271aa0 (scx_fork_rwsem){++++}-{0:0}, at: bpf_scx_reg+0xd3/0xcf0
[    9.379532]  #2: ffffffff83271be8 (scx_cgroup_rwsem){++++}-{0:0}, at: bpf_scx_reg+0xdf/0xcf0
[    9.379532]
[    9.379532] stack backtrace:
[    9.379532] CPU: 7 PID: 1622 Comm: scx_rustland Not tainted 6.6.0-work-10442-ga7150a9168f8-dirty #134
[    9.379532] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022
[    9.379532] Sched_ext: rustland (prepping)
[    9.379532] Call Trace:
[    9.379532]  <TASK>
[    9.379532]  dump_stack_lvl+0x55/0x70
[    9.379532]  dump_stack+0x10/0x20
[    9.379532]  print_circular_bug+0x2ea/0x2f0
[    9.379532]  check_noncircular+0xe2/0x100
[    9.379532]  __lock_acquire+0x142d/0x2a30
[    9.379532]  ? lock_acquire+0xbf/0x1f0
[    9.379532]  ? rcu_sync_func+0x2c/0xa0
[    9.379532]  lock_acquire+0xbf/0x1f0
[    9.379532]  ? bpf_scx_reg+0xe4/0xcf0
[    9.379532]  cpus_read_lock+0x2f/0xc0
[    9.379532]  ? bpf_scx_reg+0xe4/0xcf0
[    9.379532]  bpf_scx_reg+0xe4/0xcf0
[    9.379532]  ? alloc_file+0xa4/0x160
[    9.379532]  ? alloc_file_pseudo+0x99/0xd0
[    9.379532]  ? anon_inode_getfile+0x79/0xc0
[    9.379532]  ? bpf_link_prime+0xe2/0x1a0
[    9.379532]  bpf_struct_ops_link_create+0xb6/0x100
[    9.379532]  link_create+0x49/0x200
[    9.379532]  __sys_bpf+0x351/0x3e0
[    9.379532]  __x64_sys_bpf+0x1c/0x20
[    9.379532]  do_syscall_64+0x40/0xe0
[    9.379532]  ? sysvec_apic_timer_interrupt+0x44/0x80
[    9.379532]  entry_SYSCALL_64_after_hwframe+0x46/0x4e
[    9.379532] RIP: 0033:0x7fc391f7473d
[    9.379532] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 95 0c 00 f7 d8 64 89 01 48
[    9.379532] RSP: 002b:00007ffeb4fe4108 EFLAGS: 00000246 ORIG_RAX: 0000000000000141
[    9.379532] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fc391f7473d
[    9.379532] RDX: 0000000000000030 RSI: 00007ffeb4fe4120 RDI: 000000000000001c
[    9.379532] RBP: 000000000000000c R08: 000000000000000c R09: 000055d0a75b1a10
[    9.379532] R10: 0000000000000050 R11: 0000000000000246 R12: 000000000000002c
[    9.379532] R13: 00007ffeb4fe4628 R14: 0000000000000000 R15: 00007ffeb4fe4328
[    9.379532]  </TASK>
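The reported chain is cpu_hotplug_lock -> cgroup_threadgroup_rwsem -> scx_cgroup_rwsem on the
cgroup attach side, while bpf_scx_reg() takes scx_cgroup_rwsem and then cpus_read_lock(),
closing the cycle. A minimal userspace sketch of the same inverted-ordering pattern follows;
the names are illustrative, pthread mutexes stand in for the kernel locks, and the intermediate
cgroup_threadgroup_rwsem link is collapsed:

/* Illustrative only -- not kernel code. Two paths take the same pair of
 * locks in opposite orders, which is the cycle lockdep reports above. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER; /* stand-in for cpu_hotplug_lock */
static pthread_mutex_t cgroup_rwsem = PTHREAD_MUTEX_INITIALIZER; /* stand-in for scx_cgroup_rwsem */

static void *register_path(void *arg)          /* like bpf_scx_reg() */
{
	(void)arg;
	pthread_mutex_lock(&cgroup_rwsem);     /* scx_cgroup_lock() ... */
	pthread_mutex_lock(&hotplug_lock);     /* ... then cpus_read_lock() */
	pthread_mutex_unlock(&hotplug_lock);
	pthread_mutex_unlock(&cgroup_rwsem);
	return NULL;
}

static void *attach_path(void *arg)            /* like the cgroup attach side */
{
	(void)arg;
	pthread_mutex_lock(&hotplug_lock);     /* hotplug lock held first ... */
	pthread_mutex_lock(&cgroup_rwsem);     /* ... then scx_cgroup_rwsem */
	pthread_mutex_unlock(&cgroup_rwsem);
	pthread_mutex_unlock(&hotplug_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, register_path, NULL);
	pthread_create(&b, NULL, attach_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("finished -- but the opposite lock orders permit a deadlock");
	return 0;
}

Whether such a program actually hangs depends on timing; lockdep flags the ordering itself,
which is why the revert goes back to taking cpus_read_lock() before scx_fork_rwsem and
scx_cgroup_rwsem.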
htejun committed Jan 8, 2024
1 parent a7150a9 commit 8588d4f
Showing 1 changed file with 19 additions and 20 deletions.
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3198,9 +3198,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	WRITE_ONCE(scx_switching_all, false);
 
 	/* avoid racing against fork and cgroup changes */
+	cpus_read_lock();
 	percpu_down_write(&scx_fork_rwsem);
 	scx_cgroup_lock();
-	cpus_read_lock();
 
 	spin_lock_irq(&scx_tasks_lock);
 	scx_task_iter_init(&sti);
@@ -3239,9 +3239,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 	scx_cgroup_exit();
 
-	cpus_read_unlock();
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
+	cpus_read_unlock();
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
 		printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
@@ -3399,18 +3399,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	atomic_long_set(&scx_nr_rejected, 0);
 
 	/*
-	 * Lock out forks, cgroup on/offlining and moves before opening the
-	 * floodgate so that they don't wander into the operations prematurely.
-	 *
-	 * Also keep CPUs stable during enable so that the BPF scheduler can
-	 * track online CPUs by watching ->on/offline_cpu() after ->init().
-	 *
-	 * Acquire scx_fork_rwsem and scx_group_rwsem before the hotplug lock.
-	 * cpus_read_lock() is acquired in a ton of places, so let's be a bit
-	 * cautious to avoid possible deadlock.
+	 * Keep CPUs stable during enable so that the BPF scheduler can track
+	 * online CPUs by watching ->on/offline_cpu() after ->init().
 	 */
-	percpu_down_write(&scx_fork_rwsem);
-	scx_cgroup_lock();
 	cpus_read_lock();
 
 	scx_switch_all_req = false;
@@ -3456,6 +3447,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
 			   scx_watchdog_timeout / 2);
 
+	/*
+	 * Lock out forks, cgroup on/offlining and moves before opening the
+	 * floodgate so that they don't wander into the operations prematurely.
+	 */
+	percpu_down_write(&scx_fork_rwsem);
+	scx_cgroup_lock();
+
 	for (i = 0; i < SCX_NR_ONLINE_OPS; i++)
 		if (((void (**)(void))ops)[i])
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
@@ -3481,7 +3479,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 */
 	ret = scx_cgroup_init();
 	if (ret)
-		goto err_disable;
+		goto err_disable_unlock;
 
 	static_branch_enable_cpuslocked(&__scx_ops_enabled);
 
@@ -3507,7 +3505,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 			spin_unlock_irq(&scx_tasks_lock);
 			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
 			       ret, p->comm, p->pid);
-			goto err_disable;
+			goto err_disable_unlock;
 		}
 
 		put_task_struct(p);
@@ -3531,7 +3529,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		preempt_enable();
 		spin_unlock_irq(&scx_tasks_lock);
 		ret = -EBUSY;
-		goto err_disable;
+		goto err_disable_unlock;
 	}
 
 	/*
@@ -3565,6 +3563,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 
 	spin_unlock_irq(&scx_tasks_lock);
 	preempt_enable();
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
 		ret = -EBUSY;
@@ -3575,8 +3575,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		static_branch_enable_cpuslocked(&__scx_switched_all);
 
 	cpus_read_unlock();
-	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 	mutex_unlock(&scx_ops_enable_mutex);
 
 	scx_cgroup_config_knobs();
@@ -3587,10 +3585,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	mutex_unlock(&scx_ops_enable_mutex);
 	return ret;
 
-err_disable:
-	cpus_read_unlock();
+err_disable_unlock:
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
+err_disable:
+	cpus_read_unlock();
 	mutex_unlock(&scx_ops_enable_mutex);
 	/* must be fully disabled before returning */
 	scx_ops_disable(SCX_EXIT_ERROR);
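One consequence visible in the hunks above: scx_fork_rwsem and scx_cgroup_rwsem are now taken
later in scx_ops_enable() than cpus_read_lock(), so errors hit before they are acquired must
not drop them. That is why the single err_disable label is split into err_disable_unlock and
err_disable, following the usual staged-goto unwind idiom. A self-contained sketch of that
idiom with hypothetical names (not the scx code):

/* Hypothetical names -- a sketch of the staged-goto unwind idiom, not the
 * scx code itself. Each error label releases only what was acquired before
 * the failing step, then falls through to the earlier stage. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t outer_lock = PTHREAD_MUTEX_INITIALIZER; /* think cpus_read_lock() */
static pthread_mutex_t inner_lock = PTHREAD_MUTEX_INITIALIZER; /* think the later fork/cgroup locks */

static int do_setup(int fail_early, int fail_late)
{
	int ret = 0;

	pthread_mutex_lock(&outer_lock);

	if (fail_early) {
		ret = -1;
		goto err_outer;                /* only the outer lock is held */
	}

	pthread_mutex_lock(&inner_lock);

	if (fail_late) {
		ret = -1;
		goto err_inner;                /* both locks are held */
	}

	pthread_mutex_unlock(&inner_lock);
	pthread_mutex_unlock(&outer_lock);
	return 0;

err_inner:
	pthread_mutex_unlock(&inner_lock);
err_outer:
	pthread_mutex_unlock(&outer_lock);
	return ret;
}

int main(void)
{
	printf("%d %d %d\n", do_setup(0, 0), do_setup(1, 0), do_setup(0, 1));
	return 0;
}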
