Revert "scx: Avoid possible deadlock with cpus_read_lock()"
This reverts commit c3c7041.

We hit a lock ordering issue in the other direction. Let's revert for
now.

[    9.378773] ======================================================
[    9.379476] WARNING: possible circular locking dependency detected
[    9.379532] 6.6.0-work-10442-ga7150a9168f8-dirty #134 Not tainted
[    9.379532] ------------------------------------------------------
[    9.379532] scx_rustland/1622 is trying to acquire lock:
[    9.379532] ffffffff8325f828 (cpu_hotplug_lock){++++}-{0:0}, at: bpf_scx_reg+0xe4/0xcf0
[    9.379532]
[    9.379532] but task is already holding lock:
[    9.379532] ffffffff83271be8 (scx_cgroup_rwsem){++++}-{0:0}, at: bpf_scx_reg+0xdf/0xcf0
[    9.379532]
[    9.379532] which lock already depends on the new lock.
[    9.379532]
[    9.379532]
[    9.379532] the existing dependency chain (in reverse order) is:
[    9.379532]
[    9.379532] -> #2 (scx_cgroup_rwsem){++++}-{0:0}:
[    9.379532]        percpu_down_read+0x2e/0xb0
[    9.379532]        scx_cgroup_can_attach+0x25/0x200
[    9.379532]        cpu_cgroup_can_attach+0xe/0x10
[    9.379532]        cgroup_migrate_execute+0xaf/0x450
[    9.379532]        cgroup_apply_control+0x227/0x2a0
[    9.379532]        cgroup_subtree_control_write+0x425/0x4b0
[    9.379532]        cgroup_file_write+0x82/0x260
[    9.379532]        kernfs_fop_write_iter+0x131/0x1c0
[    9.379532]        vfs_write+0x1f9/0x270
[    9.379532]        ksys_write+0x62/0xc0
[    9.379532]        __x64_sys_write+0x1b/0x20
[    9.379532]        do_syscall_64+0x40/0xe0
[    9.379532]        entry_SYSCALL_64_after_hwframe+0x46/0x4e
[    9.379532]
[    9.379532] -> #1 (cgroup_threadgroup_rwsem){++++}-{0:0}:
[    9.379532]        percpu_down_write+0x35/0x1e0
[    9.379532]        cgroup_procs_write_start+0x8a/0x210
[    9.379532]        __cgroup_procs_write+0x4c/0x160
[    9.379532]        cgroup_procs_write+0x17/0x30
[    9.379532]        cgroup_file_write+0x82/0x260
[    9.379532]        kernfs_fop_write_iter+0x131/0x1c0
[    9.379532]        vfs_write+0x1f9/0x270
[    9.379532]        ksys_write+0x62/0xc0
[    9.379532]        __x64_sys_write+0x1b/0x20
[    9.379532]        do_syscall_64+0x40/0xe0
[    9.379532]        entry_SYSCALL_64_after_hwframe+0x46/0x4e
[    9.379532]
[    9.379532] -> #0 (cpu_hotplug_lock){++++}-{0:0}:
[    9.379532]        __lock_acquire+0x142d/0x2a30
[    9.379532]        lock_acquire+0xbf/0x1f0
[    9.379532]        cpus_read_lock+0x2f/0xc0
[    9.379532]        bpf_scx_reg+0xe4/0xcf0
[    9.379532]        bpf_struct_ops_link_create+0xb6/0x100
[    9.379532]        link_create+0x49/0x200
[    9.379532]        __sys_bpf+0x351/0x3e0
[    9.379532]        __x64_sys_bpf+0x1c/0x20
[    9.379532]        do_syscall_64+0x40/0xe0
[    9.379532]        entry_SYSCALL_64_after_hwframe+0x46/0x4e
[    9.379532]
[    9.379532] other info that might help us debug this:
[    9.379532]
[    9.379532] Chain exists of:
[    9.379532]   cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem
[    9.379532]
[    9.379532]  Possible unsafe locking scenario:
[    9.379532]
[    9.379532]        CPU0                    CPU1
[    9.379532]        ----                    ----
[    9.379532]   lock(scx_cgroup_rwsem);
[    9.379532]                                lock(cgroup_threadgroup_rwsem);
[    9.379532]                                lock(scx_cgroup_rwsem);
[    9.379532]   rlock(cpu_hotplug_lock);
[    9.379532]
[    9.379532]  *** DEADLOCK ***
[    9.379532]
[    9.379532] 3 locks held by scx_rustland/1622:
[    9.379532]  #0: ffffffff83272708 (scx_ops_enable_mutex){+.+.}-{3:3}, at: bpf_scx_reg+0x2a/0xcf0
[    9.379532]  #1: ffffffff83271aa0 (scx_fork_rwsem){++++}-{0:0}, at: bpf_scx_reg+0xd3/0xcf0
[    9.379532]  #2: ffffffff83271be8 (scx_cgroup_rwsem){++++}-{0:0}, at: bpf_scx_reg+0xdf/0xcf0
[    9.379532]
[    9.379532] stack backtrace:
[    9.379532] CPU: 7 PID: 1622 Comm: scx_rustland Not tainted 6.6.0-work-10442-ga7150a9168f8-dirty #134
[    9.379532] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022
[    9.379532] Sched_ext: rustland (prepping)
[    9.379532] Call Trace:
[    9.379532]  <TASK>
[    9.379532]  dump_stack_lvl+0x55/0x70
[    9.379532]  dump_stack+0x10/0x20
[    9.379532]  print_circular_bug+0x2ea/0x2f0
[    9.379532]  check_noncircular+0xe2/0x100
[    9.379532]  __lock_acquire+0x142d/0x2a30
[    9.379532]  ? lock_acquire+0xbf/0x1f0
[    9.379532]  ? rcu_sync_func+0x2c/0xa0
[    9.379532]  lock_acquire+0xbf/0x1f0
[    9.379532]  ? bpf_scx_reg+0xe4/0xcf0
[    9.379532]  cpus_read_lock+0x2f/0xc0
[    9.379532]  ? bpf_scx_reg+0xe4/0xcf0
[    9.379532]  bpf_scx_reg+0xe4/0xcf0
[    9.379532]  ? alloc_file+0xa4/0x160
[    9.379532]  ? alloc_file_pseudo+0x99/0xd0
[    9.379532]  ? anon_inode_getfile+0x79/0xc0
[    9.379532]  ? bpf_link_prime+0xe2/0x1a0
[    9.379532]  bpf_struct_ops_link_create+0xb6/0x100
[    9.379532]  link_create+0x49/0x200
[    9.379532]  __sys_bpf+0x351/0x3e0
[    9.379532]  __x64_sys_bpf+0x1c/0x20
[    9.379532]  do_syscall_64+0x40/0xe0
[    9.379532]  ? sysvec_apic_timer_interrupt+0x44/0x80
[    9.379532]  entry_SYSCALL_64_after_hwframe+0x46/0x4e
[    9.379532] RIP: 0033:0x7fc391f7473d
[    9.379532] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c3 95 0c 00 f7 d8 64 89 01 48
[    9.379532] RSP: 002b:00007ffeb4fe4108 EFLAGS: 00000246 ORIG_RAX: 0000000000000141
[    9.379532] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fc391f7473d
[    9.379532] RDX: 0000000000000030 RSI: 00007ffeb4fe4120 RDI: 000000000000001c
[    9.379532] RBP: 000000000000000c R08: 000000000000000c R09: 000055d0a75b1a10
[    9.379532] R10: 0000000000000050 R11: 0000000000000246 R12: 000000000000002c
[    9.379532] R13: 00007ffeb4fe4628 R14: 0000000000000000 R15: 00007ffeb4fe4328
[    9.379532]  </TASK>
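The reported chain is cpu_hotplug_lock -> cgroup_threadgroup_rwsem -> scx_cgroup_rwsem on the
cgroup attach side, while bpf_scx_reg() takes scx_cgroup_rwsem and then cpus_read_lock(),
closing the cycle. A minimal userspace sketch of the same inverted-ordering pattern follows;
the names are illustrative, pthread mutexes stand in for the kernel locks, and the intermediate
cgroup_threadgroup_rwsem link is collapsed:

/* Illustrative only -- not kernel code. Two paths take the same pair of
 * locks in opposite orders, which is the cycle lockdep reports above. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER; /* stand-in for cpu_hotplug_lock */
static pthread_mutex_t cgroup_rwsem = PTHREAD_MUTEX_INITIALIZER; /* stand-in for scx_cgroup_rwsem */

static void *register_path(void *arg)          /* like bpf_scx_reg() */
{
	(void)arg;
	pthread_mutex_lock(&cgroup_rwsem);     /* scx_cgroup_lock() ... */
	pthread_mutex_lock(&hotplug_lock);     /* ... then cpus_read_lock() */
	pthread_mutex_unlock(&hotplug_lock);
	pthread_mutex_unlock(&cgroup_rwsem);
	return NULL;
}

static void *attach_path(void *arg)            /* like the cgroup attach side */
{
	(void)arg;
	pthread_mutex_lock(&hotplug_lock);     /* hotplug lock held first ... */
	pthread_mutex_lock(&cgroup_rwsem);     /* ... then scx_cgroup_rwsem */
	pthread_mutex_unlock(&cgroup_rwsem);
	pthread_mutex_unlock(&hotplug_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, register_path, NULL);
	pthread_create(&b, NULL, attach_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("finished -- but the opposite lock orders permit a deadlock");
	return 0;
}

Whether such a program actually hangs depends on timing; lockdep flags the ordering itself,
which is why the revert goes back to taking cpus_read_lock() before scx_fork_rwsem and
scx_cgroup_rwsem.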
htejun committed Jan 8, 2024
1 parent a7150a9 commit 8588d4f
Showing 1 changed file with 19 additions and 20 deletions.
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3198,9 +3198,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	WRITE_ONCE(scx_switching_all, false);
 
 	/* avoid racing against fork and cgroup changes */
+	cpus_read_lock();
 	percpu_down_write(&scx_fork_rwsem);
 	scx_cgroup_lock();
-	cpus_read_lock();
 
 	spin_lock_irq(&scx_tasks_lock);
 	scx_task_iter_init(&sti);
@@ -3239,9 +3239,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 	scx_cgroup_exit();
 
-	cpus_read_unlock();
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
+	cpus_read_unlock();
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
 		printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
@@ -3399,18 +3399,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	atomic_long_set(&scx_nr_rejected, 0);
 
 	/*
-	 * Lock out forks, cgroup on/offlining and moves before opening the
-	 * floodgate so that they don't wander into the operations prematurely.
-	 *
-	 * Also keep CPUs stable during enable so that the BPF scheduler can
-	 * track online CPUs by watching ->on/offline_cpu() after ->init().
-	 *
-	 * Acquire scx_fork_rwsem and scx_group_rwsem before the hotplug lock.
-	 * cpus_read_lock() is acquired in a ton of places, so let's be a bit
-	 * cautious to avoid possible deadlock.
+	 * Keep CPUs stable during enable so that the BPF scheduler can track
+	 * online CPUs by watching ->on/offline_cpu() after ->init().
 	 */
-	percpu_down_write(&scx_fork_rwsem);
-	scx_cgroup_lock();
 	cpus_read_lock();
 
 	scx_switch_all_req = false;
@@ -3456,6 +3447,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
 			   scx_watchdog_timeout / 2);
 
+	/*
+	 * Lock out forks, cgroup on/offlining and moves before opening the
+	 * floodgate so that they don't wander into the operations prematurely.
+	 */
+	percpu_down_write(&scx_fork_rwsem);
+	scx_cgroup_lock();
+
 	for (i = 0; i < SCX_NR_ONLINE_OPS; i++)
 		if (((void (**)(void))ops)[i])
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
@@ -3481,7 +3479,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 */
 	ret = scx_cgroup_init();
 	if (ret)
-		goto err_disable;
+		goto err_disable_unlock;
 
 	static_branch_enable_cpuslocked(&__scx_ops_enabled);
 
@@ -3507,7 +3505,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 			spin_unlock_irq(&scx_tasks_lock);
 			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
 			       ret, p->comm, p->pid);
-			goto err_disable;
+			goto err_disable_unlock;
 		}
 
 		put_task_struct(p);
@@ -3531,7 +3529,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		preempt_enable();
 		spin_unlock_irq(&scx_tasks_lock);
 		ret = -EBUSY;
-		goto err_disable;
+		goto err_disable_unlock;
 	}
 
 	/*
@@ -3565,6 +3563,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 
 	spin_unlock_irq(&scx_tasks_lock);
 	preempt_enable();
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
 		ret = -EBUSY;
@@ -3575,8 +3575,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		static_branch_enable_cpuslocked(&__scx_switched_all);
 
 	cpus_read_unlock();
-	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 	mutex_unlock(&scx_ops_enable_mutex);
 
 	scx_cgroup_config_knobs();
@@ -3587,10 +3585,11 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	mutex_unlock(&scx_ops_enable_mutex);
 	return ret;
 
-err_disable:
-	cpus_read_unlock();
+err_disable_unlock:
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
+err_disable:
+	cpus_read_unlock();
 	mutex_unlock(&scx_ops_enable_mutex);
 	/* must be fully disabled before returning */
 	scx_ops_disable(SCX_EXIT_ERROR);
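One consequence visible in the hunks above: scx_fork_rwsem and scx_cgroup_rwsem are now taken
later in scx_ops_enable() than cpus_read_lock(), so errors hit before they are acquired must
not drop them. That is why the single err_disable label is split into err_disable_unlock and
err_disable, following the usual staged-goto unwind idiom. A self-contained sketch of that
idiom with hypothetical names (not the scx code):

/* Hypothetical names -- a sketch of the staged-goto unwind idiom, not the
 * scx code itself. Each error label releases only what was acquired before
 * the failing step, then falls through to the earlier stage. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t outer_lock = PTHREAD_MUTEX_INITIALIZER; /* think cpus_read_lock() */
static pthread_mutex_t inner_lock = PTHREAD_MUTEX_INITIALIZER; /* think the later fork/cgroup locks */

static int do_setup(int fail_early, int fail_late)
{
	int ret = 0;

	pthread_mutex_lock(&outer_lock);

	if (fail_early) {
		ret = -1;
		goto err_outer;                /* only the outer lock is held */
	}

	pthread_mutex_lock(&inner_lock);

	if (fail_late) {
		ret = -1;
		goto err_inner;                /* both locks are held */
	}

	pthread_mutex_unlock(&inner_lock);
	pthread_mutex_unlock(&outer_lock);
	return 0;

err_inner:
	pthread_mutex_unlock(&inner_lock);
err_outer:
	pthread_mutex_unlock(&outer_lock);
	return ret;
}

int main(void)
{
	printf("%d %d %d\n", do_setup(0, 0), do_setup(1, 0), do_setup(0, 1));
	return 0;
}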
