scx: Avoid possible deadlock with cpus_read_lock()

Han Xing Yi reported a syzbot lockdep error over the weekend:

======================================================
WARNING: possible circular locking dependency detected
6.6.0-g2f6ba98e2d3d #4 Not tainted
------------------------------------------------------
syz-executor.0/2181 is trying to acquire lock:
ffffffff84772410 (pernet_ops_rwsem){++++}-{3:3}, at: copy_net_ns+0x216/0x590 net/core/net_namespace.c:487
but task is already holding lock:
ffffffff8449dc50 (scx_fork_rwsem){++++}-{0:0}, at: sched_fork+0x3b/0x190 kernel/sched/core.c:4810
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #3 (scx_fork_rwsem){++++}-{0:0}:
       percpu_down_write+0x51/0x210 kernel/locking/percpu-rwsem.c:227
       scx_ops_enable+0x230/0xf90 kernel/sched/ext.c:3271
       bpf_struct_ops_link_create+0x1b9/0x220 kernel/bpf/bpf_struct_ops.c:914
       link_create kernel/bpf/syscall.c:4938 [inline]
       __sys_bpf+0x35af/0x4ac0 kernel/bpf/syscall.c:5453
       __do_sys_bpf kernel/bpf/syscall.c:5487 [inline]
       __se_sys_bpf kernel/bpf/syscall.c:5485 [inline]
       __x64_sys_bpf+0x48/0x60 kernel/bpf/syscall.c:5485
       do_syscall_x64 arch/x86/entry/common.c:51 [inline]
       do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82
       entry_SYSCALL_64_after_hwframe+0x6e/0x76
-> #2 (cpu_hotplug_lock){++++}-{0:0}:
       percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
       cpus_read_lock+0x42/0x1b0 kernel/cpu.c:489
       flush_all_backlogs net/core/dev.c:5885 [inline]
       unregister_netdevice_many_notify+0x30a/0x1070 net/core/dev.c:10965
       unregister_netdevice_many+0x19/0x20 net/core/dev.c:11039
       sit_exit_batch_net+0x433/0x460 net/ipv6/sit.c:1887
       ops_exit_list+0xc5/0xe0 net/core/net_namespace.c:175
       cleanup_net+0x3e2/0x750 net/core/net_namespace.c:614
       process_one_work+0x50d/0xc20 kernel/workqueue.c:2630
       process_scheduled_works kernel/workqueue.c:2703 [inline]
       worker_thread+0x50b/0x950 kernel/workqueue.c:2784
       kthread+0x1fa/0x250 kernel/kthread.c:388
       ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147
       ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242
-> #1 (rtnl_mutex){+.+.}-{3:3}:
       __mutex_lock_common kernel/locking/mutex.c:603 [inline]
       __mutex_lock+0xc1/0xea0 kernel/locking/mutex.c:747
       mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:799
       rtnl_lock+0x17/0x20 net/core/rtnetlink.c:79
       register_netdevice_notifier+0x25/0x1c0 net/core/dev.c:1741
       rtnetlink_init+0x3a/0x6e0 net/core/rtnetlink.c:6657
       netlink_proto_init+0x23d/0x2f0 net/netlink/af_netlink.c:2946
       do_one_initcall+0xb3/0x5f0 init/main.c:1232
       do_initcall_level init/main.c:1294 [inline]
       do_initcalls init/main.c:1310 [inline]
       do_basic_setup init/main.c:1329 [inline]
       kernel_init_freeable+0x40c/0x5d0 init/main.c:1547
       kernel_init+0x1d/0x350 init/main.c:1437
       ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147
       ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:242
-> #0 (pernet_ops_rwsem){++++}-{3:3}:
       check_prev_add kernel/locking/lockdep.c:3134 [inline]
       check_prevs_add kernel/locking/lockdep.c:3253 [inline]
       validate_chain kernel/locking/lockdep.c:3868 [inline]
       __lock_acquire+0x16b4/0x2b30 kernel/locking/lockdep.c:5136
       lock_acquire kernel/locking/lockdep.c:5753 [inline]
       lock_acquire+0xc1/0x2b0 kernel/locking/lockdep.c:5718
       down_read_killable+0x5d/0x280 kernel/locking/rwsem.c:1549
       copy_net_ns+0x216/0x590 net/core/net_namespace.c:487
       create_new_namespaces+0x2ed/0x770 kernel/nsproxy.c:110
       copy_namespaces+0x488/0x540 kernel/nsproxy.c:179
       copy_process+0x1b52/0x4680 kernel/fork.c:2504
       kernel_clone+0x116/0x660 kernel/fork.c:2914
       __do_sys_clone3+0x192/0x220 kernel/fork.c:3215
       __se_sys_clone3 kernel/fork.c:3199 [inline]
       __x64_sys_clone3+0x30/0x40 kernel/fork.c:3199
       do_syscall_x64 arch/x86/entry/common.c:51 [inline]
       do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82
       entry_SYSCALL_64_after_hwframe+0x6e/0x76
other info that might help us debug this:
Chain exists of:
  pernet_ops_rwsem --> cpu_hotplug_lock --> scx_fork_rwsem
 Possible unsafe locking scenario:
       CPU0                    CPU1
       ----                    ----
  rlock(scx_fork_rwsem);
                               lock(cpu_hotplug_lock);
                               lock(scx_fork_rwsem);
  rlock(pernet_ops_rwsem);
 *** DEADLOCK ***
1 lock held by syz-executor.0/2181:
 #0: ffffffff8449dc50 (scx_fork_rwsem){++++}-{0:0}, at: sched_fork+0x3b/0x190 kernel/sched/core.c:4810
stack backtrace:
CPU: 0 PID: 2181 Comm: syz-executor.0 Not tainted 6.6.0-g2f6ba98e2d3d #4
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
Sched_ext: serialise (enabled), task: runnable_at=-6ms
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:89 [inline]
 dump_stack_lvl+0x91/0xf0 lib/dump_stack.c:107
 dump_stack+0x15/0x20 lib/dump_stack.c:114
 check_noncircular+0x134/0x150 kernel/locking/lockdep.c:2187
 check_prev_add kernel/locking/lockdep.c:3134 [inline]
 check_prevs_add kernel/locking/lockdep.c:3253 [inline]
 validate_chain kernel/locking/lockdep.c:3868 [inline]
 __lock_acquire+0x16b4/0x2b30 kernel/locking/lockdep.c:5136
 lock_acquire kernel/locking/lockdep.c:5753 [inline]
 lock_acquire+0xc1/0x2b0 kernel/locking/lockdep.c:5718
 down_read_killable+0x5d/0x280 kernel/locking/rwsem.c:1549
 copy_net_ns+0x216/0x590 net/core/net_namespace.c:487
 create_new_namespaces+0x2ed/0x770 kernel/nsproxy.c:110
 copy_namespaces+0x488/0x540 kernel/nsproxy.c:179
 copy_process+0x1b52/0x4680 kernel/fork.c:2504
 kernel_clone+0x116/0x660 kernel/fork.c:2914
 __do_sys_clone3+0x192/0x220 kernel/fork.c:3215
 __se_sys_clone3 kernel/fork.c:3199 [inline]
 __x64_sys_clone3+0x30/0x40 kernel/fork.c:3199
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x46/0x100 arch/x86/entry/common.c:82
 entry_SYSCALL_64_after_hwframe+0x6e/0x76
RIP: 0033:0x7f9f764e240d
Code: c3 e8 97 2b 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f9f75851ee8 EFLAGS: 00000246 ORIG_RAX: 00000000000001b3
RAX: ffffffffffffffda RBX: 00007f9f7661ef80 RCX: 00007f9f764e240d
RDX: 0000000000000100 RSI: 0000000000000058 RDI: 00007f9f75851f00
RBP: 00007f9f765434a6 R08: 0000000000000000 R09: 0000000000000058
R10: 00007f9f75851f00 R11: 0000000000000246 R12: 0000000000000058
R13: 0000000000000006 R14: 00007f9f7661ef80 R15: 00007f9f75832000
 </TASK>

The issue is that scx_ops_enable() and scx_ops_disable() acquire
cpus_read_lock() _before_ acquiring scx_fork_rwsem, while the fork()
path acquires and holds scx_fork_rwsem around basically everything it
does, including copy_net_ns(), which takes pernet_ops_rwsem. Holders of
pernet_ops_rwsem can in turn take cpu_hotplug_lock (for example via
flush_all_backlogs() during network-namespace cleanup), which closes the
circular dependency that lockdep is reporting. I don't see how a
deadlock could actually occur in practice, but it should be safe to
acquire the scx_fork_rwsem and scx_cgroup_rwsem semaphores before the
hotplug lock, so let's do that.

Reported-by: Han Xing Yi <[email protected]>
Signed-off-by: David Vernet <[email protected]>
Byte-Lab committed Jan 8, 2024
1 parent 8ccf9d7 commit c3c7041
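
The report above is a circular lock-dependency (ABBA) warning: two paths take
overlapping sets of locks in opposite orders. As a minimal userspace
illustration (hypothetical pthread code, not from the kernel; lock_a plays the
role of scx_fork_rwsem and lock_b the role of cpu_hotplug_lock), the shape of
the bug looks like this:

/*
 * Hypothetical ABBA deadlock sketch; build with: cc -pthread abba.c
 * path1 mimics the old scx_ops_enable() order (B then A); path2 mimics
 * the fork() path (A then, transitively, B).
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

static void *path1(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_b);
	usleep(1000);			/* widen the race window */
	pthread_mutex_lock(&lock_a);	/* blocks while path2 holds A */
	pthread_mutex_unlock(&lock_a);
	pthread_mutex_unlock(&lock_b);
	return NULL;
}

static void *path2(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_a);
	usleep(1000);
	pthread_mutex_lock(&lock_b);	/* blocks while path1 holds B: deadlock */
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, path1, NULL);
	pthread_create(&t2, NULL, path2, NULL);
	pthread_join(t1, NULL);		/* may never return */
	pthread_join(t2, NULL);
	puts("no deadlock this run; the race is timing-dependent");
	return 0;
}

The fix below is the standard cure: pick one global order for the locks
involved and make every path follow it, so the dependency graph stays acyclic.
Lockdep flags the inversion even when, as here, an actual hang may be hard to
hit in practice.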
Showing 1 changed file with 20 additions and 19 deletions.
kernel/sched/ext.c
@@ -3155,9 +3155,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	WRITE_ONCE(scx_switching_all, false);
 
 	/* avoid racing against fork and cgroup changes */
-	cpus_read_lock();
 	percpu_down_write(&scx_fork_rwsem);
 	scx_cgroup_lock();
+	cpus_read_lock();
 
 	spin_lock_irq(&scx_tasks_lock);
 	scx_task_iter_init(&sti);
@@ -3196,9 +3196,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 	scx_cgroup_exit();
 
-	cpus_read_unlock();
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
+	cpus_read_unlock();
 
 	if (ei->kind >= SCX_EXIT_ERROR) {
 		printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
@@ -3353,9 +3353,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	atomic_long_set(&scx_nr_rejected, 0);
 
 	/*
-	 * Keep CPUs stable during enable so that the BPF scheduler can track
-	 * online CPUs by watching ->on/offline_cpu() after ->init().
+	 * Lock out forks, cgroup on/offlining and moves before opening the
+	 * floodgate so that they don't wander into the operations prematurely.
+	 *
+	 * Also keep CPUs stable during enable so that the BPF scheduler can
+	 * track online CPUs by watching ->on/offline_cpu() after ->init().
+	 *
+	 * Acquire scx_fork_rwsem and scx_group_rwsem before the hotplug lock.
+	 * cpus_read_lock() is acquired in a ton of places, so let's be a bit
+	 * cautious to avoid possible deadlock.
 	 */
+	percpu_down_write(&scx_fork_rwsem);
+	scx_cgroup_lock();
 	cpus_read_lock();
 
 	scx_switch_all_req = false;
@@ -3399,13 +3408,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
 			   scx_watchdog_timeout / 2);
 
-	/*
-	 * Lock out forks, cgroup on/offlining and moves before opening the
-	 * floodgate so that they don't wander into the operations prematurely.
-	 */
-	percpu_down_write(&scx_fork_rwsem);
-	scx_cgroup_lock();
-
 	for (i = 0; i < SCX_NR_ONLINE_OPS; i++)
 		if (((void (**)(void))ops)[i])
 			static_branch_enable_cpuslocked(&scx_has_op[i]);
@@ -3431,7 +3433,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	 */
 	ret = scx_cgroup_init();
 	if (ret)
-		goto err_disable_unlock;
+		goto err_disable;
 
 	static_branch_enable_cpuslocked(&__scx_ops_enabled);
 
@@ -3457,7 +3459,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		spin_unlock_irq(&scx_tasks_lock);
 		pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
 		       ret, p->comm, p->pid);
-		goto err_disable_unlock;
+		goto err_disable;
 	}
 
 	put_task_struct(p);
@@ -3481,7 +3483,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		preempt_enable();
 		spin_unlock_irq(&scx_tasks_lock);
 		ret = -EBUSY;
-		goto err_disable_unlock;
+		goto err_disable;
 	}
 
 	/*
@@ -3515,8 +3517,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 
 	spin_unlock_irq(&scx_tasks_lock);
 	preempt_enable();
-	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 
 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
 		ret = -EBUSY;
@@ -3527,6 +3527,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 		static_branch_enable_cpuslocked(&__scx_switched_all);
 
 	cpus_read_unlock();
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 	mutex_unlock(&scx_ops_enable_mutex);
 
 	scx_cgroup_config_knobs();
@@ -3537,11 +3539,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	mutex_unlock(&scx_ops_enable_mutex);
 	return ret;
 
-err_disable_unlock:
-	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
 err_disable:
 	cpus_read_unlock();
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 	mutex_unlock(&scx_ops_enable_mutex);
 	/* must be fully disabled before returning */
 	scx_ops_disable(SCX_EXIT_ERROR);
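
Net effect of the hunks above: both scx_ops_enable() and
scx_ops_disable_workfn() now use one consistent ordering. As a sketch
(condensed from the diff, not a verbatim excerpt), every path does:

	percpu_down_write(&scx_fork_rwsem);	/* 1. lock out the fork() path */
	scx_cgroup_lock();			/* 2. lock out cgroup on/offlining and moves */
	cpus_read_lock();			/* 3. hotplug read lock taken last */

	/* ... enable/disable work ... */

	cpus_read_unlock();			/* dropped in reverse order */
	scx_cgroup_unlock();
	percpu_up_write(&scx_fork_rwsem);

Release order cannot itself create a lockdep cycle, but unwinding in reverse
keeps the nesting obvious. What matters is that cpu_hotplug_lock is now
innermost on every sched_ext path, so the cpu_hotplug_lock --> scx_fork_rwsem
edge that closed the reported cycle is gone.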
