Skip to content

Commit

Permalink
rseq: Validate read-only fields under DEBUG_RSEQ config
Browse files Browse the repository at this point in the history
The rseq uapi requires cooperation between users of the rseq fields
to ensure that all libraries and applications using rseq within a
process do not interfere with each other.

This is especially important for fields which are meant to be read-only
from user-space, as documented in uapi/linux/rseq.h:

  - cpu_id_start,
  - cpu_id,
  - node_id,
  - mm_cid.

Storing to those fields from a user-space library prevents any sharing
of the rseq ABI with other libraries and applications, as other users
are not aware that the content of those fields has been altered by a
third-party library.

This is unfortunately the current behavior of tcmalloc: it purposefully
overlaps part of a cached value with the cpu_id_start upper bits to get
notified about preemption, because the kernel clears those upper bits
before returning to user-space. This behavior does not conform to the
rseq uapi header ABI.

This prevents tcmalloc from using rseq when rseq is registered by the
GNU C library 2.35+. It requires tcmalloc users to disable glibc rseq
registration with a glibc tunable, which is a sad state of affairs.

Considering that tcmalloc and the GNU C library are the two first
upstream projects using rseq, and that they are already incompatible due
to use of this hack, adding kernel-level validation of all read-only
fields content is necessary to ensure future users of rseq abide by the
rseq ABI requirements.

Validate that user-space does not corrupt the read-only fields and
conform to the rseq uapi header ABI when the kernel is built with
CONFIG_DEBUG_RSEQ=y. This is done by storing a copy of the read-only
fields in the task_struct, and validating the prior values present in
user-space before updating them. If the values do not match, print
a warning on the console (printk_ratelimited()).

This is a first step to identify misuses of the rseq ABI by printing
a warning on the console. After a giving some time to userspace to
correct its use of rseq, the plan is to eventually terminate offending
processes with SIGSEGV.

This change is expected to produce warnings for the upstream tcmalloc
implementation, but tcmalloc developers mentioned they were open to
adapt their implementation to kernel-level change.

Link: https://lore.kernel.org/all/CACT4Y+beLh1qnHF9bxhMUcva8KyuvZs7Mg_31SGK5xSoR=3m1A@mail.gmail.com/
Link: google/tcmalloc#144
Signed-off-by: Mathieu Desnoyers <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Boqun Feng <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Oskolkov <[email protected]>
Cc: Dmitry Vyukov <[email protected]>
Cc: Florian Weimer <[email protected]>
Cc: Carlos O'Donell <[email protected]>
Cc: DJ Delorie <[email protected]>
Cc: [email protected]
  • Loading branch information
compudj committed Oct 18, 2024
1 parent 7aa21fe commit e3bf38c
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 7 deletions.
3 changes: 3 additions & 0 deletions include/linux/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -1340,6 +1340,9 @@ struct task_struct {
* with respect to preemption.
*/
unsigned long rseq_event_mask;
# ifdef CONFIG_DEBUG_RSEQ
struct rseq rseq_fields;
# endif
#endif

#ifdef CONFIG_SCHED_MM_CID
Expand Down
87 changes: 80 additions & 7 deletions kernel/rseq.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,42 @@
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)

#ifdef CONFIG_DEBUG_RSEQ
static int rseq_validate_ro_fields(struct task_struct *t)
{
struct rseq __user *rseq = t->rseq;
u32 cpu_id_start, cpu_id, node_id, mm_cid;

/*
* Validate fields which are required to be read-only by
* user-space.
*/
if (!user_read_access_begin(rseq, t->rseq_len))
return -EFAULT;
unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
unsafe_get_user(node_id, &rseq->node_id, efault_end);
unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
user_read_access_end();

if (cpu_id_start != t->rseq_fields.cpu_id_start ||
cpu_id != t->rseq_fields.cpu_id ||
node_id != t->rseq_fields.node_id ||
mm_cid != t->rseq_fields.mm_cid)
printk_ratelimited(KERN_WARNING
"Detected rseq ABI read-only fields corruption by user-space. (pid=%d).\n",
t->pid);

/* For now, only print a console warning on mismatch. */
return 0;
}
#else
static int rseq_validate_ro_fields(struct task_struct *t)
{
return 0;
}
#endif

/*
*
* Restartable sequences are a lightweight interface that allows
Expand Down Expand Up @@ -88,14 +124,18 @@
static int rseq_update_cpu_node_id(struct task_struct *t)
{
struct rseq __user *rseq = t->rseq;
u32 cpu_id = raw_smp_processor_id();
u32 node_id = cpu_to_node(cpu_id);
u32 mm_cid = task_mm_cid(t);
u32 cpu_id_start, cpu_id, node_id, mm_cid;

if (rseq_validate_ro_fields(t))
goto efault;
cpu_id_start = cpu_id = raw_smp_processor_id();
node_id = cpu_to_node(cpu_id);
mm_cid = task_mm_cid(t);
WARN_ON_ONCE((int) mm_cid < 0);

if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
unsafe_put_user(node_id, &rseq->node_id, efault_end);
unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end);
Expand All @@ -105,20 +145,33 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
* t->rseq_len != ORIG_RSEQ_SIZE.
*/
user_write_access_end();
#ifdef CONFIG_DEBUG_RSEQ
/* Save a copy of the values which are read-only in kernel space. */
t->rseq_fields.cpu_id_start = cpu_id_start;
t->rseq_fields.cpu_id = cpu_id;
t->rseq_fields.node_id = node_id;
t->rseq_fields.mm_cid = mm_cid;
#endif
trace_rseq_update(t);
return 0;

efault_end:
user_write_access_end();
efault:
return -EFAULT;
return -1;
}

static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
mm_cid = 0;
u32 cpu_id_start, cpu_id, node_id, mm_cid;

if (!rseq_validate_ro_fields(t))
return -EFAULT;

cpu_id_start = 0;
cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
node_id = 0;
mm_cid = 0;
/*
* Reset cpu_id_start to its initial state (0).
*/
Expand All @@ -141,6 +194,15 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
*/
if (put_user(mm_cid, &t->rseq->mm_cid))
return -EFAULT;
#ifdef CONFIG_DEBUG_RSEQ
/*
* Reset the in-kernel rseq fields copy.
*/
t->rseq_fields.cpu_id_start = cpu_id_start;
t->rseq_fields.cpu_id = cpu_id;
t->rseq_fields.node_id = node_id;
t->rseq_fields.mm_cid = mm_cid;
#endif
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally reset only if
Expand Down Expand Up @@ -423,6 +485,17 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
current->rseq = rseq;
current->rseq_len = rseq_len;
current->rseq_sig = sig;
#ifdef CONFIG_DEBUG_RSEQ
/*
* Initialize the in-kernel rseq fields copy for validation of
* read-only fields.
*/
if (get_user(current->rseq_fields.cpu_id_start, &current->rseq->cpu_id_start) ||
get_user(current->rseq_fields.cpu_id, &current->rseq->cpu_id) ||
get_user(current->rseq_fields.node_id, &current->rseq->node_id) ||
get_user(current->rseq_fields.mm_cid, &current->rseq->mm_cid))
return -EFAULT;
#endif
/*
* If rseq was previously inactive, and has just been
* registered, ensure the cpu_id_start and cpu_id fields
Expand Down

0 comments on commit e3bf38c

Please sign in to comment.