From 31512784209bca035059aca4cf8e90e5936e3978 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Tue, 11 Jul 2023 16:31:34 -0700 Subject: [PATCH 1/2] Implemented the clone fallback when clone3 returns ENOSYS For a number of reasons, platforms can choose to block clone3 and force return ENOSYS. We implement a clone fallback in the case that we can't use clone3. Also, clone3 has no libc wrapper at this point. The current implementation calls the kernel version of the syscall directly. There are undefined behaviors potentially when we create process bypassing the libc. However, we have not observed any issue with our tests. This is likely because `youki` runs short lived process and calls exec or exit in the end. Nonetheless, we should have a backup plan and this change is our way out in the case that we discover clone3 has issue as the default code path. Remove the use of the clone3 crate. We use `clone3` is a very specific way to create a process. We don't have to support the many other flags and usecases of the `clone3` call. So it is simpler for us to use the libc crate directly for the syscall. This avoids an extra dependency and reduces our binary size. Signed-off-by: yihuaf --- Cargo.lock | 123 ++---- crates/libcontainer/Cargo.toml | 1 - .../process/container_intermediate_process.rs | 87 +++-- .../src/process/container_main_process.rs | 57 ++- crates/libcontainer/src/process/fork.rs | 354 ++++++++++++++---- 5 files changed, 410 insertions(+), 212 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 75c72540f..0f898c4bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -43,7 +43,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "once_cell", "version_check", ] @@ -191,7 +191,7 @@ checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12" dependencies = [ "addr2line 0.20.0", "cc", - "cfg-if 1.0.0", + "cfg-if", "libc", "miniz_oxide", "object 0.31.1", @@ -407,12 +407,6 @@ dependencies = [ "nom", ] -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - [[package]] name = "cfg-if" version = "1.0.0" @@ -495,16 +489,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "clone3" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee4e061ea30800291ca09663878f3953840a69b08ce244b3e8b26e894d9f60f" -dependencies = [ - "bitflags 1.3.2", - "uapi", -] - [[package]] name = "cmake" version = "0.1.50" @@ -565,7 +549,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9847f90f32a50b0dcbd68bc23ff242798b13080b97b0569f6ed96a45ce4cf2cd" dependencies = [ "autocfg", - "cfg-if 1.0.0", + "cfg-if", "libc", "scopeguard", "windows-sys 0.33.0", @@ -577,7 +561,7 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -786,7 +770,7 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -801,7 +785,7 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-channel", "crossbeam-deque", "crossbeam-epoch", @@ -815,7 +799,7 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", ] @@ -825,7 +809,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] @@ -837,7 +821,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" dependencies = [ "autocfg", - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", "memoffset 0.9.0", "scopeguard", @@ -849,7 +833,7 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", ] @@ -859,7 +843,7 @@ version = "0.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -947,7 +931,7 @@ version = "5.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "hashbrown 0.14.0", "lock_api", "once_cell", @@ -1038,7 +1022,7 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "339ee130d97a610ea5a5872d2bbb130fdf68884ff09d3028b81bec8a1ac23bbc" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "dirs-sys-next", ] @@ -1097,7 +1081,7 @@ version = "0.8.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -1209,7 +1193,7 @@ version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b0377f1edc77dbd1118507bc7a66e4ab64d2b90c66f90726dc801e73a8c68f9" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "rustix 0.38.1", "windows-sys 0.48.0", ] @@ -1230,7 +1214,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "redox_syscall 0.2.16", "windows-sys 0.48.0", @@ -1448,7 +1432,7 @@ version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "js-sys", "libc", "wasi", @@ -1965,7 +1949,6 @@ dependencies = [ "bitflags 2.3.3", "caps", "chrono", - "clone3", "fastrand", "futures", "libc", @@ -2004,7 +1987,7 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "winapi", ] @@ -2204,7 +2187,7 @@ version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c84490118f2ee2d74570d114f3d0493cbf02790df303d2707606c3e14e07c96" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "downcast", "fragile", "lazy_static", @@ -2219,7 +2202,7 @@ version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22ce75669015c4f47b289fd4d4f56e894e4c96003ffdf3ac51313126f94c6cbb" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "proc-macro2", "quote", "syn 1.0.109", @@ -2256,7 +2239,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags 1.3.2", - "cfg-if 1.0.0", + "cfg-if", "libc", "memoffset 0.7.1", "pin-utils", @@ -2403,7 +2386,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" dependencies = [ "bitflags 1.3.2", - "cfg-if 1.0.0", + "cfg-if", "foreign-types", "libc", "once_cell", @@ -2462,7 +2445,7 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "redox_syscall 0.3.5", "smallvec", @@ -3478,7 +3461,7 @@ version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "cpufeatures", "digest", ] @@ -3736,7 +3719,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5486094ee78b2e5038a6382ed7645bc084dc2ec433426ca4c3cb61e2007b8998" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "fastrand", "redox_syscall 0.3.5", "rustix 0.38.1", @@ -3811,7 +3794,7 @@ version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "once_cell", ] @@ -4016,7 +3999,7 @@ version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -4109,32 +4092,6 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" -[[package]] -name = "uapi" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "019450240401d342e2a5bc47f7fbaeb002a38fe18197b83788750d7ffb143274" -dependencies = [ - "cc", - "cfg-if 0.1.10", - "libc", - "uapi-proc", -] - -[[package]] -name = "uapi-proc" -version = "0.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54de46f980cea7b2ae8d8f7f9f1c35cf7062c68343e99345ef73758f8e60975a" -dependencies = [ - "lazy_static", - "libc", - "proc-macro2", - "quote", - "regex", - "syn 1.0.109", -] - [[package]] name = "unicase" version = "2.6.0" @@ -4479,7 +4436,7 @@ version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "wasm-bindgen-macro", ] @@ -4527,7 +4484,7 @@ version = "0.4.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "js-sys", "wasm-bindgen", "web-sys", @@ -4645,7 +4602,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea790bcdfb4e6e9d1e5ddf75b4699aac62b078fcc9f27f44e1748165ceea67bf" dependencies = [ "bytes", - "cfg-if 1.0.0", + "cfg-if", "derivative", "indexmap 1.9.3", "js-sys", @@ -4673,7 +4630,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f093937725e242e5529fed27e08ff836c011a9ecc22e6819fb818c2ac6ff5f88" dependencies = [ "backtrace", - "cfg-if 1.0.0", + "cfg-if", "enum-iterator", "enumset", "lazy_static", @@ -4762,7 +4719,7 @@ checksum = "a1e000c2cbd4f9805427af5f3b3446574caf89ab3a1e66c2f3579fbde22b072b" dependencies = [ "backtrace", "cc", - "cfg-if 1.0.0", + "cfg-if", "corosensei", "dashmap", "derivative", @@ -4791,7 +4748,7 @@ dependencies = [ "async-trait", "bincode", "bytes", - "cfg-if 1.0.0", + "cfg-if", "cooked-waker", "dashmap", "derivative", @@ -4846,7 +4803,7 @@ dependencies = [ "anyhow", "bitflags 1.3.2", "byteorder", - "cfg-if 1.0.0", + "cfg-if", "num_enum", "serde", "time 0.2.27", @@ -4910,7 +4867,7 @@ dependencies = [ "async-trait", "bincode", "bumpalo", - "cfg-if 1.0.0", + "cfg-if", "encoding_rs", "fxprof-processed-profile", "indexmap 1.9.3", @@ -4944,7 +4901,7 @@ version = "10.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "284466ef356ce2d909bc0ad470b60c4d0df5df2de9084457e118131b3c779b92" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -5056,7 +5013,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14309cbdf2c395258b124a24757c727403070c0465a28bcc780c4f82f4bca5ff" dependencies = [ "cc", - "cfg-if 1.0.0", + "cfg-if", "rustix 0.37.19", "wasmtime-asm-macros", "windows-sys 0.48.0", @@ -5071,7 +5028,7 @@ dependencies = [ "addr2line 0.19.0", "anyhow", "bincode", - "cfg-if 1.0.0", + "cfg-if", "cpp_demangle", "gimli 0.27.2", "ittapi", @@ -5105,7 +5062,7 @@ version = "10.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b49ceb7e2105a8ebe5614d7bbab6f6ef137a284e371633af60b34925493081f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "windows-sys 0.48.0", ] @@ -5118,7 +5075,7 @@ checksum = "3a5de4762421b0b2b19e02111ca403632852b53e506e03b4b227ffb0fbfa63c2" dependencies = [ "anyhow", "cc", - "cfg-if 1.0.0", + "cfg-if", "encoding_rs", "indexmap 1.9.3", "libc", diff --git a/crates/libcontainer/Cargo.toml b/crates/libcontainer/Cargo.toml index 3f701c5ca..d1d95fea8 100644 --- a/crates/libcontainer/Cargo.toml +++ b/crates/libcontainer/Cargo.toml @@ -36,7 +36,6 @@ libseccomp = { version = "0.3.0", optional=true } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" rust-criu = "0.4.0" -clone3 = "0.2.3" regex = "1.9.1" thiserror = "1.0.43" tracing = { version = "0.1.37", features = ["attributes"]} diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 1d8b8918d..1d02637a5 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -8,6 +8,7 @@ use procfs::process::Process; use super::args::{ContainerArgs, ContainerType}; use super::container_init_process::container_init_process; +use super::fork::CloneCb; #[derive(Debug, thiserror::Error)] pub enum IntermediateProcessError { @@ -36,7 +37,7 @@ pub fn container_intermediate_process( intermediate_chan: &mut (channel::IntermediateSender, channel::IntermediateReceiver), init_chan: &mut (channel::InitSender, channel::InitReceiver), main_sender: &mut channel::MainSender, -) -> Result { +) -> Result<()> { let (inter_sender, inter_receiver) = intermediate_chan; let (init_sender, init_receiver) = init_chan; let command = args.syscall.create_syscall(); @@ -109,6 +110,51 @@ pub fn container_intermediate_process( namespaces.unshare_or_setns(pid_namespace)?; } + let cb: CloneCb = { + let args = args.clone(); + let init_sender = init_sender.clone(); + let inter_sender = inter_sender.clone(); + let mut main_sender = main_sender.clone(); + let mut init_receiver = init_receiver.clone(); + Box::new(move || { + if let Err(ret) = prctl::set_name("youki:[2:INIT]") { + tracing::error!(?ret, "failed to set name for child process"); + return ret; + } + + // We are inside the forked process here. The first thing we have to do + // is to close any unused senders, since fork will make a dup for all + // the socket. + if let Err(err) = init_sender.close() { + tracing::error!(?err, "failed to close receiver in init process"); + return -1; + } + if let Err(err) = inter_sender.close() { + tracing::error!(?err, "failed to close sender in the intermediate process"); + return -1; + } + match container_init_process(&args, &mut main_sender, &mut init_receiver) { + Ok(_) => 0, + Err(e) => { + if let ContainerType::TenantContainer { exec_notify_fd } = args.container_type { + let buf = format!("{e}"); + if let Err(err) = write(exec_notify_fd, buf.as_bytes()) { + tracing::error!(?err, "failed to write to exec notify fd"); + return -1; + } + if let Err(err) = close(exec_notify_fd) { + tracing::error!(?err, "failed to close exec notify fd"); + return -1; + } + } + + tracing::error!("failed to initialize container process: {e}"); + -1 + } + } + }) + }; + // We have to record the pid of the init process. The init process will be // inside the pid namespace, so we can't rely on the init process to send us // the correct pid. We also want to clone the init process as a sibling @@ -117,41 +163,7 @@ pub fn container_intermediate_process( // configuration. The youki main process can decide what to do with the init // process and the intermediate process can just exit safely after the job // is done. - let pid = fork::container_clone_sibling("youki:[2:INIT]", || { - // We are inside the forked process here. The first thing we have to do - // is to close any unused senders, since fork will make a dup for all - // the socket. - init_sender.close().map_err(|err| { - tracing::error!("failed to close receiver in init process: {}", err); - IntermediateProcessError::Channel(err) - })?; - inter_sender.close().map_err(|err| { - tracing::error!( - "failed to close sender in the intermediate process: {}", - err - ); - IntermediateProcessError::Channel(err) - })?; - match container_init_process(args, main_sender, init_receiver) { - Ok(_) => Ok(0), - Err(e) => { - if let ContainerType::TenantContainer { exec_notify_fd } = args.container_type { - let buf = format!("{e}"); - write(exec_notify_fd, buf.as_bytes()).map_err(|err| { - tracing::error!("failed to write to exec notify fd: {}", err); - IntermediateProcessError::ExecNotify(err) - })?; - close(exec_notify_fd).map_err(|err| { - tracing::error!("failed to close exec notify fd: {}", err); - IntermediateProcessError::ExecNotify(err) - })?; - } - tracing::error!("failed to initialize container process: {e}"); - Err(e.into()) - } - } - }) - .map_err(|err| { + let pid = fork::container_clone_sibling(cb).map_err(|err| { tracing::error!("failed to fork init process: {}", err); IntermediateProcessError::InitProcess(err) })?; @@ -185,7 +197,8 @@ pub fn container_intermediate_process( tracing::error!("failed to close unused init sender: {}", err); err })?; - Ok(pid) + + Ok(()) } fn apply_cgroups< diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index f3024caf6..38f7c0d60 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -1,6 +1,8 @@ use crate::{ process::{ - args::ContainerArgs, channel, container_intermediate_process, fork, + args::ContainerArgs, + channel, container_intermediate_process, + fork::{self, CloneCb}, intel_rdt::setup_intel_rdt, }, rootless::Rootless, @@ -37,33 +39,52 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo // cloned process, we have to be deligent about closing any unused channel. // At minimum, we have to close down any unused senders. The corresponding // receivers will be cleaned up once the senders are closed down. - let (main_sender, main_receiver) = &mut channel::main_channel()?; - let inter_chan = &mut channel::intermediate_channel()?; - let init_chan = &mut channel::init_channel()?; + let (main_sender, mut main_receiver) = channel::main_channel()?; + let inter_chan = channel::intermediate_channel()?; + let init_chan = channel::init_channel()?; - let intermediate_pid = fork::container_fork("youki:[1:INTER]", || { - container_intermediate_process::container_intermediate_process( - container_args, - inter_chan, - init_chan, - main_sender, - )?; + let cb: CloneCb = { + let container_args = container_args.clone(); + let mut main_sender = main_sender.clone(); + let mut inter_chan = inter_chan.clone(); + let mut init_chan = init_chan.clone(); + Box::new(move || { + if let Err(ret) = prctl::set_name("youki:[1:INTER]") { + tracing::error!(?ret, "failed to set name for child process"); + return ret; + } + + match container_intermediate_process::container_intermediate_process( + &container_args, + &mut inter_chan, + &mut init_chan, + &mut main_sender, + ) { + Ok(_) => 0, + Err(err) => { + tracing::error!(?err, "failed to run intermediate process"); + -1 + } + } + }) + }; - Ok(0) - }) - .map_err(|err| { + let intermediate_pid = fork::container_clone(cb).map_err(|err| { tracing::error!("failed to fork intermediate process: {}", err); ProcessError::IntermediateProcessFailed(err) })?; // Close down unused fds. The corresponding fds are duplicated to the - // child process during fork. + // child process during clone. main_sender.close().map_err(|err| { tracing::error!("failed to close unused sender: {}", err); err })?; - let (inter_sender, inter_receiver) = inter_chan; + let (mut inter_sender, inter_receiver) = inter_chan; + #[cfg(feature = "libseccomp")] + let (mut init_sender, init_receiver) = init_chan; + #[cfg(not(feature = "libseccomp"))] let (init_sender, init_receiver) = init_chan; // If creating a rootless container, the intermediate process will ask @@ -106,8 +127,8 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo crate::process::seccomp_listener::sync_seccomp( seccomp, &state, - init_sender, - main_receiver, + &mut init_sender, + &mut main_receiver, )?; } diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index a3557a531..114a22440 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -1,98 +1,242 @@ +use std::{ffi::c_int, num::NonZeroUsize}; + use libc::SIGCHLD; -use nix::unistd::Pid; -use prctl; +use nix::{ + sys::{mman, resource}, + unistd::Pid, +}; #[derive(Debug, thiserror::Error)] pub enum CloneError { - #[error("failed to clone process using clone3")] + #[error("failed to clone process")] Clone(#[source] nix::Error), + #[error("failed to get system memory page size")] + PageSize(#[source] nix::Error), + #[error("failed to get resource limit")] + ResourceLimit(#[source] nix::Error), + #[error("the stack size is zero")] + ZeroStackSize, + #[error("failed to allocate stack")] + StackAllocation(#[source] nix::Error), + #[error("failed to create stack guard page")] + GuardPage(#[source] nix::Error), + #[error("unknown error code {0}")] + UnknownErrno(i32), } -type Result = std::result::Result; - -#[derive(Debug, thiserror::Error)] -pub enum CallbackError { - #[error(transparent)] - IntermediateProcess( - #[from] crate::process::container_intermediate_process::IntermediateProcessError, - ), - #[error(transparent)] - InitProcess(#[from] crate::process::container_init_process::InitProcessError), - // Need a fake error for testing - #[error("unknown")] - #[cfg(test)] - Test, -} - -type CallbackResult = std::result::Result; +/// The callback function used in clone system call. The return value is i32 +/// which is consistent with C functions return code. The trait has to be +/// `FnMut` because we need to be able to call the closure multiple times, once +/// in clone3 and once in clone if fallback is required. The closure is boxed +/// because we need to store the closure on heap, not stack in the case of +/// `clone`. Unlike `fork` or `clone3`, the `clone` glibc wrapper requires us to +/// pass in a child stack, which is empty. By storing the closure in heap, we +/// can then in the new process to re-box the heap memory back to a closure +/// correctly. +pub type CloneCb = Box i32>; -// Fork/Clone a sibling process that shares the same parent as the calling +// Clone a sibling process that shares the same parent as the calling // process. This is used to launch the container init process so the parent // process of the calling process can receive ownership of the process. If we // clone a child process as the init process, the calling process (likely the // youki main process) will exit and the init process will be re-parented to the // process 1 (system init process), which is not the right behavior of what we // look for. -pub fn container_clone_sibling CallbackResult>( - child_name: &str, - cb: F, -) -> Result { - let mut clone = clone3::Clone3::default(); +pub fn container_clone_sibling(cb: CloneCb) -> Result { // Note: normally, an exit signal is required, but when using // `CLONE_PARENT`, the `clone3` will return EINVAL if an exit signal is set. // The older `clone` will not return EINVAL in this case. Instead it ignores - // the exit signal bits in the glibc wrapper. - clone.flag_parent(); + // the exit signal bits in the glibc wrapper. Therefore, we explicitly set + // the exit_signal to None here, so this works for both version of clone. + clone_internal(cb, libc::CLONE_PARENT as u64, None) +} - container_clone(child_name, cb, clone) +// Clone a child process and execute the callback. +pub fn container_clone(cb: CloneCb) -> Result { + clone_internal(cb, 0, Some(SIGCHLD as u64)) +} + +// An internal wrapper to manage the clone3 vs clone fallback logic. +fn clone_internal( + mut cb: CloneCb, + flags: u64, + exit_signal: Option, +) -> Result { + match clone3(&mut cb, flags, exit_signal) { + Ok(pid) => Ok(pid), + // For now, we decide to only fallback on ENOSYS + Err(CloneError::Clone(nix::Error::ENOSYS)) => { + tracing::debug!("clone3 is not supported, fallback to clone"); + let pid = clone(cb, flags, exit_signal)?; + + Ok(pid) + } + Err(err) => Err(err), + } } -// A simple clone wrapper to clone3 so we can share this logic in different -// fork/clone situations. We decided to minimally support kernel version >= 5.4, -// and `clone3` requires only kernel version >= 5.3. Therefore, we don't need to -// fall back to `clone` or `fork`. -fn container_clone CallbackResult>( - child_name: &str, - cb: F, - mut clone_cmd: clone3::Clone3, -) -> Result { - // Return the child's pid in case of parent/calling process, and for the - // cloned process, run the callback function, and exit with the same exit - // code returned by the callback. If there was any error when trying to run - // callback, exit with -1 - match unsafe { - clone_cmd - .call() - .map_err(|err| CloneError::Clone(nix::errno::from_i32(err.0)))? - } { +// Unlike the clone call, clone3 is currently using the kernel syscall, mimicking +// the interface of fork. There is not need to explicitly manage the memory, so +// we can safely passing the callback closure as reference. +fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result { + #[repr(C)] + struct clone3_args { + flags: u64, + pidfd: u64, + child_tid: u64, + parent_tid: u64, + exit_signal: u64, + stack: u64, + stack_size: u64, + tls: u64, + set_tid: u64, + set_tid_size: u64, + cgroup: u64, + } + let mut args = clone3_args { + flags, + pidfd: 0, + child_tid: 0, + parent_tid: 0, + exit_signal: exit_signal.unwrap_or(0), + stack: 0, + stack_size: 0, + tls: 0, + set_tid: 0, + set_tid_size: 0, + cgroup: 0, + }; + let args_ptr = &mut args as *mut clone3_args; + let args_size = std::mem::size_of::(); + // For now, we can only use clone3 as a kernel syscall. Libc wrapper is not + // available yet. This can have undefined behavior because libc authors do + // not like people calling kernel syscall to directly create processes. Libc + // does perform additional bookkeeping when calling clone or fork. So far, + // we have not observed any issues with calling clone3 directly, but we + // should keep an eye on it. + match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } { + -1 => Err(CloneError::Clone(nix::Error::last())), 0 => { - prctl::set_name(child_name).expect("failed to set name"); - // Inside the cloned process - let ret = match cb() { - Err(error) => { - tracing::debug!("failed to run child process in clone: {:?}", error); - -1 - } - Ok(exit_code) => exit_code, - }; - std::process::exit(ret); + // Inside the cloned process, we execute the callback and exit with + // the return code. + std::process::exit(cb()); } - pid => Ok(Pid::from_raw(pid)), + ret if ret >= 0 => Ok(Pid::from_raw(ret as i32)), + ret => Err(CloneError::UnknownErrno(ret as i32)), } } -// Execute the cb in another process. Make the fork works more like thread_spawn -// or clone, so it is easier to reason. Compared to clone call, fork is easier -// to use since fork will magically take care of all the variable copying. If -// using clone, we would have to manually make sure all the variables are -// correctly send to the new process, especially Rust borrow checker will be a -// lot of hassel to deal with every details. -pub fn container_fork CallbackResult>(child_name: &str, cb: F) -> Result { - // Using `clone3` to mimic the effect of `fork`. - let mut clone = clone3::Clone3::default(); - clone.exit_signal(SIGCHLD as u64); - - container_clone(child_name, cb, clone) +fn clone(cb: CloneCb, flags: u64, exit_signal: Option) -> Result { + const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M + const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K + + // Use sysconf to find the page size. If there is an error, we assume + // the default 4K page size. + let page_size = nix::unistd::sysconf(nix::unistd::SysconfVar::PAGE_SIZE) + .map_err(CloneError::PageSize)? + .map(|size| size as usize) + .unwrap_or(DEFAULT_PAGE_SIZE); + + // Find out the default stack max size through getrlimit. + let (rlim_cur, _) = + resource::getrlimit(resource::Resource::RLIMIT_STACK).map_err(CloneError::ResourceLimit)?; + // mmap will return ENOMEM if stack size is unlimited when we create the + // child stack, so we need to set a reasonable default stack size. + let default_stack_size = if rlim_cur != u64::MAX { + rlim_cur as usize + } else { + tracing::debug!( + "stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)" + ); + DEFAULT_STACK_SIZE + }; + + // Using the clone syscall requires us to create the stack space for the + // child process instead of taken cared for us like fork call. We use mmap + // here to create the stack. Instead of guessing how much space the child + // process needs, we allocate through mmap to the system default limit, + // which is 8MB on most of the linux system today. This is OK since mmap + // will only reserve the address space upfront, instead of allocating + // physical memory upfront. The stack will grow as needed, up to the size + // reserved, so no wasted memory here. Lastly, the child stack only needs + // to support the container init process set up code in Youki. When Youki + // calls exec into the container payload, exec will reset the stack. Note, + // do not use MAP_GROWSDOWN since it is not well supported. + // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html + let child_stack = unsafe { + mman::mmap( + None, + NonZeroUsize::new(default_stack_size).ok_or(CloneError::ZeroStackSize)?, + mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE, + mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK, + -1, + 0, + ) + .map_err(CloneError::StackAllocation)? + }; + unsafe { + // Consistent with how pthread_create sets up the stack, we create a + // guard page of 1 page, to protect the child stack collision. Note, for + // clone call, the child stack will grow downward, so the bottom of the + // child stack is in the beginning. + mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE) + .map_err(CloneError::GuardPage)?; + }; + + // Since the child stack for clone grows downward, we need to pass in + // the top of the stack address. + let child_stack_top = unsafe { child_stack.add(default_stack_size) }; + + // Combine the clone flags with exit signals. + let combined_flags = (flags | exit_signal.unwrap_or(0)) as c_int; + + // We are passing the boxed closure "cb" into the clone function as the a + // function pointer in C. The box closure in Rust is both a function pointer + // and a struct. However, when casting the box closure into libc::c_void, + // the function pointer will be lost. Therefore, to work around the issue, + // we double box the closure. This is consistent with how std::unix::thread + // handles the closure. + // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs + let data = Box::into_raw(Box::new(cb)); + // The main is a wrapper function passed into clone call below. The "data" + // arg is actually a raw pointer to the Box closure. so here, we re-box the + // pointer back into a box closure so the main takes ownership of the + // memory. Then we can call the closure. + extern "C" fn main(data: *mut libc::c_void) -> libc::c_int { + unsafe { Box::from_raw(data as *mut CloneCb)() } + } + + // The nix::sched::clone wrapper doesn't provide the right interface. Using + // the clone syscall is one of the rare cases where we don't want rust to + // manage the child stack memory. Instead, we want to use c_void directly + // here. Therefore, here we are using libc::clone syscall directly for + // better control. The child stack will be cleaned when exec is called or + // the child process terminates. The nix wrapper also does not treat the + // closure memory correctly. The wrapper implementation fails to pass the + // right ownership to the new child process. + // Ref: https://github.com/nix-rust/nix/issues/919 + // Ref: https://github.com/nix-rust/nix/pull/920 + let ret = unsafe { + libc::clone( + main, + child_stack_top, + combined_flags, + data as *mut libc::c_void, + ) + }; + + // After the clone returns, the heap memory associated with the Box closure + // is duplicated in the cloned process. Therefore, we can safely re-box the + // closure from the raw pointer and let rust to continue managing the + // memory. We call drop here explicitly to avoid the warning that the + // closure is not used. This is correct since the closure is called in the + // cloned process, not the parent process. + unsafe { drop(Box::from_raw(data)) }; + match ret { + -1 => Err(CloneError::Clone(nix::Error::last())), + pid if ret > 0 => Ok(Pid::from_raw(pid)), + _ => unreachable!("clone returned a negative pid {ret}"), + } } #[cfg(test)] @@ -106,7 +250,7 @@ mod test { #[test] fn test_container_fork() -> Result<()> { - let pid = container_fork("test:child", || Ok(0))?; + let pid = container_clone(Box::new(|| 0))?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); @@ -119,7 +263,7 @@ mod test { #[test] fn test_container_err_fork() -> Result<()> { - let pid = container_fork("test:child", || Err(CallbackError::Test))?; + let pid = container_clone(Box::new(|| -1))?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); @@ -171,7 +315,7 @@ mod test { unistd::ForkResult::Child => { // Inside the forked process. We call `container_clone` and pass // the pid to the parent process. - let pid = container_clone_sibling("test:child", || Ok(0))?; + let pid = container_clone_sibling(Box::new(|| 0))?; sender.send(pid.as_raw())?; sender.close()?; std::process::exit(0); @@ -180,4 +324,68 @@ mod test { Ok(()) } + + // This test depends on libseccomp to work. + #[cfg(feature = "libseccomp")] + #[test] + fn test_clone_fallback() -> Result<()> { + use crate::test_utils::TestCallbackError; + use oci_spec::runtime::{ + Arch, LinuxSeccompAction, LinuxSeccompBuilder, LinuxSyscallBuilder, + }; + + fn has_clone3() -> bool { + // We use the probe syscall to check if the kernel supports clone3 or + // seccomp has successfully blocked clone3. + let res = unsafe { libc::syscall(libc::SYS_clone3, 0, 0) }; + let err = (res == -1) + .then(std::io::Error::last_os_error) + .expect("probe syscall should not succeed"); + err.raw_os_error() != Some(libc::ENOSYS) + } + + // To test the fallback behavior, we will create a seccomp rule that + // blocks `clone3` as ENOSYS. + let syscall = LinuxSyscallBuilder::default() + .names(vec![String::from("clone3")]) + .action(LinuxSeccompAction::ScmpActErrno) + .errno_ret(libc::ENOSYS as u32) + .build()?; + let seccomp_profile = LinuxSeccompBuilder::default() + .default_action(LinuxSeccompAction::ScmpActAllow) + .architectures(vec![Arch::ScmpArchNative]) + .syscalls(vec![syscall]) + .build()?; + + crate::test_utils::test_in_child_process(|| { + // We use seccomp to block `clone3` + let _ = prctl::set_no_new_privileges(true); + crate::seccomp::initialize_seccomp(&seccomp_profile) + .expect("failed to initialize seccomp"); + + if has_clone3() { + return Err(TestCallbackError::Custom( + "clone3 is not blocked by seccomp".into(), + )); + } + + let pid = container_clone(Box::new(|| 0)).map_err(|err| err.to_string())?; + match waitpid(pid, None).expect("wait pid failed.") { + WaitStatus::Exited(p, status) => { + assert_eq!(pid, p); + assert_eq!(status, 0); + } + status => { + return Err(TestCallbackError::Custom(format!( + "failed to wait on child process: {:?}", + status + ))); + } + }; + + Ok(()) + })?; + + Ok(()) + } } From a67d180dc059eb3295c7c9ebdc5e9cd0399f2e19 Mon Sep 17 00:00:00 2001 From: yihuaf Date: Mon, 31 Jul 2023 08:14:44 -0700 Subject: [PATCH 2/2] Add a hack-benchmark recipe We often use this to quickly get a sense of perf of a PR change. Signed-off-by: yihuaf --- justfile | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/justfile b/justfile index fc347b6bf..6bc22bdc1 100644 --- a/justfile +++ b/justfile @@ -99,6 +99,17 @@ clean-test-kind: hack-bpftrace: BPFTRACE_STRLEN=120 ./hack/debug.bt +# a hacky benchmark method we have been using casually to compare performance +hack-benchmark: + #!/usr/bin/env bash + set -euo pipefail + + hyperfine \ + --prepare 'sudo sync; echo 3 | sudo tee /proc/sys/vm/drop_caches' \ + --warmup 10 \ + --min-runs 100 \ + 'sudo {{ cwd }}/youki create -b tutorial a && sudo {{ cwd }}/youki start a && sudo {{ cwd }}/youki delete -f a' + # run linting on project lint: cargo fmt --all -- --check