diff --git a/Cargo.lock b/Cargo.lock index 75c72540f..0f898c4bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -43,7 +43,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "once_cell", "version_check", ] @@ -191,7 +191,7 @@ checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12" dependencies = [ "addr2line 0.20.0", "cc", - "cfg-if 1.0.0", + "cfg-if", "libc", "miniz_oxide", "object 0.31.1", @@ -407,12 +407,6 @@ dependencies = [ "nom", ] -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - [[package]] name = "cfg-if" version = "1.0.0" @@ -495,16 +489,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "clone3" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee4e061ea30800291ca09663878f3953840a69b08ce244b3e8b26e894d9f60f" -dependencies = [ - "bitflags 1.3.2", - "uapi", -] - [[package]] name = "cmake" version = "0.1.50" @@ -565,7 +549,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9847f90f32a50b0dcbd68bc23ff242798b13080b97b0569f6ed96a45ce4cf2cd" dependencies = [ "autocfg", - "cfg-if 1.0.0", + "cfg-if", "libc", "scopeguard", "windows-sys 0.33.0", @@ -577,7 +561,7 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -786,7 +770,7 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -801,7 +785,7 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-channel", "crossbeam-deque", "crossbeam-epoch", @@ -815,7 +799,7 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", ] @@ -825,7 +809,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] @@ -837,7 +821,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" dependencies = [ "autocfg", - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", "memoffset 0.9.0", "scopeguard", @@ -849,7 +833,7 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", ] @@ -859,7 +843,7 @@ version = "0.8.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -947,7 +931,7 @@ version = "5.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "hashbrown 0.14.0", "lock_api", "once_cell", @@ -1038,7 +1022,7 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "339ee130d97a610ea5a5872d2bbb130fdf68884ff09d3028b81bec8a1ac23bbc" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "dirs-sys-next", ] @@ -1097,7 +1081,7 @@ version = "0.8.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -1209,7 +1193,7 @@ version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b0377f1edc77dbd1118507bc7a66e4ab64d2b90c66f90726dc801e73a8c68f9" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "rustix 0.38.1", "windows-sys 0.48.0", ] @@ -1230,7 +1214,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "redox_syscall 0.2.16", "windows-sys 0.48.0", @@ -1448,7 +1432,7 @@ version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "js-sys", "libc", "wasi", @@ -1965,7 +1949,6 @@ dependencies = [ "bitflags 2.3.3", "caps", "chrono", - "clone3", "fastrand", "futures", "libc", @@ -2004,7 +1987,7 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "winapi", ] @@ -2204,7 +2187,7 @@ version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c84490118f2ee2d74570d114f3d0493cbf02790df303d2707606c3e14e07c96" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "downcast", "fragile", "lazy_static", @@ -2219,7 +2202,7 @@ version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22ce75669015c4f47b289fd4d4f56e894e4c96003ffdf3ac51313126f94c6cbb" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "proc-macro2", "quote", "syn 1.0.109", @@ -2256,7 +2239,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags 1.3.2", - "cfg-if 1.0.0", + "cfg-if", "libc", "memoffset 0.7.1", "pin-utils", @@ -2403,7 +2386,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" dependencies = [ "bitflags 1.3.2", - "cfg-if 1.0.0", + "cfg-if", "foreign-types", "libc", "once_cell", @@ -2462,7 +2445,7 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "redox_syscall 0.3.5", "smallvec", @@ -3478,7 +3461,7 @@ version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "cpufeatures", "digest", ] @@ -3736,7 +3719,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5486094ee78b2e5038a6382ed7645bc084dc2ec433426ca4c3cb61e2007b8998" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "fastrand", "redox_syscall 0.3.5", "rustix 0.38.1", @@ -3811,7 +3794,7 @@ version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "once_cell", ] @@ -4016,7 +3999,7 @@ version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -4109,32 +4092,6 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" -[[package]] -name = "uapi" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "019450240401d342e2a5bc47f7fbaeb002a38fe18197b83788750d7ffb143274" -dependencies = [ - "cc", - "cfg-if 0.1.10", - "libc", - "uapi-proc", -] - -[[package]] -name = "uapi-proc" -version = "0.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54de46f980cea7b2ae8d8f7f9f1c35cf7062c68343e99345ef73758f8e60975a" -dependencies = [ - "lazy_static", - "libc", - "proc-macro2", - "quote", - "regex", - "syn 1.0.109", -] - [[package]] name = "unicase" version = "2.6.0" @@ -4479,7 +4436,7 @@ version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "wasm-bindgen-macro", ] @@ -4527,7 +4484,7 @@ version = "0.4.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "js-sys", "wasm-bindgen", "web-sys", @@ -4645,7 +4602,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea790bcdfb4e6e9d1e5ddf75b4699aac62b078fcc9f27f44e1748165ceea67bf" dependencies = [ "bytes", - "cfg-if 1.0.0", + "cfg-if", "derivative", "indexmap 1.9.3", "js-sys", @@ -4673,7 +4630,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f093937725e242e5529fed27e08ff836c011a9ecc22e6819fb818c2ac6ff5f88" dependencies = [ "backtrace", - "cfg-if 1.0.0", + "cfg-if", "enum-iterator", "enumset", "lazy_static", @@ -4762,7 +4719,7 @@ checksum = "a1e000c2cbd4f9805427af5f3b3446574caf89ab3a1e66c2f3579fbde22b072b" dependencies = [ "backtrace", "cc", - "cfg-if 1.0.0", + "cfg-if", "corosensei", "dashmap", "derivative", @@ -4791,7 +4748,7 @@ dependencies = [ "async-trait", "bincode", "bytes", - "cfg-if 1.0.0", + "cfg-if", "cooked-waker", "dashmap", "derivative", @@ -4846,7 +4803,7 @@ dependencies = [ "anyhow", "bitflags 1.3.2", "byteorder", - "cfg-if 1.0.0", + "cfg-if", "num_enum", "serde", "time 0.2.27", @@ -4910,7 +4867,7 @@ dependencies = [ "async-trait", "bincode", "bumpalo", - "cfg-if 1.0.0", + "cfg-if", "encoding_rs", "fxprof-processed-profile", "indexmap 1.9.3", @@ -4944,7 +4901,7 @@ version = "10.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "284466ef356ce2d909bc0ad470b60c4d0df5df2de9084457e118131b3c779b92" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -5056,7 +5013,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14309cbdf2c395258b124a24757c727403070c0465a28bcc780c4f82f4bca5ff" dependencies = [ "cc", - "cfg-if 1.0.0", + "cfg-if", "rustix 0.37.19", "wasmtime-asm-macros", "windows-sys 0.48.0", @@ -5071,7 +5028,7 @@ dependencies = [ "addr2line 0.19.0", "anyhow", "bincode", - "cfg-if 1.0.0", + "cfg-if", "cpp_demangle", "gimli 0.27.2", "ittapi", @@ -5105,7 +5062,7 @@ version = "10.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b49ceb7e2105a8ebe5614d7bbab6f6ef137a284e371633af60b34925493081f" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "windows-sys 0.48.0", ] @@ -5118,7 +5075,7 @@ checksum = "3a5de4762421b0b2b19e02111ca403632852b53e506e03b4b227ffb0fbfa63c2" dependencies = [ "anyhow", "cc", - "cfg-if 1.0.0", + "cfg-if", "encoding_rs", "indexmap 1.9.3", "libc", diff --git a/crates/libcontainer/Cargo.toml b/crates/libcontainer/Cargo.toml index 3f701c5ca..d1d95fea8 100644 --- a/crates/libcontainer/Cargo.toml +++ b/crates/libcontainer/Cargo.toml @@ -36,7 +36,6 @@ libseccomp = { version = "0.3.0", optional=true } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" rust-criu = "0.4.0" -clone3 = "0.2.3" regex = "1.9.1" thiserror = "1.0.43" tracing = { version = "0.1.37", features = ["attributes"]} diff --git a/crates/libcontainer/src/process/container_intermediate_process.rs b/crates/libcontainer/src/process/container_intermediate_process.rs index 1d8b8918d..1d02637a5 100644 --- a/crates/libcontainer/src/process/container_intermediate_process.rs +++ b/crates/libcontainer/src/process/container_intermediate_process.rs @@ -8,6 +8,7 @@ use procfs::process::Process; use super::args::{ContainerArgs, ContainerType}; use super::container_init_process::container_init_process; +use super::fork::CloneCb; #[derive(Debug, thiserror::Error)] pub enum IntermediateProcessError { @@ -36,7 +37,7 @@ pub fn container_intermediate_process( intermediate_chan: &mut (channel::IntermediateSender, channel::IntermediateReceiver), init_chan: &mut (channel::InitSender, channel::InitReceiver), main_sender: &mut channel::MainSender, -) -> Result { +) -> Result<()> { let (inter_sender, inter_receiver) = intermediate_chan; let (init_sender, init_receiver) = init_chan; let command = args.syscall.create_syscall(); @@ -109,6 +110,51 @@ pub fn container_intermediate_process( namespaces.unshare_or_setns(pid_namespace)?; } + let cb: CloneCb = { + let args = args.clone(); + let init_sender = init_sender.clone(); + let inter_sender = inter_sender.clone(); + let mut main_sender = main_sender.clone(); + let mut init_receiver = init_receiver.clone(); + Box::new(move || { + if let Err(ret) = prctl::set_name("youki:[2:INIT]") { + tracing::error!(?ret, "failed to set name for child process"); + return ret; + } + + // We are inside the forked process here. The first thing we have to do + // is to close any unused senders, since fork will make a dup for all + // the socket. + if let Err(err) = init_sender.close() { + tracing::error!(?err, "failed to close receiver in init process"); + return -1; + } + if let Err(err) = inter_sender.close() { + tracing::error!(?err, "failed to close sender in the intermediate process"); + return -1; + } + match container_init_process(&args, &mut main_sender, &mut init_receiver) { + Ok(_) => 0, + Err(e) => { + if let ContainerType::TenantContainer { exec_notify_fd } = args.container_type { + let buf = format!("{e}"); + if let Err(err) = write(exec_notify_fd, buf.as_bytes()) { + tracing::error!(?err, "failed to write to exec notify fd"); + return -1; + } + if let Err(err) = close(exec_notify_fd) { + tracing::error!(?err, "failed to close exec notify fd"); + return -1; + } + } + + tracing::error!("failed to initialize container process: {e}"); + -1 + } + } + }) + }; + // We have to record the pid of the init process. The init process will be // inside the pid namespace, so we can't rely on the init process to send us // the correct pid. We also want to clone the init process as a sibling @@ -117,41 +163,7 @@ pub fn container_intermediate_process( // configuration. The youki main process can decide what to do with the init // process and the intermediate process can just exit safely after the job // is done. - let pid = fork::container_clone_sibling("youki:[2:INIT]", || { - // We are inside the forked process here. The first thing we have to do - // is to close any unused senders, since fork will make a dup for all - // the socket. - init_sender.close().map_err(|err| { - tracing::error!("failed to close receiver in init process: {}", err); - IntermediateProcessError::Channel(err) - })?; - inter_sender.close().map_err(|err| { - tracing::error!( - "failed to close sender in the intermediate process: {}", - err - ); - IntermediateProcessError::Channel(err) - })?; - match container_init_process(args, main_sender, init_receiver) { - Ok(_) => Ok(0), - Err(e) => { - if let ContainerType::TenantContainer { exec_notify_fd } = args.container_type { - let buf = format!("{e}"); - write(exec_notify_fd, buf.as_bytes()).map_err(|err| { - tracing::error!("failed to write to exec notify fd: {}", err); - IntermediateProcessError::ExecNotify(err) - })?; - close(exec_notify_fd).map_err(|err| { - tracing::error!("failed to close exec notify fd: {}", err); - IntermediateProcessError::ExecNotify(err) - })?; - } - tracing::error!("failed to initialize container process: {e}"); - Err(e.into()) - } - } - }) - .map_err(|err| { + let pid = fork::container_clone_sibling(cb).map_err(|err| { tracing::error!("failed to fork init process: {}", err); IntermediateProcessError::InitProcess(err) })?; @@ -185,7 +197,8 @@ pub fn container_intermediate_process( tracing::error!("failed to close unused init sender: {}", err); err })?; - Ok(pid) + + Ok(()) } fn apply_cgroups< diff --git a/crates/libcontainer/src/process/container_main_process.rs b/crates/libcontainer/src/process/container_main_process.rs index f3024caf6..38f7c0d60 100644 --- a/crates/libcontainer/src/process/container_main_process.rs +++ b/crates/libcontainer/src/process/container_main_process.rs @@ -1,6 +1,8 @@ use crate::{ process::{ - args::ContainerArgs, channel, container_intermediate_process, fork, + args::ContainerArgs, + channel, container_intermediate_process, + fork::{self, CloneCb}, intel_rdt::setup_intel_rdt, }, rootless::Rootless, @@ -37,33 +39,52 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo // cloned process, we have to be deligent about closing any unused channel. // At minimum, we have to close down any unused senders. The corresponding // receivers will be cleaned up once the senders are closed down. - let (main_sender, main_receiver) = &mut channel::main_channel()?; - let inter_chan = &mut channel::intermediate_channel()?; - let init_chan = &mut channel::init_channel()?; + let (main_sender, mut main_receiver) = channel::main_channel()?; + let inter_chan = channel::intermediate_channel()?; + let init_chan = channel::init_channel()?; - let intermediate_pid = fork::container_fork("youki:[1:INTER]", || { - container_intermediate_process::container_intermediate_process( - container_args, - inter_chan, - init_chan, - main_sender, - )?; + let cb: CloneCb = { + let container_args = container_args.clone(); + let mut main_sender = main_sender.clone(); + let mut inter_chan = inter_chan.clone(); + let mut init_chan = init_chan.clone(); + Box::new(move || { + if let Err(ret) = prctl::set_name("youki:[1:INTER]") { + tracing::error!(?ret, "failed to set name for child process"); + return ret; + } + + match container_intermediate_process::container_intermediate_process( + &container_args, + &mut inter_chan, + &mut init_chan, + &mut main_sender, + ) { + Ok(_) => 0, + Err(err) => { + tracing::error!(?err, "failed to run intermediate process"); + -1 + } + } + }) + }; - Ok(0) - }) - .map_err(|err| { + let intermediate_pid = fork::container_clone(cb).map_err(|err| { tracing::error!("failed to fork intermediate process: {}", err); ProcessError::IntermediateProcessFailed(err) })?; // Close down unused fds. The corresponding fds are duplicated to the - // child process during fork. + // child process during clone. main_sender.close().map_err(|err| { tracing::error!("failed to close unused sender: {}", err); err })?; - let (inter_sender, inter_receiver) = inter_chan; + let (mut inter_sender, inter_receiver) = inter_chan; + #[cfg(feature = "libseccomp")] + let (mut init_sender, init_receiver) = init_chan; + #[cfg(not(feature = "libseccomp"))] let (init_sender, init_receiver) = init_chan; // If creating a rootless container, the intermediate process will ask @@ -106,8 +127,8 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bo crate::process::seccomp_listener::sync_seccomp( seccomp, &state, - init_sender, - main_receiver, + &mut init_sender, + &mut main_receiver, )?; } diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index a3557a531..114a22440 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -1,98 +1,242 @@ +use std::{ffi::c_int, num::NonZeroUsize}; + use libc::SIGCHLD; -use nix::unistd::Pid; -use prctl; +use nix::{ + sys::{mman, resource}, + unistd::Pid, +}; #[derive(Debug, thiserror::Error)] pub enum CloneError { - #[error("failed to clone process using clone3")] + #[error("failed to clone process")] Clone(#[source] nix::Error), + #[error("failed to get system memory page size")] + PageSize(#[source] nix::Error), + #[error("failed to get resource limit")] + ResourceLimit(#[source] nix::Error), + #[error("the stack size is zero")] + ZeroStackSize, + #[error("failed to allocate stack")] + StackAllocation(#[source] nix::Error), + #[error("failed to create stack guard page")] + GuardPage(#[source] nix::Error), + #[error("unknown error code {0}")] + UnknownErrno(i32), } -type Result = std::result::Result; - -#[derive(Debug, thiserror::Error)] -pub enum CallbackError { - #[error(transparent)] - IntermediateProcess( - #[from] crate::process::container_intermediate_process::IntermediateProcessError, - ), - #[error(transparent)] - InitProcess(#[from] crate::process::container_init_process::InitProcessError), - // Need a fake error for testing - #[error("unknown")] - #[cfg(test)] - Test, -} - -type CallbackResult = std::result::Result; +/// The callback function used in clone system call. The return value is i32 +/// which is consistent with C functions return code. The trait has to be +/// `FnMut` because we need to be able to call the closure multiple times, once +/// in clone3 and once in clone if fallback is required. The closure is boxed +/// because we need to store the closure on heap, not stack in the case of +/// `clone`. Unlike `fork` or `clone3`, the `clone` glibc wrapper requires us to +/// pass in a child stack, which is empty. By storing the closure in heap, we +/// can then in the new process to re-box the heap memory back to a closure +/// correctly. +pub type CloneCb = Box i32>; -// Fork/Clone a sibling process that shares the same parent as the calling +// Clone a sibling process that shares the same parent as the calling // process. This is used to launch the container init process so the parent // process of the calling process can receive ownership of the process. If we // clone a child process as the init process, the calling process (likely the // youki main process) will exit and the init process will be re-parented to the // process 1 (system init process), which is not the right behavior of what we // look for. -pub fn container_clone_sibling CallbackResult>( - child_name: &str, - cb: F, -) -> Result { - let mut clone = clone3::Clone3::default(); +pub fn container_clone_sibling(cb: CloneCb) -> Result { // Note: normally, an exit signal is required, but when using // `CLONE_PARENT`, the `clone3` will return EINVAL if an exit signal is set. // The older `clone` will not return EINVAL in this case. Instead it ignores - // the exit signal bits in the glibc wrapper. - clone.flag_parent(); + // the exit signal bits in the glibc wrapper. Therefore, we explicitly set + // the exit_signal to None here, so this works for both version of clone. + clone_internal(cb, libc::CLONE_PARENT as u64, None) +} - container_clone(child_name, cb, clone) +// Clone a child process and execute the callback. +pub fn container_clone(cb: CloneCb) -> Result { + clone_internal(cb, 0, Some(SIGCHLD as u64)) +} + +// An internal wrapper to manage the clone3 vs clone fallback logic. +fn clone_internal( + mut cb: CloneCb, + flags: u64, + exit_signal: Option, +) -> Result { + match clone3(&mut cb, flags, exit_signal) { + Ok(pid) => Ok(pid), + // For now, we decide to only fallback on ENOSYS + Err(CloneError::Clone(nix::Error::ENOSYS)) => { + tracing::debug!("clone3 is not supported, fallback to clone"); + let pid = clone(cb, flags, exit_signal)?; + + Ok(pid) + } + Err(err) => Err(err), + } } -// A simple clone wrapper to clone3 so we can share this logic in different -// fork/clone situations. We decided to minimally support kernel version >= 5.4, -// and `clone3` requires only kernel version >= 5.3. Therefore, we don't need to -// fall back to `clone` or `fork`. -fn container_clone CallbackResult>( - child_name: &str, - cb: F, - mut clone_cmd: clone3::Clone3, -) -> Result { - // Return the child's pid in case of parent/calling process, and for the - // cloned process, run the callback function, and exit with the same exit - // code returned by the callback. If there was any error when trying to run - // callback, exit with -1 - match unsafe { - clone_cmd - .call() - .map_err(|err| CloneError::Clone(nix::errno::from_i32(err.0)))? - } { +// Unlike the clone call, clone3 is currently using the kernel syscall, mimicking +// the interface of fork. There is not need to explicitly manage the memory, so +// we can safely passing the callback closure as reference. +fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result { + #[repr(C)] + struct clone3_args { + flags: u64, + pidfd: u64, + child_tid: u64, + parent_tid: u64, + exit_signal: u64, + stack: u64, + stack_size: u64, + tls: u64, + set_tid: u64, + set_tid_size: u64, + cgroup: u64, + } + let mut args = clone3_args { + flags, + pidfd: 0, + child_tid: 0, + parent_tid: 0, + exit_signal: exit_signal.unwrap_or(0), + stack: 0, + stack_size: 0, + tls: 0, + set_tid: 0, + set_tid_size: 0, + cgroup: 0, + }; + let args_ptr = &mut args as *mut clone3_args; + let args_size = std::mem::size_of::(); + // For now, we can only use clone3 as a kernel syscall. Libc wrapper is not + // available yet. This can have undefined behavior because libc authors do + // not like people calling kernel syscall to directly create processes. Libc + // does perform additional bookkeeping when calling clone or fork. So far, + // we have not observed any issues with calling clone3 directly, but we + // should keep an eye on it. + match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } { + -1 => Err(CloneError::Clone(nix::Error::last())), 0 => { - prctl::set_name(child_name).expect("failed to set name"); - // Inside the cloned process - let ret = match cb() { - Err(error) => { - tracing::debug!("failed to run child process in clone: {:?}", error); - -1 - } - Ok(exit_code) => exit_code, - }; - std::process::exit(ret); + // Inside the cloned process, we execute the callback and exit with + // the return code. + std::process::exit(cb()); } - pid => Ok(Pid::from_raw(pid)), + ret if ret >= 0 => Ok(Pid::from_raw(ret as i32)), + ret => Err(CloneError::UnknownErrno(ret as i32)), } } -// Execute the cb in another process. Make the fork works more like thread_spawn -// or clone, so it is easier to reason. Compared to clone call, fork is easier -// to use since fork will magically take care of all the variable copying. If -// using clone, we would have to manually make sure all the variables are -// correctly send to the new process, especially Rust borrow checker will be a -// lot of hassel to deal with every details. -pub fn container_fork CallbackResult>(child_name: &str, cb: F) -> Result { - // Using `clone3` to mimic the effect of `fork`. - let mut clone = clone3::Clone3::default(); - clone.exit_signal(SIGCHLD as u64); - - container_clone(child_name, cb, clone) +fn clone(cb: CloneCb, flags: u64, exit_signal: Option) -> Result { + const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M + const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K + + // Use sysconf to find the page size. If there is an error, we assume + // the default 4K page size. + let page_size = nix::unistd::sysconf(nix::unistd::SysconfVar::PAGE_SIZE) + .map_err(CloneError::PageSize)? + .map(|size| size as usize) + .unwrap_or(DEFAULT_PAGE_SIZE); + + // Find out the default stack max size through getrlimit. + let (rlim_cur, _) = + resource::getrlimit(resource::Resource::RLIMIT_STACK).map_err(CloneError::ResourceLimit)?; + // mmap will return ENOMEM if stack size is unlimited when we create the + // child stack, so we need to set a reasonable default stack size. + let default_stack_size = if rlim_cur != u64::MAX { + rlim_cur as usize + } else { + tracing::debug!( + "stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)" + ); + DEFAULT_STACK_SIZE + }; + + // Using the clone syscall requires us to create the stack space for the + // child process instead of taken cared for us like fork call. We use mmap + // here to create the stack. Instead of guessing how much space the child + // process needs, we allocate through mmap to the system default limit, + // which is 8MB on most of the linux system today. This is OK since mmap + // will only reserve the address space upfront, instead of allocating + // physical memory upfront. The stack will grow as needed, up to the size + // reserved, so no wasted memory here. Lastly, the child stack only needs + // to support the container init process set up code in Youki. When Youki + // calls exec into the container payload, exec will reset the stack. Note, + // do not use MAP_GROWSDOWN since it is not well supported. + // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html + let child_stack = unsafe { + mman::mmap( + None, + NonZeroUsize::new(default_stack_size).ok_or(CloneError::ZeroStackSize)?, + mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE, + mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK, + -1, + 0, + ) + .map_err(CloneError::StackAllocation)? + }; + unsafe { + // Consistent with how pthread_create sets up the stack, we create a + // guard page of 1 page, to protect the child stack collision. Note, for + // clone call, the child stack will grow downward, so the bottom of the + // child stack is in the beginning. + mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE) + .map_err(CloneError::GuardPage)?; + }; + + // Since the child stack for clone grows downward, we need to pass in + // the top of the stack address. + let child_stack_top = unsafe { child_stack.add(default_stack_size) }; + + // Combine the clone flags with exit signals. + let combined_flags = (flags | exit_signal.unwrap_or(0)) as c_int; + + // We are passing the boxed closure "cb" into the clone function as the a + // function pointer in C. The box closure in Rust is both a function pointer + // and a struct. However, when casting the box closure into libc::c_void, + // the function pointer will be lost. Therefore, to work around the issue, + // we double box the closure. This is consistent with how std::unix::thread + // handles the closure. + // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs + let data = Box::into_raw(Box::new(cb)); + // The main is a wrapper function passed into clone call below. The "data" + // arg is actually a raw pointer to the Box closure. so here, we re-box the + // pointer back into a box closure so the main takes ownership of the + // memory. Then we can call the closure. + extern "C" fn main(data: *mut libc::c_void) -> libc::c_int { + unsafe { Box::from_raw(data as *mut CloneCb)() } + } + + // The nix::sched::clone wrapper doesn't provide the right interface. Using + // the clone syscall is one of the rare cases where we don't want rust to + // manage the child stack memory. Instead, we want to use c_void directly + // here. Therefore, here we are using libc::clone syscall directly for + // better control. The child stack will be cleaned when exec is called or + // the child process terminates. The nix wrapper also does not treat the + // closure memory correctly. The wrapper implementation fails to pass the + // right ownership to the new child process. + // Ref: https://github.com/nix-rust/nix/issues/919 + // Ref: https://github.com/nix-rust/nix/pull/920 + let ret = unsafe { + libc::clone( + main, + child_stack_top, + combined_flags, + data as *mut libc::c_void, + ) + }; + + // After the clone returns, the heap memory associated with the Box closure + // is duplicated in the cloned process. Therefore, we can safely re-box the + // closure from the raw pointer and let rust to continue managing the + // memory. We call drop here explicitly to avoid the warning that the + // closure is not used. This is correct since the closure is called in the + // cloned process, not the parent process. + unsafe { drop(Box::from_raw(data)) }; + match ret { + -1 => Err(CloneError::Clone(nix::Error::last())), + pid if ret > 0 => Ok(Pid::from_raw(pid)), + _ => unreachable!("clone returned a negative pid {ret}"), + } } #[cfg(test)] @@ -106,7 +250,7 @@ mod test { #[test] fn test_container_fork() -> Result<()> { - let pid = container_fork("test:child", || Ok(0))?; + let pid = container_clone(Box::new(|| 0))?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); @@ -119,7 +263,7 @@ mod test { #[test] fn test_container_err_fork() -> Result<()> { - let pid = container_fork("test:child", || Err(CallbackError::Test))?; + let pid = container_clone(Box::new(|| -1))?; match waitpid(pid, None).expect("wait pid failed.") { WaitStatus::Exited(p, status) => { assert_eq!(pid, p); @@ -171,7 +315,7 @@ mod test { unistd::ForkResult::Child => { // Inside the forked process. We call `container_clone` and pass // the pid to the parent process. - let pid = container_clone_sibling("test:child", || Ok(0))?; + let pid = container_clone_sibling(Box::new(|| 0))?; sender.send(pid.as_raw())?; sender.close()?; std::process::exit(0); @@ -180,4 +324,68 @@ mod test { Ok(()) } + + // This test depends on libseccomp to work. + #[cfg(feature = "libseccomp")] + #[test] + fn test_clone_fallback() -> Result<()> { + use crate::test_utils::TestCallbackError; + use oci_spec::runtime::{ + Arch, LinuxSeccompAction, LinuxSeccompBuilder, LinuxSyscallBuilder, + }; + + fn has_clone3() -> bool { + // We use the probe syscall to check if the kernel supports clone3 or + // seccomp has successfully blocked clone3. + let res = unsafe { libc::syscall(libc::SYS_clone3, 0, 0) }; + let err = (res == -1) + .then(std::io::Error::last_os_error) + .expect("probe syscall should not succeed"); + err.raw_os_error() != Some(libc::ENOSYS) + } + + // To test the fallback behavior, we will create a seccomp rule that + // blocks `clone3` as ENOSYS. + let syscall = LinuxSyscallBuilder::default() + .names(vec![String::from("clone3")]) + .action(LinuxSeccompAction::ScmpActErrno) + .errno_ret(libc::ENOSYS as u32) + .build()?; + let seccomp_profile = LinuxSeccompBuilder::default() + .default_action(LinuxSeccompAction::ScmpActAllow) + .architectures(vec![Arch::ScmpArchNative]) + .syscalls(vec![syscall]) + .build()?; + + crate::test_utils::test_in_child_process(|| { + // We use seccomp to block `clone3` + let _ = prctl::set_no_new_privileges(true); + crate::seccomp::initialize_seccomp(&seccomp_profile) + .expect("failed to initialize seccomp"); + + if has_clone3() { + return Err(TestCallbackError::Custom( + "clone3 is not blocked by seccomp".into(), + )); + } + + let pid = container_clone(Box::new(|| 0)).map_err(|err| err.to_string())?; + match waitpid(pid, None).expect("wait pid failed.") { + WaitStatus::Exited(p, status) => { + assert_eq!(pid, p); + assert_eq!(status, 0); + } + status => { + return Err(TestCallbackError::Custom(format!( + "failed to wait on child process: {:?}", + status + ))); + } + }; + + Ok(()) + })?; + + Ok(()) + } } diff --git a/justfile b/justfile index fc347b6bf..6bc22bdc1 100644 --- a/justfile +++ b/justfile @@ -99,6 +99,17 @@ clean-test-kind: hack-bpftrace: BPFTRACE_STRLEN=120 ./hack/debug.bt +# a hacky benchmark method we have been using casually to compare performance +hack-benchmark: + #!/usr/bin/env bash + set -euo pipefail + + hyperfine \ + --prepare 'sudo sync; echo 3 | sudo tee /proc/sys/vm/drop_caches' \ + --warmup 10 \ + --min-runs 100 \ + 'sudo {{ cwd }}/youki create -b tutorial a && sudo {{ cwd }}/youki start a && sudo {{ cwd }}/youki delete -f a' + # run linting on project lint: cargo fmt --all -- --check