diff --git a/crates/libcontainer/src/container/builder_impl.rs b/crates/libcontainer/src/container/builder_impl.rs index ed2cd07dd..0a9e43f52 100644 --- a/crates/libcontainer/src/container/builder_impl.rs +++ b/crates/libcontainer/src/container/builder_impl.rs @@ -49,6 +49,8 @@ pub(super) struct ContainerBuilderImpl { pub detached: bool, /// Default executes the specified execution of a generic command pub executor: Box, + /// If do not use pivot root to jail process inside rootfs + pub no_pivot: bool, } impl ContainerBuilderImpl { @@ -154,6 +156,7 @@ impl ContainerBuilderImpl { cgroup_config, detached: self.detached, executor: self.executor.clone(), + no_pivot: self.no_pivot, }; let (init_pid, need_to_clean_up_intel_rdt_dir) = diff --git a/crates/libcontainer/src/container/init_builder.rs b/crates/libcontainer/src/container/init_builder.rs index 2230acc60..4ac2104de 100644 --- a/crates/libcontainer/src/container/init_builder.rs +++ b/crates/libcontainer/src/container/init_builder.rs @@ -20,6 +20,7 @@ pub struct InitContainerBuilder { bundle: PathBuf, use_systemd: bool, detached: bool, + no_pivot: bool, } impl InitContainerBuilder { @@ -31,6 +32,7 @@ impl InitContainerBuilder { bundle, use_systemd: true, detached: true, + no_pivot: false, } } @@ -45,6 +47,11 @@ impl InitContainerBuilder { self } + pub fn with_no_pivot(mut self, no_pivot: bool) -> Self { + self.no_pivot = no_pivot; + self + } + /// Creates a new container pub fn build(self) -> Result { let spec = self.load_spec()?; @@ -95,6 +102,7 @@ impl InitContainerBuilder { preserve_fds: self.base.preserve_fds, detached: self.detached, executor: self.base.executor, + no_pivot: self.no_pivot, }; builder_impl.create()?; diff --git a/crates/libcontainer/src/container/tenant_builder.rs b/crates/libcontainer/src/container/tenant_builder.rs index 1ebddd76b..e54a22cca 100644 --- a/crates/libcontainer/src/container/tenant_builder.rs +++ b/crates/libcontainer/src/container/tenant_builder.rs @@ -142,6 +142,7 @@ impl TenantContainerBuilder { preserve_fds: self.base.preserve_fds, detached: self.detached, executor: self.base.executor, + no_pivot: false, }; let pid = builder_impl.create()?; diff --git a/crates/libcontainer/src/process/args.rs b/crates/libcontainer/src/process/args.rs index a4451dd85..1c7d0c395 100644 --- a/crates/libcontainer/src/process/args.rs +++ b/crates/libcontainer/src/process/args.rs @@ -42,4 +42,6 @@ pub struct ContainerArgs { pub detached: bool, /// Manage the functions that actually run on the container pub executor: Box, + /// If do not use pivot root to jail process inside rootfs + pub no_pivot: bool, } diff --git a/crates/libcontainer/src/process/container_init_process.rs b/crates/libcontainer/src/process/container_init_process.rs index 64182c1c7..8d30b74b4 100644 --- a/crates/libcontainer/src/process/container_init_process.rs +++ b/crates/libcontainer/src/process/container_init_process.rs @@ -4,7 +4,7 @@ use std::path::{Path, PathBuf}; use std::{env, fs, mem}; use nc; -use nix::mount::MsFlags; +use nix::mount::{MntFlags, MsFlags}; use nix::sched::CloneFlags; use nix::sys::stat::Mode; use nix::unistd::{self, setsid, Gid, Uid}; @@ -270,6 +270,76 @@ fn reopen_dev_null() -> Result<()> { Ok(()) } +// umount or hide the target path. If the target path is mounted +// try to unmount it first if the unmount operation fails with EINVAL +// then mount a tmpfs with size 0k to hide the target path. +fn unmount_or_hide(syscall: &dyn Syscall, target: impl AsRef) -> Result<()> { + let target_path = target.as_ref(); + match syscall.umount2(target_path, MntFlags::MNT_DETACH) { + Ok(_) => Ok(()), + Err(SyscallError::Nix(nix::errno::Errno::EINVAL)) => syscall + .mount( + None, + target_path, + Some("tmpfs"), + MsFlags::MS_RDONLY, + Some("size=0k"), + ) + .map_err(InitProcessError::SyscallOther), + Err(err) => Err(InitProcessError::SyscallOther(err)), + } +} + +fn move_root(syscall: &dyn Syscall, rootfs: &Path) -> Result<()> { + unistd::chdir(rootfs).map_err(InitProcessError::NixOther)?; + // umount /sys and /proc if they are mounted, the purpose is to + // unmount or hide the /sys and /proc filesystems before the process changes its + // root to the new rootfs. thus ensure that the /sys and /proc filesystems are not + // accessible in the new rootfs. the logic is borrowed from crun + // https://github.com/containers/crun/blob/53cd1c1c697d7351d0cad23708d29bf4a7980a3a/src/libcrun/linux.c#L2780 + unmount_or_hide(syscall, "/sys")?; + unmount_or_hide(syscall, "/proc")?; + syscall + .mount(Some(rootfs), Path::new("/"), None, MsFlags::MS_MOVE, None) + .map_err(|err| { + tracing::error!(?err, ?rootfs, "failed to mount ms_move"); + InitProcessError::SyscallOther(err) + })?; + + syscall.chroot(Path::new(".")).map_err(|err| { + tracing::error!(?err, ?rootfs, "failed to chroot"); + InitProcessError::SyscallOther(err) + })?; + + unistd::chdir("/").map_err(InitProcessError::NixOther)?; + + Ok(()) +} + +fn do_pivot_root( + syscall: &dyn Syscall, + namespaces: &Namespaces, + no_pivot: bool, + rootfs: impl AsRef, +) -> Result<()> { + let rootfs_path = rootfs.as_ref(); + + let handle_error = |err: SyscallError, msg: &str| -> InitProcessError { + tracing::error!(?err, ?rootfs_path, msg); + InitProcessError::SyscallOther(err) + }; + + match namespaces.get(LinuxNamespaceType::Mount)? { + Some(_) if no_pivot => move_root(syscall, rootfs_path), + Some(_) => syscall + .pivot_rootfs(rootfs.as_ref()) + .map_err(|err| handle_error(err, "failed to pivot root")), + None => syscall + .chroot(rootfs_path) + .map_err(|err| handle_error(err, "failed to chroot")), + } +} + // Some variables are unused in the case where libseccomp feature is not enabled. #[allow(unused_variables)] pub fn container_init_process( @@ -343,18 +413,7 @@ pub fn container_init_process( // we use pivot_root, but if we are on the host mount namespace, we will // use simple chroot. Scary things will happen if you try to pivot_root // in the host mount namespace... - if namespaces.get(LinuxNamespaceType::Mount)?.is_some() { - // change the root of filesystem of the process to the rootfs - syscall.pivot_rootfs(rootfs_path).map_err(|err| { - tracing::error!(?err, ?rootfs_path, "failed to pivot root"); - InitProcessError::SyscallOther(err) - })?; - } else { - syscall.chroot(rootfs_path).map_err(|err| { - tracing::error!(?err, ?rootfs_path, "failed to chroot"); - InitProcessError::SyscallOther(err) - })?; - } + do_pivot_root(syscall.as_ref(), &namespaces, args.no_pivot, rootfs_path)?; // As we have changed the root mount, from here on // logs are no longer visible in journalctl diff --git a/crates/libcontainer/src/syscall/linux.rs b/crates/libcontainer/src/syscall/linux.rs index 9bc2f13de..ed68e104a 100644 --- a/crates/libcontainer/src/syscall/linux.rs +++ b/crates/libcontainer/src/syscall/linux.rs @@ -574,6 +574,11 @@ impl Syscall for LinuxSyscall { }?; Ok(()) } + + fn umount2(&self, target: &Path, flags: MntFlags) -> Result<()> { + umount2(target, flags)?; + Ok(()) + } } #[cfg(test)] diff --git a/crates/libcontainer/src/syscall/syscall.rs b/crates/libcontainer/src/syscall/syscall.rs index 6868a180e..e886aef7a 100644 --- a/crates/libcontainer/src/syscall/syscall.rs +++ b/crates/libcontainer/src/syscall/syscall.rs @@ -8,7 +8,7 @@ use std::sync::Arc; use caps::{CapSet, CapsHashSet}; use libc; -use nix::mount::MsFlags; +use nix::mount::{MntFlags, MsFlags}; use nix::sched::CloneFlags; use nix::sys::stat::{Mode, SFlag}; use nix::unistd::{Gid, Uid}; @@ -54,6 +54,7 @@ pub trait Syscall { size: libc::size_t, ) -> Result<()>; fn set_io_priority(&self, class: i64, priority: i64) -> Result<()>; + fn umount2(&self, target: &Path, flags: MntFlags) -> Result<()>; } #[derive(Clone, Copy)] diff --git a/crates/libcontainer/src/syscall/test.rs b/crates/libcontainer/src/syscall/test.rs index 4b5cc0d44..6e2e01977 100644 --- a/crates/libcontainer/src/syscall/test.rs +++ b/crates/libcontainer/src/syscall/test.rs @@ -6,7 +6,7 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use caps::{CapSet, CapsHashSet}; -use nix::mount::MsFlags; +use nix::mount::{MntFlags, MsFlags}; use nix::sched::CloneFlags; use nix::sys::stat::{Mode, SFlag}; use nix::unistd::{Gid, Uid}; @@ -44,6 +44,12 @@ pub struct IoPriorityArgs { pub priority: i64, } +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct UMount2Args { + pub target: PathBuf, + pub flags: MntFlags, +} + #[derive(Default)] struct Mock { values: Vec>, @@ -64,6 +70,7 @@ pub enum ArgName { Groups, Capability, IoPriority, + UMount2, } impl ArgName { @@ -259,6 +266,16 @@ impl Syscall for TestHelperSyscall { Box::new(IoPriorityArgs { class, priority }), ) } + + fn umount2(&self, target: &Path, flags: MntFlags) -> Result<()> { + self.mocks.act( + ArgName::UMount2, + Box::new(UMount2Args { + target: target.to_owned(), + flags, + }), + ) + } } impl TestHelperSyscall { @@ -369,4 +386,13 @@ impl TestHelperSyscall { .map(|x| x.downcast_ref::().unwrap().clone()) .collect::>() } + + pub fn get_umount_args(&self) -> Vec { + self.mocks + .fetch(ArgName::UMount2) + .values + .iter() + .map(|x| x.downcast_ref::().unwrap().clone()) + .collect::>() + } } diff --git a/crates/youki/src/commands/create.rs b/crates/youki/src/commands/create.rs index dca591fb2..41eda151a 100644 --- a/crates/youki/src/commands/create.rs +++ b/crates/youki/src/commands/create.rs @@ -24,6 +24,7 @@ pub fn create(args: Create, root_path: PathBuf, systemd_cgroup: bool) -> Result< .as_init(&args.bundle) .with_systemd(systemd_cgroup) .with_detach(true) + .with_no_pivot(args.no_pivot) .build()?; Ok(()) diff --git a/crates/youki/src/commands/run.rs b/crates/youki/src/commands/run.rs index f297903f9..2f6d1812e 100644 --- a/crates/youki/src/commands/run.rs +++ b/crates/youki/src/commands/run.rs @@ -22,6 +22,7 @@ pub fn run(args: Run, root_path: PathBuf, systemd_cgroup: bool) -> Result { .as_init(&args.bundle) .with_systemd(systemd_cgroup) .with_detach(args.detach) + .with_no_pivot(args.no_pivot) .build()?; container diff --git a/tests/contest/contest/src/main.rs b/tests/contest/contest/src/main.rs index 8049457c0..e0d3a4a02 100644 --- a/tests/contest/contest/src/main.rs +++ b/tests/contest/contest/src/main.rs @@ -19,6 +19,7 @@ use crate::tests::io_priority::get_io_priority_test; use crate::tests::lifecycle::{ContainerCreate, ContainerLifecycle}; use crate::tests::linux_ns_itype::get_ns_itype_tests; use crate::tests::mounts_recursive::get_mounts_recursive_test; +use crate::tests::no_pivot::get_no_pivot_test; use crate::tests::pidfile::get_pidfile_test; use crate::tests::readonly_paths::get_ro_paths_test; use crate::tests::scheduler::get_scheduler_test; @@ -113,6 +114,7 @@ fn main() -> Result<()> { let scheduler = get_scheduler_test(); let io_priority_test = get_io_priority_test(); let devices = get_devices_test(); + let no_pivot = get_no_pivot_test(); tm.add_test_group(Box::new(cl)); tm.add_test_group(Box::new(cc)); @@ -136,6 +138,7 @@ fn main() -> Result<()> { tm.add_test_group(Box::new(sysctl)); tm.add_test_group(Box::new(scheduler)); tm.add_test_group(Box::new(devices)); + tm.add_test_group(Box::new(no_pivot)); tm.add_test_group(Box::new(io_priority_test)); tm.add_cleanup(Box::new(cgroups::cleanup_v1)); diff --git a/tests/contest/contest/src/tests/mod.rs b/tests/contest/contest/src/tests/mod.rs index 1fee606b1..7a742d384 100644 --- a/tests/contest/contest/src/tests/mod.rs +++ b/tests/contest/contest/src/tests/mod.rs @@ -9,6 +9,7 @@ pub mod io_priority; pub mod lifecycle; pub mod linux_ns_itype; pub mod mounts_recursive; +pub mod no_pivot; pub mod pidfile; pub mod readonly_paths; pub mod scheduler; diff --git a/tests/contest/contest/src/tests/no_pivot/mod.rs b/tests/contest/contest/src/tests/no_pivot/mod.rs new file mode 100644 index 000000000..8540a058a --- /dev/null +++ b/tests/contest/contest/src/tests/no_pivot/mod.rs @@ -0,0 +1,29 @@ +use anyhow::{Context, Result}; +use oci_spec::runtime::{ProcessBuilder, Spec, SpecBuilder}; +use test_framework::{test_result, Test, TestGroup, TestResult}; + +use crate::utils::test_utils::test_inside_container_with_no_pivot; + +fn create_spec() -> Result { + SpecBuilder::default() + .process( + ProcessBuilder::default() + .args(vec!["runtimetest".to_string(), "no_pivot".to_string()]) + .build()?, + ) + .build() + .context("failed to create spec") +} + +fn no_pivot_test() -> TestResult { + let spec = test_result!(create_spec()); + test_inside_container_with_no_pivot(spec, &|_| Ok(())) +} + +pub fn get_no_pivot_test() -> TestGroup { + let mut test_group = TestGroup::new("no_pivot"); + let no_pivot_test = Test::new("no_pivot_test", Box::new(no_pivot_test)); + test_group.add(vec![Box::new(no_pivot_test)]); + + test_group +} diff --git a/tests/contest/contest/src/utils/test_utils.rs b/tests/contest/contest/src/utils/test_utils.rs index c72cd8f0d..ceb31262e 100644 --- a/tests/contest/contest/src/utils/test_utils.rs +++ b/tests/contest/contest/src/utils/test_utils.rs @@ -42,11 +42,9 @@ pub struct ContainerData { pub create_result: std::io::Result, } -/// Starts the runtime with given directory as root directory -pub fn create_container>(id: &str, dir: P) -> Result { - let res = Command::new(get_runtime_path()) - // set stdio so that we can get o/p of runtimetest - // in test_inside_container function +fn create_container_command>(id: &str, dir: P, with_pivot_root: bool) -> Command { + let mut command = Command::new(get_runtime_path()); + command .stdout(Stdio::piped()) .stderr(Stdio::piped()) .arg("--root") @@ -54,7 +52,23 @@ pub fn create_container>(id: &str, dir: P) -> Result { .arg("create") .arg(id) .arg("--bundle") - .arg(dir.as_ref().join("bundle")) + .arg(dir.as_ref().join("bundle")); + if with_pivot_root { + command.arg("--no-pivot"); + } + command +} + +/// Starts the runtime with given directory as root directory +pub fn create_container>(id: &str, dir: P) -> Result { + let res = create_container_command(id, dir, false) + .spawn() + .context("could not create container")?; + Ok(res) +} + +pub fn create_container_no_pivot>(id: &str, dir: P) -> Result { + let res = create_container_command(id, dir, true) .spawn() .context("could not create container")?; Ok(res) @@ -232,6 +246,100 @@ pub fn test_inside_container( TestResult::Passed } +// just copy-pasted from test_inside_container for now, but with no pivot root +// need to refactor this to avoid duplication +pub fn test_inside_container_with_no_pivot( + spec: Spec, + setup_for_test: &dyn Fn(&Path) -> Result<()>, +) -> TestResult { + let id = generate_uuid(); + let id_str = id.to_string(); + let bundle = prepare_bundle().unwrap(); + + // This will do the required setup for the test + test_result!(setup_for_test( + &bundle.as_ref().join("bundle").join("rootfs") + )); + + set_config(&bundle, &spec).unwrap(); + // as we have to run runtimetest inside the container, and is expects + // the config.json to be at path /config.json we save it there + let path = bundle + .as_ref() + .join("bundle") + .join("rootfs") + .join("config.json"); + spec.save(path).unwrap(); + + let runtimetest_path = get_runtimetest_path(); + // The config will directly use runtime as the command to be run, so we have to + // save the runtimetest binary at its /bin + std::fs::copy( + runtimetest_path, + bundle + .as_ref() + .join("bundle") + .join("rootfs") + .join("bin") + .join("runtimetest"), + ) + .unwrap(); + let create_process = create_container_no_pivot(&id_str, &bundle).unwrap(); + // here we do not wait for the process by calling wait() as in the test_outside_container + // function because we need the output of the runtimetest. If we call wait, it will return + // and we won't have an easy way of getting the stdio of the runtimetest. + // Thus to make sure the container is created, we just wait for sometime, and + // assume that the create command was successful. If it wasn't we can catch that error + // in the start_container, as we can not start a non-created container anyways + std::thread::sleep(std::time::Duration::from_millis(1000)); + match start_container(&id_str, &bundle) + .unwrap() + .wait_with_output() + { + Ok(c) => c, + Err(e) => return TestResult::Failed(anyhow!("container start failed : {:?}", e)), + }; + + let create_output = create_process + .wait_with_output() + .context("getting output after starting the container failed") + .unwrap(); + + let stdout = String::from_utf8_lossy(&create_output.stdout); + if !stdout.is_empty() { + println!( + "{:?}", + anyhow!("container stdout was not empty, found : {}", stdout) + ) + } + let stderr = String::from_utf8_lossy(&create_output.stderr); + if !stderr.is_empty() { + return TestResult::Failed(anyhow!( + "container stderr was not empty, found : {}", + stderr + )); + } + + let (out, err) = get_state(&id_str, &bundle).unwrap(); + if !err.is_empty() { + return TestResult::Failed(anyhow!( + "error in getting state after starting the container : {}", + err + )); + } + + let state: State = match serde_json::from_str(&out) { + Ok(v) => v, + Err(e) => return TestResult::Failed(anyhow!("error in parsing state of container after start in test_inside_container : stdout : {}, parse error : {}",out,e)), + }; + if state.status != "stopped" { + return TestResult::Failed(anyhow!("error : unexpected container status in test_inside_runtime : expected stopped, got {}, container state : {:?}",state.status,state)); + } + kill_container(&id_str, &bundle).unwrap().wait().unwrap(); + delete_container(&id_str, &bundle).unwrap().wait().unwrap(); + TestResult::Passed +} + pub fn check_container_created(data: &ContainerData) -> Result<()> { match &data.create_result { Ok(exit_status) => { diff --git a/tests/contest/runtimetest/src/main.rs b/tests/contest/runtimetest/src/main.rs index 95780bd48..486495a8b 100644 --- a/tests/contest/runtimetest/src/main.rs +++ b/tests/contest/runtimetest/src/main.rs @@ -44,6 +44,7 @@ fn main() { "io_priority_class_be" => tests::test_io_priority_class(&spec, IoprioClassBe), "io_priority_class_idle" => tests::test_io_priority_class(&spec, IoprioClassIdle), "devices" => tests::validate_devices(&spec), + "no_pivot" => tests::validate_rootfs(), _ => eprintln!("error due to unexpected execute test name: {execute_test}"), } } diff --git a/tests/contest/runtimetest/src/tests.rs b/tests/contest/runtimetest/src/tests.rs index 40f5ad29c..dec34dee3 100644 --- a/tests/contest/runtimetest/src/tests.rs +++ b/tests/contest/runtimetest/src/tests.rs @@ -545,3 +545,37 @@ pub fn test_io_priority_class(spec: &Spec, io_priority_class: IOPriorityClass) { eprintln!("error ioprio_get expected priority {expected_priority:?}, got {priority}") } } + +// the validate_rootfs function is used to validate the rootfs of the container is +// as expected. This function is used in the no_pivot test to validate the rootfs +pub fn validate_rootfs() { + // list the first level directories in the rootfs + let mut entries = fs::read_dir("/") + .unwrap() + .filter_map(|entry| { + entry.ok().and_then(|e| { + let path = e.path(); + if path.is_dir() { + path.file_name() + .and_then(|name| name.to_str().map(|s| s.to_owned())) + } else { + None + } + }) + }) + .collect::>(); + // sort the entries to make the test deterministic + entries.sort(); + + // this is the list of directories that we expect to find in the rootfs + let mut expected = vec![ + "bin", "dev", "etc", "home", "proc", "root", "sys", "tmp", "usr", "var", + ]; + // sort the expected entries to make the test deterministic + expected.sort(); + + // compare the expected entries with the actual entries + if entries != expected { + eprintln!("error due to rootfs want {expected:?}, got {entries:?}"); + } +}