From 4c4848230c8a0f79f24412ff80d3beedd62aab6b Mon Sep 17 00:00:00 2001 From: Christian Duerr Date: Tue, 26 Sep 2023 00:55:11 +0200 Subject: [PATCH] Add mount namespaces to linux sandbox This patch adds optional mount namespaces to the linux sandbox to allow for filesystem isolation on systems without landlock support. Filesystem isolation now requires either landlock OR namespace creation to be successful in order for the sandbox creation to be successful. Landlock will be layered on top of the mount namespace if both are available. While landlock automatically resolves symlink access, mount namespaces do not. So to allow access to `/usr/lib` through `/lib`, it is now necessary to allow both `/lib` AND `/usr/lib`. --- Cargo.toml | 5 + src/linux/mod.rs | 77 ++++++++++-- src/linux/namespaces.rs | 232 +++++++++++++++++++++++++++++++++-- tests/exec.rs | 2 + tests/fs_without_landlock.rs | 61 +++++++++ tests/full_sandbox.rs | 4 +- 6 files changed, 363 insertions(+), 18 deletions(-) create mode 100644 tests/fs_without_landlock.rs diff --git a/Cargo.toml b/Cargo.toml index 0256ba9..991da45 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,11 @@ name = "fs" path = "tests/fs.rs" harness = false +[[test]] +name = "fs_without_landlock" +path = "tests/fs_without_landlock.rs" +harness = false + [[test]] name = "full_env" path = "tests/full_env.rs" diff --git a/src/linux/mod.rs b/src/linux/mod.rs index e3b6bf1..5542836 100644 --- a/src/linux/mod.rs +++ b/src/linux/mod.rs @@ -3,6 +3,9 @@ //! This module implements sandboxing on Linux based on the Landlock LSM, //! namespaces, and seccomp. +use std::collections::HashMap; +use std::path::PathBuf; + use landlock::{ make_bitflags, Access, AccessFs, Compatible, PathBeneath, PathFd, Ruleset, RulesetAttr, RulesetCreated, RulesetCreatedAttr, RulesetStatus, ABI as LANDLOCK_ABI, @@ -20,12 +23,34 @@ const ABI: LANDLOCK_ABI = LANDLOCK_ABI::V1; /// Linux sandboxing. 
pub struct LinuxSandbox { + bind_mounts: HashMap<PathBuf, libc::c_ulong>, env_exceptions: Vec<String>, landlock: RulesetCreated, allow_networking: bool, full_env: bool, } +impl LinuxSandbox { + /// Add or modify a bind mount. + /// + /// This will add a new bind mount with the specified permission if it does + /// not exist already. + /// + /// If the bind mount already exists, it will *ADD* the additional + /// permissions. + fn update_bind_mount(&mut self, path: PathBuf, write: bool, execute: bool) { + let flags = self.bind_mounts.entry(path).or_insert(libc::MS_RDONLY | libc::MS_NOEXEC); + + if write { + *flags &= !libc::MS_RDONLY; + } + + if execute { + *flags &= !libc::MS_NOEXEC; + } + } +} + impl Sandbox for LinuxSandbox { fn new() -> Result<Self> { // Setup landlock filtering. @@ -35,14 +60,38 @@ impl Sandbox for LinuxSandbox { .create()?; landlock.as_mut().set_no_new_privs(true); - Ok(Self { landlock, env_exceptions: Vec::new(), allow_networking: false, full_env: false }) + Ok(Self { + landlock, + allow_networking: false, + full_env: false, + env_exceptions: Default::default(), + bind_mounts: Default::default(), + }) } fn add_exception(&mut self, exception: Exception) -> Result<&mut Self> { - let (path, access) = match exception { - Exception::Read(path) => (path, make_bitflags!(AccessFs::{ ReadFile | ReadDir })), - Exception::Write(path) => (path, AccessFs::from_write(ABI)), - Exception::ExecuteAndRead(path) => (path, AccessFs::from_read(ABI)), + let (path_fd, access) = match exception { + Exception::Read(path) => { + let path_fd = PathFd::new(&path)?; + + self.update_bind_mount(path, false, false); + + (path_fd, make_bitflags!(AccessFs::{ ReadFile | ReadDir })) + }, + Exception::Write(path) => { + let path_fd = PathFd::new(&path)?; + + self.update_bind_mount(path, true, false); + + (path_fd, AccessFs::from_write(ABI)) + }, + Exception::ExecuteAndRead(path) => { + let path_fd = PathFd::new(&path)?; + + self.update_bind_mount(path, false, true); + + (path_fd, AccessFs::from_read(ABI)) + },
Exception::Environment(key) => { self.env_exceptions.push(key); return Ok(self); @@ -57,7 +106,7 @@ impl Sandbox for LinuxSandbox { }, }; - let rule = PathBeneath::new(PathFd::new(path)?, access); + let rule = PathBeneath::new(path_fd, access); self.landlock.as_mut().add_rule(rule)?; @@ -71,18 +120,28 @@ impl Sandbox for LinuxSandbox { } // Setup namespaces. - let namespace_result = namespaces::create_namespaces(!self.allow_networking); + let namespace_result = + namespaces::create_namespaces(self.allow_networking, self.bind_mounts); // Setup seccomp network filter. if !self.allow_networking { let seccomp_result = NetworkFilter::apply(); // Propagate failure if neither seccomp nor namespaces could isolate networking. - namespace_result.or(seccomp_result)?; + if let (Err(_), Err(err)) = (&namespace_result, seccomp_result) { + return Err(err); + } } // Apply landlock rules. - let status = self.landlock.restrict_self()?; + let landlock_result = self.landlock.restrict_self(); + + // Ensure either landlock or namespaces are enforced. + let status = match (landlock_result, namespace_result) { + (Ok(status), _) => status, + (Err(_), Ok(_)) => return Ok(()), + (Err(err), _) => return Err(err.into()), + }; // Ensure all restrictions were properly applied. if status.no_new_privs && status.ruleset == RulesetStatus::FullyEnforced { diff --git a/src/linux/namespaces.rs b/src/linux/namespaces.rs index 7b6af6e..a670b74 100644 --- a/src/linux/namespaces.rs +++ b/src/linux/namespaces.rs @@ -1,44 +1,262 @@ //! Linux namespaces. -use std::fs; +use std::cmp::Ordering; +use std::collections::HashMap; +use std::ffi::{CStr, CString}; +use std::fs::{self, File}; use std::io::Error as IoError; +use std::os::unix::ffi::OsStrExt; +use std::path::{Component, Path, PathBuf}; +use std::{env, ptr}; use bitflags::bitflags; use crate::error::Result; +/// Path for mount namespace's new root. +const NEW_ROOT: &str = "/tmp/birdcage-root"; + +/// Old root mount point inside the new root. 
+/// +/// This should not conflict with any files or directories which might be +/// present in a root directory, since it will be placed in the new root. +/// +/// However this is only present within the new bind mount, so it will not be +/// persisted and cannot conflict with previous mount namespace sandboxes. +const OLD_ROOT_DIR: &str = "birdcage-old-root"; + /// Isolate process using Linux namespaces. /// /// If successful, this will always clear the abstract namespace. /// /// Additionally it will isolate network access if `deny_networking` is `true`. -pub fn create_namespaces(deny_networking: bool) -> Result<()> { +pub fn create_namespaces( + allow_networking: bool, + bind_mounts: HashMap<PathBuf, libc::c_ulong>, +) -> Result<()> { // Get EUID/EGID outside of the namespace. let uid = unsafe { libc::geteuid() }; let gid = unsafe { libc::getegid() }; // Setup the network namespace. - if deny_networking { - create_user_namespace(uid, gid, 0, 0, Namespaces::NETWORK)?; + if !allow_networking { + create_user_namespace(0, 0, Namespaces::NETWORK)?; } + // Isolate filesystem and procfs. + create_mount_namespace(bind_mounts)?; + // Drop root user mapping and ensure abstract namespace is cleared. - create_user_namespace(uid, gid, uid, gid, Namespaces::empty())?; + create_user_namespace(uid, gid, Namespaces::empty())?; + + Ok(()) +} + +/// Create a mount namespace to isolate filesystem access. +/// +/// This will deny access to any path which isn't part of `bind_mounts`. Allowed +/// paths are mounted according to their bind mount flags. +fn create_mount_namespace(bind_mounts: HashMap<PathBuf, libc::c_ulong>) -> Result<()> { + // Create mount namespace to allow creation of new mounts. + create_user_namespace(0, 0, Namespaces::MOUNT)?; + + // Get target paths for new and old root. + let new_root = PathBuf::from(NEW_ROOT); + let put_old = new_root.join(OLD_ROOT_DIR); + + // Ensure new root exists. + if !new_root.exists() { + fs::create_dir(&new_root)?; + } + + // Create C-friendly versions for our paths.
+ let new_root_c = CString::new(new_root.as_os_str().as_bytes()).unwrap(); + let put_old_c = CString::new(put_old.as_os_str().as_bytes()).unwrap(); + + // Create bind mount for new root to allow pivot. + bind_mount(&new_root_c, &new_root_c, 0)?; + + // Sort bind mounts by shortest length, to create parents before their children. + let mut bind_mounts = bind_mounts.into_iter().collect::<Vec<_>>(); + bind_mounts.sort_unstable_by(|(a_path, a_flags), (b_path, b_flags)| { + match a_path.components().count().cmp(&b_path.components().count()) { + Ordering::Equal => (a_path, a_flags).cmp(&(b_path, b_flags)), + ord => ord, + } + }); + + // Bind mount all allowed directories. + let current_dir = env::current_dir().ok(); + for (mut path, flags) in bind_mounts { + // Ensure all paths are absolute. + if path.is_relative() { + let current_dir = match &current_dir { + Some(current_dir) => current_dir, + // Ignore relative paths if we cannot access the working directory. + None => continue, + }; + path = current_dir.join(path); + } + + let src_c = CString::new(path.as_os_str().as_bytes()).unwrap(); + + // Get bind mount destination. + let unrooted_path = path.strip_prefix("/").unwrap(); + let dst = new_root.join(unrooted_path); + let dst_c = CString::new(dst.as_os_str().as_bytes()).unwrap(); + + // Create mount target. + copy_tree(path, &new_root)?; + + // Bind path with its configured mount flags. + bind_mount(&src_c, &dst_c, flags)?; + } + + // Bind mount old procfs. + let old_proc_c = CString::new("/proc").unwrap(); + let new_proc = new_root.join("proc"); + let new_proc_c = CString::new(new_proc.as_os_str().as_bytes()).unwrap(); + fs::create_dir_all(&new_proc)?; + bind_mount(&old_proc_c, &new_proc_c, 0).unwrap(); + + // Pivot root to `new_root`, placing the old root in `put_old`. + fs::create_dir_all(put_old)?; + pivot_root(&new_root_c, &put_old_c)?; + + // Remove the old root mount.
+ let new_old = PathBuf::from("/").join(OLD_ROOT_DIR); + let new_old_c = CString::new(new_old.as_os_str().as_bytes()).unwrap(); + umount(&new_old_c)?; + + // Prevent child mount namespaces from accessing this namespace's mounts. + deny_mount_propagation()?; + + Ok(()) +} + +/// Replicate a directory tree under a different directory. +/// +/// This will create all missing empty directories and copy their permissions +/// from the source tree. +/// +/// If `src` ends in a file, an empty file with matching permissions will be +/// created. +fn copy_tree(src: impl AsRef<Path>, dst: impl AsRef<Path>) -> Result<()> { + let mut dst = dst.as_ref().to_path_buf(); + let mut src_sub = PathBuf::new(); + let src = src.as_ref(); + + for component in src.components() { + // Append root only to source. + if component == Component::RootDir { + src_sub = src_sub.join(component); + continue; + } + + src_sub = src_sub.join(component); + dst = dst.join(component); + + // Skip directories that already exist. + if dst.exists() { + continue; + } + + // Create target file/directory. + let metadata = src_sub.metadata()?; + if metadata.is_file() { + File::create(&dst)?; + } else if metadata.is_dir() { + fs::create_dir(&dst)?; + } else { + unreachable!("metadata call failed to follow symlink"); + } + + // Copy permissions. + let permissions = metadata.permissions(); + fs::set_permissions(&dst, permissions)?; + } + + Ok(()) +} + +/// Create a new bind mount. +fn bind_mount(src: &CStr, dst: &CStr, flags: libc::c_ulong) -> Result<()> { + let flags = libc::MS_BIND | libc::MS_NOSUID | libc::MS_REC | flags; + let fstype = CString::new("").unwrap(); + let res = + unsafe { libc::mount(src.as_ptr(), dst.as_ptr(), fstype.as_ptr(), flags, ptr::null()) }; + + if res == 0 { + Ok(()) + } else { + Err(IoError::last_os_error().into()) + } +} + +/// Recursively update the root to deny mount propagation.
+fn deny_mount_propagation() -> Result<()> { + let flags = libc::MS_PRIVATE | libc::MS_REC; + let root = CString::new("/").unwrap(); + let fstype = CString::new("").unwrap(); + let res = + unsafe { libc::mount(root.as_ptr(), root.as_ptr(), fstype.as_ptr(), flags, ptr::null()) }; + + if res == 0 { + Ok(()) + } else { + Err(IoError::last_os_error().into()) + } +} + +/// Change root directory to `new_root` and mount the old root in `put_old`. +/// +/// The `put_old` directory must be at or underneath `new_root`. +fn pivot_root(new_root: &CStr, put_old: &CStr) -> Result<()> { + // Get target working directory path. + let working_dir = env::current_dir().unwrap_or_else(|_| PathBuf::from("/")); + + let result = + unsafe { libc::syscall(libc::SYS_pivot_root, new_root.as_ptr(), put_old.as_ptr()) }; + + if result != 0 { + return Err(IoError::last_os_error().into()); + } + + // Attempt to recover working directory, or switch to root. + // + // Without this, the user's working directory would stay the same, giving him + // full access to it even if it is not bound. + if env::set_current_dir(working_dir).is_err() { + env::set_current_dir("/")?; + } Ok(()) } +/// Unmount a filesystem. +fn umount(target: &CStr) -> Result<()> { + let result = unsafe { libc::umount2(target.as_ptr(), libc::MNT_DETACH) }; + + if result == 0 { + Ok(()) + } else { + Err(IoError::last_os_error().into()) + } +} + /// Create a new user namespace. /// /// The parent and child UIDs and GIDs define the user and group mappings /// between the parent namespace and the new user namespace. fn create_user_namespace( - parent_uid: u32, - parent_gid: u32, child_uid: u32, child_gid: u32, extra_namespaces: Namespaces, ) -> Result<()> { + // Get current user's EUID and EGID. + let parent_uid = unsafe { libc::geteuid() }; + let parent_gid = unsafe { libc::getegid() }; + // Create the namespace.
unshare(Namespaces::USER | extra_namespaces)?; diff --git a/tests/exec.rs b/tests/exec.rs index f7f67ff..cba4c8e 100644 --- a/tests/exec.rs +++ b/tests/exec.rs @@ -7,6 +7,8 @@ fn main() { let mut birdcage = Birdcage::new().unwrap(); birdcage.add_exception(Exception::ExecuteAndRead("/usr/bin/true".into())).unwrap(); birdcage.add_exception(Exception::ExecuteAndRead("/usr/lib".into())).unwrap(); + birdcage.add_exception(Exception::ExecuteAndRead("/lib64".into())).unwrap(); + birdcage.add_exception(Exception::ExecuteAndRead("/lib".into())).unwrap(); birdcage.lock().unwrap(); // Check for success when executing `true`. diff --git a/tests/fs_without_landlock.rs b/tests/fs_without_landlock.rs new file mode 100644 index 0000000..f6c2232 --- /dev/null +++ b/tests/fs_without_landlock.rs @@ -0,0 +1,61 @@ +#[cfg(target_os = "linux")] +use std::collections::BTreeMap; +#[cfg(target_os = "linux")] +use std::fs; + +#[cfg(target_os = "linux")] +use birdcage::{Birdcage, Exception, Sandbox}; +#[cfg(target_os = "linux")] +use seccompiler::{BpfProgram, SeccompAction, SeccompFilter, TargetArch}; +#[cfg(target_os = "linux")] +use tempfile::NamedTempFile; + +#[cfg(target_os = "linux")] +#[cfg(target_arch = "x86_64")] +const ARCH: TargetArch = TargetArch::x86_64; +#[cfg(target_os = "linux")] +#[cfg(target_arch = "aarch64")] +const ARCH: TargetArch = TargetArch::aarch64; + +fn main() { + const FILE_CONTENT: &str = "expected content"; + + // Create seccomp filter blocking `landlock_restrict_self` syscall. + let mut rules = BTreeMap::new(); + rules.insert(libc::SYS_landlock_restrict_self, Vec::new()); + let filter = SeccompFilter::new( + rules, + SeccompAction::Allow, + SeccompAction::Errno(libc::EACCES as u32), + ARCH, + ) + .unwrap(); + let program: BpfProgram = filter.try_into().unwrap(); + seccompiler::apply_filter(&program).unwrap(); + + // Setup our test files.
+ let private_path = NamedTempFile::new().unwrap(); + fs::write(&private_path, FILE_CONTENT.as_bytes()).unwrap(); + let public_path = NamedTempFile::new().unwrap(); + fs::write(&public_path, FILE_CONTENT.as_bytes()).unwrap(); + + // Activate our sandbox. + let mut birdcage = Birdcage::new().unwrap(); + birdcage.add_exception(Exception::Read(public_path.path().into())).unwrap(); + let result = birdcage.lock(); + + match result { + // Namespaces are supported, so filesystem should still be restricted. + Ok(_) => (), + // Namespaces aren't supported, so failure is desired. + Err(_) => return, + } + + // Access to the public file is allowed. + let content = fs::read_to_string(public_path).unwrap(); + assert_eq!(content, FILE_CONTENT); + + // Access to the private file is prohibited. + let result = fs::read_to_string(private_path); + assert!(result.is_err()); +} diff --git a/tests/full_sandbox.rs b/tests/full_sandbox.rs index db66e2f..d6db036 100644 --- a/tests/full_sandbox.rs +++ b/tests/full_sandbox.rs @@ -24,7 +24,7 @@ fn main() { drop(stream); // Ensure non-sandboxed execution works. - let cmd = Command::new("/bin/echo").arg("hello world").status(); + let cmd = Command::new("/bin/true").status(); assert!(cmd.is_ok()); // Ensure non-sandboxed env access works. @@ -48,7 +48,7 @@ fn main() { drop(stream); // Ensure sandboxed execution is blocked. - let cmd = Command::new("/bin/echo").arg("hello world").status(); + let cmd = Command::new("/bin/true").status(); assert!(cmd.is_err()); // Ensure sandboxed env access is blocked.