linux.rs: unify jail creation
This change unifies two substantially similar segments of code used to
create a jail.
BUG=none
TEST=Ran 'build_test'. Local build, deployed to DUT, and verified that
termina VM could still be used.
Change-Id: Ib1f2f9bc5cfe1e6c9f3633af7e23f52e5eafe3c7
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/2057744
Tested-by: Matt Delco <delco@chromium.org>
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Dylan Reid <dgreid@chromium.org>
Commit-Queue: Matt Delco <delco@chromium.org>
diff --git a/src/linux.rs b/src/linux.rs
index 0f8a848..bf2c014 100644
--- a/src/linux.rs
+++ b/src/linux.rs
@@ -304,55 +304,78 @@
}
}
+struct SandboxConfig<'a> {
+ limit_caps: bool,
+ log_failures: bool,
+ seccomp_policy: &'a Path,
+ uid_map: Option<&'a str>,
+ gid_map: Option<&'a str>,
+}
+
fn create_base_minijail(
root: &Path,
- log_failures: bool,
- seccomp_policy: &Path,
+ r_limit: Option<u64>,
+ config: Option<&SandboxConfig>,
) -> Result<Minijail> {
// All child jails run in a new user namespace without any users mapped,
// they run as nobody unless otherwise configured.
let mut j = Minijail::new().map_err(Error::DeviceJail)?;
- j.namespace_pids();
- j.namespace_user();
- j.namespace_user_disable_setgroups();
- // Don't need any capabilities.
- j.use_caps(0);
+
+ if let Some(config) = config {
+ j.namespace_pids();
+ j.namespace_user();
+ j.namespace_user_disable_setgroups();
+ if config.limit_caps {
+ // Don't need any capabilities.
+ j.use_caps(0);
+ }
+ if let Some(uid_map) = config.uid_map {
+ j.uidmap(uid_map).map_err(Error::SettingUidMap)?;
+ }
+ if let Some(gid_map) = config.gid_map {
+ j.gidmap(gid_map).map_err(Error::SettingGidMap)?;
+ }
+ // Run in an empty network namespace.
+ j.namespace_net();
+ // Apply the block device seccomp policy.
+ j.no_new_privs();
+
+ // By default we'll prioritize using the pre-compiled .bpf over the .policy
+ // file (the .bpf is expected to be compiled using "trap" as the failure
+ // behavior instead of the default "kill" behavior).
+ // Refer to the code comment for the "seccomp-log-failures"
+ // command-line parameter for an explanation about why the |log_failures|
+ // flag forces the use of .policy files (and the build-time alternative to
+ // this run-time flag).
+ let bpf_policy_file = config.seccomp_policy.with_extension("bpf");
+ if bpf_policy_file.exists() && !config.log_failures {
+ j.parse_seccomp_program(&bpf_policy_file)
+ .map_err(Error::DeviceJail)?;
+ } else {
+ // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP,
+ // which will correctly kill the entire device process if a worker
+ // thread commits a seccomp violation.
+ j.set_seccomp_filter_tsync();
+ if config.log_failures {
+ j.log_seccomp_filter_failures();
+ }
+ j.parse_seccomp_filters(&config.seccomp_policy.with_extension("policy"))
+ .map_err(Error::DeviceJail)?;
+ }
+ j.use_seccomp_filter();
+ // Don't do init setup.
+ j.run_as_init();
+ }
+
// Create a new mount namespace with an empty root FS.
j.namespace_vfs();
j.enter_pivot_root(root).map_err(Error::DevicePivotRoot)?;
- // Run in an empty network namespace.
- j.namespace_net();
- // Most devices don't need to open many fds.
- j.set_rlimit(libc::RLIMIT_NOFILE as i32, 1024, 1024)
- .map_err(Error::SettingMaxOpenFiles)?;
- // Apply the block device seccomp policy.
- j.no_new_privs();
- // By default we'll prioritize using the pre-compiled .bpf over the .policy
- // file (the .bpf is expected to be compiled using "trap" as the failure
- // behavior instead of the default "kill" behavior).
- // Refer to the code comment for the "seccomp-log-failures"
- // command-line parameter for an explanation about why the |log_failures|
- // flag forces the use of .policy files (and the build-time alternative to
- // this run-time flag).
- let bpf_policy_file = seccomp_policy.with_extension("bpf");
- if bpf_policy_file.exists() && !log_failures {
- j.parse_seccomp_program(&bpf_policy_file)
- .map_err(Error::DeviceJail)?;
- } else {
- // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP,
- // which will correctly kill the entire device process if a worker
- // thread commits a seccomp violation.
- j.set_seccomp_filter_tsync();
- if log_failures {
- j.log_seccomp_filter_failures();
- }
- j.parse_seccomp_filters(&seccomp_policy.with_extension("policy"))
- .map_err(Error::DeviceJail)?;
- }
- j.use_seccomp_filter();
- // Don't do init setup.
- j.run_as_init();
+ // Most devices don't need to open many fds.
+ let limit = if let Some(r) = r_limit { r } else { 1024u64 };
+ j.set_rlimit(libc::RLIMIT_NOFILE as i32, limit, limit)
+ .map_err(Error::SettingMaxOpenFiles)?;
+
Ok(j)
}
@@ -365,11 +388,14 @@
return Err(Error::PivotRootDoesntExist(pivot_root));
}
let policy_path: PathBuf = cfg.seccomp_policy_dir.join(policy);
- Ok(Some(create_base_minijail(
- root_path,
- cfg.seccomp_log_failures,
- &policy_path,
- )?))
+ let config = SandboxConfig {
+ limit_caps: true,
+ log_failures: cfg.seccomp_log_failures,
+ seccomp_policy: &policy_path,
+ uid_map: None,
+ gid_map: None,
+ };
+ Ok(Some(create_base_minijail(root_path, None, Some(&config))?))
} else {
Ok(None)
}
@@ -774,45 +800,20 @@
tag: &str,
fs_cfg: virtio::fs::passthrough::Config,
) -> DeviceResult {
- let mut j = Minijail::new().map_err(Error::DeviceJail)?;
-
- if cfg.sandbox {
- j.namespace_pids();
- j.namespace_user();
- j.namespace_user_disable_setgroups();
- j.uidmap(uid_map).map_err(Error::SettingUidMap)?;
- j.gidmap(gid_map).map_err(Error::SettingGidMap)?;
-
- // Run in an empty network namespace.
- j.namespace_net();
-
- j.no_new_privs();
-
- // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly kill
- // the entire device process if a worker thread commits a seccomp violation.
- let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device");
- j.set_seccomp_filter_tsync();
- if cfg.seccomp_log_failures {
- j.log_seccomp_filter_failures();
- }
- j.parse_seccomp_filters(&seccomp_policy)
- .map_err(Error::DeviceJail)?;
- j.use_seccomp_filter();
-
- // Don't do init setup.
- j.run_as_init();
- }
-
- // Create a new mount namespace with the source directory as the root. We need this even when
- // sandboxing is disabled as the server relies on the host kernel to prevent path traversals
- // from leaking out of the shared directory.
- j.namespace_vfs();
- j.enter_pivot_root(src).map_err(Error::DevicePivotRoot)?;
-
- // The file server opens a lot of fds and needs a really high open file limit.
let max_open_files = get_max_open_files()?;
- j.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
- .map_err(Error::SettingMaxOpenFiles)?;
+ let j = if cfg.sandbox {
+ let seccomp_policy = cfg.seccomp_policy_dir.join("fs_device");
+ let config = SandboxConfig {
+ limit_caps: false,
+ uid_map: Some(uid_map),
+ gid_map: Some(gid_map),
+ log_failures: cfg.seccomp_log_failures,
+ seccomp_policy: &seccomp_policy,
+ };
+ create_base_minijail(src, Some(max_open_files), Some(&config))?
+ } else {
+ create_base_minijail(src, Some(max_open_files), None)?
+ };
// TODO(chirantan): Use more than one worker once the kernel driver has been fixed to not panic
// when num_queues > 1.